from typing import Dict, Tuple

import datasets
import numpy as np
from datasets import Dataset, load_dataset, load_from_disk
from torch.utils.data import DataLoader

# Several ways to load the MNBVC wikipedia subset. Only the local JSONL
# shard below is actually loaded; the hub and local-script variants are
# kept for reference.
# dataset = load_dataset("liwu/MNBVC", "wikipedia", split="train", streaming=True)
# dataset = load_dataset("liwu/MNBVC", "wikipedia", split="train")
# dataset = load_dataset("./liwu/MNBVC/MNBVC.py", "wikipedia", split="train")
dataset = load_dataset("json", data_files="./liwu/MNBVC/wiki/20230197/0.jsonl.gz")
# dataset = load_dataset("./liwu/MNBVC/wiki", split="train", streaming=True)
# dataset = load_dataset("./liwu/MNBVC/wiki", split="train")
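
# A hedged aside, not in the original script: with streaming=True,
# load_dataset returns an IterableDataset, so integer indexing like
# dataset["train"][0] is unavailable; iterate (or use .take) instead.
# stream = load_dataset("liwu/MNBVC", "wikipedia", split="train", streaming=True)
# for record in stream.take(2):
#     print(record["文件名"])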


def format_inputs(examples):
    # Merge every paragraph's "内容" (content) into one newline-separated
    # "text" field. "段落" is the paragraph list of a single record.
    paragraphs = examples["段落"]
    merged = ""
    for line in paragraphs:
        merged += line["内容"] + "\n"
    return {"text": merged}


# load_dataset("json", ...) returns a DatasetDict with a single "train"
# split, so map runs over each split and keeps the dict structure.
dataset = dataset.map(
    format_inputs,
    batched=False,
    num_proc=12,
    remove_columns="段落",
)
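
# A hedged alternative, not in the original script: the same merge written
# as a batched map, which usually cuts per-row Python overhead because each
# call receives a dict of column lists. Same "段落"/"内容" schema assumed.
def format_inputs_batched(examples):
    return {
        "text": [
            "".join(line["内容"] + "\n" for line in paragraphs)
            for paragraphs in examples["段落"]
        ]
    }

# Drop-in replacement for the map call above:
# dataset = dataset.map(format_inputs_batched, batched=True, num_proc=12, remove_columns="段落")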

d = dataset["train"][0]  # peek at the first processed record

# Optional: carve out a validation split when the source only ships "train".
# def split_raw_dataset(
#     raw_dataset: datasets.DatasetDict,
# ) -> Tuple[datasets.Dataset, datasets.Dataset]:
#     if "validation" in raw_dataset:
#         train_dataset, val_dataset = raw_dataset["train"], raw_dataset["validation"]
#     else:
#         raw_dataset = raw_dataset["train"].train_test_split(test_size=0.05, seed=1234)
#         train_dataset, val_dataset = raw_dataset["train"], raw_dataset["test"]
#     return train_dataset, val_dataset
#
# train_dataset, val_dataset = split_raw_dataset(dataset)

# Wrap the "train" split, not the DatasetDict itself: a DatasetDict is keyed
# by split name and is not indexable by position, so passing it straight to
# DataLoader fails.
dataloader = DataLoader(dataset["train"], batch_size=32, num_workers=4)
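
# A hedged aside, not in the original script: if only the merged text matters,
# selecting that single column first keeps the metadata fields (booleans,
# ints, simhash strings) out of every collated batch.
# text_only = dataset["train"].select_columns(["text"])
# dataloader = DataLoader(text_only, batch_size=32, num_workers=4)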

# Iterate the whole loader once as a smoke test, counting batches.
count = 0
for _ in dataloader:
    count += 1

print(next(iter(dataset["train"])))  # print the first record

# The printed record looks like the following (MNBVC schema; roughly:
# 文件名 = file name, 是否待查文件 = pending review, 是否重复文件 = duplicate file,
# 文件大小 = file size, 最长段落长度 = longest paragraph length, 段落数 = paragraph
# count, 去重段落数 = deduplicated paragraph count, 低质量段落数 = low-quality
# paragraph count, 段落 = paragraphs, 行号 = line number, 是否重复 = duplicate,
# 是否跨文件重复 = cross-file duplicate, 内容 = content):
# {
#     "文件名": "cleaned/zhwiki-20230320/folder_0/486404.txt",
#     "是否待查文件": False,
#     "是否重复文件": False,
#     "文件大小": 388,
#     "simhash": "5350070205913746051",
#     "最长段落长度": 150,
#     "段落数": 4,
#     "去重段落数": 4,
#     "低质量段落数": 0,
#     "段落": [
#         {
#             "行号": 0,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "b687acdad842a48d260f709e93e7df9b",
#             "内容": "【昂斯尼区】",
#         },
#         {
#             "行号": 2,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "275fffb0a600559e5c37ee01f2f3cea3",
#             "内容": "昂斯尼区(arrondissement de Ancenis;arondisamant Ankiniz)是法国大西洋卢瓦尔省所辖的一个旧区。总面积785平方公里,总人口61457,人口密度78人/平方公里(2012年)。主要城镇为昂斯尼。2017年1月,该区与沙托布里扬区合并为沙托布里扬-昂斯尼区。",
#         },
#         {
#             "行号": 4,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "3cf5a8dd2ea5a3feeafec5c5abeeea07",
#             "内容": "== 辖区 ==",
#         },
#         {
#             "行号": 5,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "45ff2c3113b4cf9b4b7f0fe1e22131a7",
#             "内容": "昂斯尼区曾辖有24个市镇。",
#         },
#     ],
# }
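
# A hedged follow-up, not in the original script: the load_from_disk import
# kept above pairs with save_to_disk, e.g. to cache the processed dataset
# (the path below is illustrative):
# dataset.save_to_disk("./mnbvc_wiki_processed")
# dataset = load_from_disk("./mnbvc_wiki_processed")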