"""Load one MNBVC wiki shard, flatten each record's paragraphs, and smoke-test batching.

Each record's "段落" (paragraph) list is merged into a single "text" column,
the resulting train split is iterated once through a PyTorch ``DataLoader``,
and the first processed record is printed.
"""

from functools import partial
from typing import Dict, Tuple

import datasets
import numpy as np
from datasets import Dataset, load_dataset, load_from_disk
from torch.utils.data import DataLoader

# Alternative ways of loading the corpus that were tried previously:
# dataset = load_dataset("liwu/MNBVC", "wikipedia", split="train", streaming=True)
# dataset = load_dataset("liwu/MNBVC", "wikipedia", split="train")
# dataset = load_dataset("./liwu/MNBVC/MNBVC.py", "wikipedia", split="train")
# dataset = load_dataset("./liwu/MNBVC/wiki", split="train", streaming=True)
# dataset = load_dataset("./liwu/MNBVC/wiki", split="train")

# Local shard that is actually loaded by this script.
DATA_FILE = "./liwu/MNBVC/wiki/20230197/0.jsonl.gz"
# Worker processes used by ``Dataset.map``.
MAP_NUM_PROC = 12
# DataLoader settings for the batching smoke test.
BATCH_SIZE = 32
LOADER_NUM_WORKERS = 4


def format_inputs(examples: Dict) -> Dict[str, str]:
    """Merge the paragraphs of one record into a single text field.

    Args:
        examples: A single record (``map`` is called with ``batched=False``).
            Its "段落" entry is an iterable of dicts, each holding the
            paragraph text under "内容".

    Returns:
        ``{"text": merged}`` where ``merged`` is every paragraph's text
        followed by a newline, in order. An empty paragraph list yields "".
    """
    merged = "".join(paragraph["内容"] + "\n" for paragraph in examples["段落"])
    return {"text": merged}


# Kept for reference: optional train/validation split helper (currently unused).
# def split_raw_dataset(
#     raw_dataset: datasets.DatasetDict,
# ) -> Tuple[datasets.Dataset, datasets.Dataset]:
#     if 'validation' in raw_dataset:
#         train_dataset, val_dataset = raw_dataset['train'], raw_dataset['validation']
#     else:
#         raw_dataset = raw_dataset['train'].train_test_split(test_size=0.05, seed=1234)
#         train_dataset, val_dataset = raw_dataset['train'], raw_dataset['test']
#     return train_dataset, val_dataset
# dataset = split_raw_dataset(dataset)


def main() -> None:
    """Load the shard, build the "text" column, and iterate it in batches."""
    # Request the split explicitly: without ``split`` the loader returns a
    # DatasetDict keyed by split name, which DataLoader cannot index by row.
    train_split = load_dataset("json", data_files=DATA_FILE, split="train")

    train_split = train_split.map(
        format_inputs,
        batched=False,
        num_proc=MAP_NUM_PROC,
        remove_columns="段落",
    )

    dataloader = DataLoader(
        train_split,
        batch_size=BATCH_SIZE,
        num_workers=LOADER_NUM_WORKERS,
    )
    # One full pass verifies that every record can be collated into a batch.
    batch_count = sum(1 for _ in dataloader)
    print(f"batches: {batch_count}")

    # Show the first processed record.
    print(next(iter(train_split)))


# The guard keeps worker processes (map / DataLoader) from re-running the
# pipeline when they re-import this module.
if __name__ == "__main__":
    main()

# Example of one raw record in the shard, before ``format_inputs`` is applied:
# {
#     "文件名": "cleaned/zhwiki-20230320/folder_0/486404.txt",
#     "是否待查文件": False,
#     "是否重复文件": False,
#     "文件大小": 388,
#     "simhash": "5350070205913746051",
#     "最长段落长度": 150,
#     "段落数": 4,
#     "去重段落数": 4,
#     "低质量段落数": 0,
#     "段落": [
#         {
#             "行号": 0,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "b687acdad842a48d260f709e93e7df9b",
#             "内容": "【昂斯尼区】",
#         },
#         {
#             "行号": 2,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "275fffb0a600559e5c37ee01f2f3cea3",
#             "内容": "昂斯尼区(arrondissement de Ancenis;arondisamant Ankiniz)是法国大西洋卢瓦尔省所辖的一个旧区。总面积785平方公里,总人口61457,人口密度78人/平方公里(2012年)。主要城镇为昂斯尼。2017年1月,该区与沙托布里扬区合并为沙托布里扬-昂斯尼区。",
#         },
#         {
#             "行号": 4,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "3cf5a8dd2ea5a3feeafec5c5abeeea07",
#             "内容": "== 辖区 ==",
#         },
#         {
#             "行号": 5,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "45ff2c3113b4cf9b4b7f0fe1e22131a7",
#             "内容": "昂斯尼区曾辖有24个市镇。",
#         },
#     ],
# }