102 lines
3.2 KiB
Python
102 lines
3.2 KiB
Python
|
import numpy as np
|
|||
|
import datasets
|
|||
|
from typing import Dict, Tuple
|
|||
|
from datasets import Dataset, load_from_disk
|
|||
|
from torch.utils.data import DataLoader
|
|||
|
|
|||
|
from datasets import load_dataset
|
|||
|
|
|||
|
# Alternative ways of loading the MNBVC wikipedia data, currently disabled:
# dataset = load_dataset("liwu/MNBVC", "wikipedia", split="train", streaming=True)
# dataset = load_dataset("liwu/MNBVC", "wikipedia", split="train")

# dataset = load_dataset("./liwu/MNBVC/MNBVC.py", "wikipedia", split="train")

# Active variant: read a single local gzipped JSON-lines shard through the
# generic "json" builder. No `split=` is passed here; the rest of the script
# reaches the rows through `dataset["train"]`.
dataset = load_dataset("json", data_files="./liwu/MNBVC/wiki/20230197/0.jsonl.gz")

# dataset = load_dataset("./liwu/MNBVC/wiki", split="train", streaming=True)
# dataset = load_dataset("./liwu/MNBVC/wiki", split="train")
|
|||
|
|
|||
|
|
|||
|
from functools import partial
|
|||
|
|
|||
|
|
|||
|
def format_inputs(examples):
    """Merge the paragraphs of one record into a single ``text`` field.

    Args:
        examples: One record (this function is mapped with
            ``batched=False``) whose ``"段落"`` entry is an iterable of
            paragraph dicts, each holding its text under ``"内容"``.

    Returns:
        A dict ``{"text": merged}`` where ``merged`` is every paragraph's
        text, in order, each followed by a newline. A record with no
        paragraphs yields an empty string.

    Raises:
        KeyError: If ``"段落"`` is missing from the record or ``"内容"`` is
            missing from a paragraph.
    """
    # A single join avoids re-copying the growing string on every paragraph.
    merged_text = "".join(
        paragraph["内容"] + "\n" for paragraph in examples["段落"]
    )
    return {"text": merged_text}
|
|||
|
|
|||
|
|
|||
|
# Add a merged "text" column to every record and drop the original "段落"
# column. `format_inputs` consumes one record at a time, hence batched=False;
# the work is spread over 12 worker processes.
dataset = dataset.map(
    format_inputs,  # passed directly: wrapping it in partial() bound no arguments
    batched=False,
    num_proc=12,
    remove_columns="段落",
)
|
|||
|
|
|||
|
# First record of the "train" split; `d` is not referenced again in the
# visible part of this script.
d = dataset["train"][0]
|
|||
|
|
|||
|
# def split_raw_dataset(
#     raw_dataset: datasets.DatasetDict,
# ) -> Tuple[datasets.Dataset, datasets.Dataset]:
#     if 'validation' in raw_dataset:
#         train_dataset, val_dataset = raw_dataset['train'], raw_dataset['validation']
#     else:
#         raw_dataset = raw_dataset['train'].train_test_split(test_size=0.05, seed=1234)
#         train_dataset, val_dataset = raw_dataset['train'], raw_dataset['test']
#     return train_dataset, val_dataset

# dataset = split_raw_dataset(dataset)
|
|||
|
|
|||
|
# `dataset` is indexed by split name (the script reads dataset["train"]), so
# select the split explicitly. Handing the whole split mapping to DataLoader
# would make it index the mapping with integers instead of iterating rows.
train_split = dataset["train"]
dataloader = DataLoader(train_split, batch_size=32, num_workers=4)

# Walk the loader once; `count` ends up as the number of batches produced.
count = 0
for _ in dataloader:
    count += 1

# Iterate the split, not the split mapping (whose first item is just the key
# "train"), so this really prints the first record.
print(next(iter(train_split)))  # get the first line
|
|||
|
|
|||
|
|
|||
|
# Example record (raw schema, before the paragraphs are merged):
# {
#     "文件名": "cleaned/zhwiki-20230320/folder_0/486404.txt",
#     "是否待查文件": False,
#     "是否重复文件": False,
#     "文件大小": 388,
#     "simhash": "5350070205913746051",
#     "最长段落长度": 150,
#     "段落数": 4,
#     "去重段落数": 4,
#     "低质量段落数": 0,
#     "段落": [
#         {
#             "行号": 0,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "b687acdad842a48d260f709e93e7df9b",
#             "内容": "【昂斯尼区】",
#         },
#         {
#             "行号": 2,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "275fffb0a600559e5c37ee01f2f3cea3",
#             "内容": "昂斯尼区(arrondissement de Ancenis;arondisamant Ankiniz)是法国大西洋卢瓦尔省所辖的一个旧区。总面积785平方公里,总人口61457,人口密度78人/平方公里(2012年)。主要城镇为昂斯尼。2017年1月,该区与沙托布里扬区合并为沙托布里扬-昂斯尼区。",
#         },
#         {
#             "行号": 4,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "3cf5a8dd2ea5a3feeafec5c5abeeea07",
#             "内容": "== 辖区 ==",
#         },
#         {
#             "行号": 5,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "45ff2c3113b4cf9b4b7f0fe1e22131a7",
#             "内容": "昂斯尼区曾辖有24个市镇。",
#         },
#     ],
# }
|