# Dataset/dataset.py
# Repository-viewer metadata: 102 lines, 3.2 KiB, Python.
# Last change: 2024-02-24 11:29:41 +08:00
import numpy as np
import datasets
from typing import Dict, Tuple
from datasets import Dataset, load_from_disk
from torch.utils.data import DataLoader
from datasets import load_dataset
# dataset = load_dataset("liwu/MNBVC", "wikipedia", split="train", streaming=True)
# dataset = load_dataset("liwu/MNBVC", "wikipedia", split="train")
# dataset = load_dataset("./liwu/MNBVC/MNBVC.py", "wikipedia", split="train")
# Load a single gzip-compressed JSON Lines shard from the local ./liwu/MNBVC tree
# using the generic "json" builder. No `split` is requested here; rows are
# reached through the "train" key further down in this script.
dataset = load_dataset("json", data_files="./liwu/MNBVC/wiki/20230197/0.jsonl.gz")
# dataset = load_dataset("./liwu/MNBVC/wiki", split="train", streaming=True)
# dataset = load_dataset("./liwu/MNBVC/wiki", split="train")
from functools import partial
def format_inputs(examples):
    """Merge the paragraphs of one record into a single text field.

    Args:
        examples: A mapping with a "段落" entry holding an iterable of
            paragraph mappings, each carrying its string content under "内容".

    Returns:
        A dict ``{"text": merged}`` in which every paragraph's content is
        followed by a newline. ``merged`` is the empty string when the record
        has no paragraphs.
    """
    # str.join builds the result in one pass instead of growing a string
    # with "+=" on every paragraph.
    return {"text": "".join(line["内容"] + "\n" for line in examples["段落"])}
# Apply format_inputs to every row (batched=False) across 12 worker processes,
# adding its "text" output and dropping the original "段落" column.
# format_inputs takes no pre-bound arguments, so it is passed directly rather
# than through an argument-less functools.partial wrapper.
dataset = dataset.map(
    format_inputs,
    batched=False,
    num_proc=12,
    remove_columns="段落",
)
# First row of the processed "train" split.
d = dataset["train"][0]
# def split_raw_dataset(
# raw_dataset: datasets.DatasetDict,
# ) -> Tuple[datasets.Dataset, datasets.Dataset]:
# if 'validation' in raw_dataset:
# train_dataset, val_dataset = raw_dataset['train'], raw_dataset['validation']
# else:
# raw_dataset = raw_dataset['train'].train_test_split(test_size=0.05, seed=1234)
# train_dataset, val_dataset = raw_dataset['train'], raw_dataset['test']
# return train_dataset, val_dataset
# dataset = split_raw_dataset(dataset)
# `dataset` is keyed by split name (it is indexed as dataset["train"] earlier in
# this script), so the split itself -- not the split container -- has to be
# handed to DataLoader and to iter(). Iterating the container would yield split
# names, and DataLoader would try to index it with integer positions.
train_split = dataset["train"]
dataloader = DataLoader(train_split, batch_size=32, num_workers=4)

# Count the batches produced by one full pass over the loader.
count = 0
for _ in dataloader:
    count += 1

print(next(iter(train_split)))  # get the first line
# {
# "文件名": "cleaned/zhwiki-20230320/folder_0/486404.txt",
# "是否待查文件": False,
# "是否重复文件": False,
# "文件大小": 388,
# "simhash": "5350070205913746051",
# "最长段落长度": 150,
# "段落数": 4,
# "去重段落数": 4,
# "低质量段落数": 0,
# "段落": [
# {
# "行号": 0,
# "是否重复": False,
# "是否跨文件重复": False,
# "md5": "b687acdad842a48d260f709e93e7df9b",
# "内容": "【昂斯尼区】",
# },
# {
# "行号": 2,
# "是否重复": False,
# "是否跨文件重复": False,
# "md5": "275fffb0a600559e5c37ee01f2f3cea3",
# "内容": "昂斯尼区arrondissement de Ancenisarondisamant Ankiniz是法国大西洋卢瓦尔省所辖的一个旧区。总面积785平方公里总人口61457人口密度78人/平方公里2012年。主要城镇为昂斯尼。2017年1月该区与沙托布里扬区合并为沙托布里扬-昂斯尼区。",
# },
# {
# "行号": 4,
# "是否重复": False,
# "是否跨文件重复": False,
# "md5": "3cf5a8dd2ea5a3feeafec5c5abeeea07",
# "内容": "== 辖区 ==",
# },
# {
# "行号": 5,
# "是否重复": False,
# "是否跨文件重复": False,
# "md5": "45ff2c3113b4cf9b4b7f0fe1e22131a7",
# "内容": "昂斯尼区曾辖有24个市镇。",
# },
# ],
# }