102 lines
3.2 KiB
Python
102 lines
3.2 KiB
Python
import numpy as np
|
||
import datasets
|
||
from typing import Dict, Tuple
|
||
from datasets import Dataset, load_from_disk
|
||
from torch.utils.data import DataLoader
|
||
|
||
from datasets import load_dataset
|
||
|
||
# dataset = load_dataset("liwu/MNBVC", "wikipedia", split="train", streaming=True)
|
||
# dataset = load_dataset("liwu/MNBVC", "wikipedia", split="train")
|
||
|
||
# dataset = load_dataset("./liwu/MNBVC/MNBVC.py", "wikipedia", split="train")
|
||
# Read one local, gzip-compressed JSON-lines shard of the MNBVC wiki corpus
# through the generic "json" builder. No split is requested here; the result
# is indexed by split name ("train") further down in this script.
dataset = load_dataset(
    "json",
    data_files="./liwu/MNBVC/wiki/20230197/0.jsonl.gz",
)
|
||
|
||
# dataset = load_dataset("./liwu/MNBVC/wiki", split="train", streaming=True)
|
||
# dataset = load_dataset("./liwu/MNBVC/wiki", split="train")
|
||
|
||
|
||
from functools import partial
|
||
|
||
|
||
def format_inputs(examples: dict) -> dict:
    """Merge one record's paragraphs into a single newline-terminated text.

    Args:
        examples: A single dataset record (the mapping is applied with
            ``batched=False``). Its ``"段落"`` entry must be an iterable of
            paragraph mappings, each carrying the paragraph text under
            ``"内容"``.

    Returns:
        ``{"text": merged}`` where ``merged`` is every paragraph's ``"内容"``
        in original order, each followed by ``"\n"``. An empty paragraph list
        yields ``{"text": ""}``.

    Raises:
        KeyError: If ``"段落"`` or a paragraph's ``"内容"`` is missing.
    """
    paragraphs = examples["段落"]
    # Join once instead of growing a string with ``+=`` inside a loop; every
    # paragraph (including the last) keeps its trailing newline.
    merged = "".join(paragraph["内容"] + "\n" for paragraph in paragraphs)
    return {"text": merged}
|
||
|
||
|
||
# Convert every record's paragraph list into a single "text" column.
# - format_inputs is passed directly; wrapping it in functools.partial with no
#   bound arguments adds nothing.
# - batched=False: the function receives one record at a time.
# - num_proc=12: the mapping is spread over 12 worker processes.
# - remove_columns drops the original "段落" column once "text" exists.
dataset = dataset.map(
    format_inputs,
    batched=False,
    num_proc=12,
    remove_columns="段落",
)

# First converted record of the "train" split, kept for quick inspection.
d = dataset["train"][0]
|
||
|
||
# def split_raw_dataset(
|
||
# raw_dataset: datasets.DatasetDict,
|
||
# ) -> Tuple[datasets.Dataset, datasets.Dataset]:
|
||
# if 'validation' in raw_dataset:
|
||
# train_dataset, val_dataset = raw_dataset['train'], raw_dataset['validation']
|
||
# else:
|
||
# raw_dataset = raw_dataset['train'].train_test_split(test_size=0.05, seed=1234)
|
||
# train_dataset, val_dataset = raw_dataset['train'], raw_dataset['test']
|
||
# return train_dataset, val_dataset
|
||
|
||
# dataset = split_raw_dataset(dataset)
|
||
|
||
# `dataset` is keyed by split name (see the dataset["train"] lookup above), so
# the DataLoader must be given the "train" split itself: handing it the whole
# split mapping would make it index the mapping with integers and fail.
train_split = dataset["train"]
# NOTE(review): num_workers=4 starts worker processes; on platforms that spawn
# rather than fork, this top-level script would need an
# `if __name__ == "__main__":` guard -- confirm the target platform.
dataloader = DataLoader(train_split, batch_size=32, num_workers=4)

# Walk the loader once to count how many batches of 32 it produces.
count = sum(1 for _ in dataloader)

# Iterating the split mapping yields split names, not records, so read the
# first record from the "train" split.
print(next(iter(train_split)))  # get the first line
|
||
|
||
|
||
# {
|
||
# "文件名": "cleaned/zhwiki-20230320/folder_0/486404.txt",
|
||
# "是否待查文件": False,
|
||
# "是否重复文件": False,
|
||
# "文件大小": 388,
|
||
# "simhash": "5350070205913746051",
|
||
# "最长段落长度": 150,
|
||
# "段落数": 4,
|
||
# "去重段落数": 4,
|
||
# "低质量段落数": 0,
|
||
# "段落": [
|
||
# {
|
||
# "行号": 0,
|
||
# "是否重复": False,
|
||
# "是否跨文件重复": False,
|
||
# "md5": "b687acdad842a48d260f709e93e7df9b",
|
||
# "内容": "【昂斯尼区】",
|
||
# },
|
||
# {
|
||
# "行号": 2,
|
||
# "是否重复": False,
|
||
# "是否跨文件重复": False,
|
||
# "md5": "275fffb0a600559e5c37ee01f2f3cea3",
|
||
# "内容": "昂斯尼区(arrondissement de Ancenis;arondisamant Ankiniz)是法国大西洋卢瓦尔省所辖的一个旧区。总面积785平方公里,总人口61457,人口密度78人/平方公里(2012年)。主要城镇为昂斯尼。2017年1月,该区与沙托布里扬区合并为沙托布里扬-昂斯尼区。",
|
||
# },
|
||
# {
|
||
# "行号": 4,
|
||
# "是否重复": False,
|
||
# "是否跨文件重复": False,
|
||
# "md5": "3cf5a8dd2ea5a3feeafec5c5abeeea07",
|
||
# "内容": "== 辖区 ==",
|
||
# },
|
||
# {
|
||
# "行号": 5,
|
||
# "是否重复": False,
|
||
# "是否跨文件重复": False,
|
||
# "md5": "45ff2c3113b4cf9b4b7f0fe1e22131a7",
|
||
# "内容": "昂斯尼区曾辖有24个市镇。",
|
||
# },
|
||
# ],
|
||
# }
|