Dataset/dataset.py

102 lines
3.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import numpy as np
import datasets
from typing import Dict, Tuple
from datasets import Dataset, load_from_disk
from torch.utils.data import DataLoader
from datasets import load_dataset
# dataset = load_dataset("liwu/MNBVC", "wikipedia", split="train", streaming=True)
# dataset = load_dataset("liwu/MNBVC", "wikipedia", split="train")
# dataset = load_dataset("./liwu/MNBVC/MNBVC.py", "wikipedia", split="train")
# Load one local gzip-compressed JSON Lines shard through the generic "json"
# builder. No split is requested here; the result is indexed by split name
# further down (dataset["train"]).
dataset = load_dataset("json", data_files="./liwu/MNBVC/wiki/20230197/0.jsonl.gz")
# dataset = load_dataset("./liwu/MNBVC/wiki", split="train", streaming=True)
# dataset = load_dataset("./liwu/MNBVC/wiki", split="train")
from functools import partial
def format_inputs(examples):
    """Merge one record's paragraphs into a single block of text.

    Args:
        examples: A single record whose ``"段落"`` entry is an iterable of
            paragraph dicts, each holding its text under ``"内容"``.

    Returns:
        A dict with one key, ``"text"``, whose value is every paragraph's
        ``"内容"`` in order, each followed by a newline character. A record
        with no paragraphs yields an empty string.
    """
    # Join in a single pass instead of growing the string with ``+=``.
    return {"text": "".join(line["内容"] + "\n" for line in examples["段落"])}
# Collapse each record's paragraph list into a single "text" column, one
# record at a time, and drop the original "段落" column afterwards.
dataset = dataset.map(
    format_inputs,
    remove_columns="段落",
    num_proc=12,
    batched=False,
)
# Keep the first processed record of the "train" split for quick inspection.
d = dataset["train"][0]
# def split_raw_dataset(
# raw_dataset: datasets.DatasetDict,
# ) -> Tuple[datasets.Dataset, datasets.Dataset]:
# if 'validation' in raw_dataset:
# train_dataset, val_dataset = raw_dataset['train'], raw_dataset['validation']
# else:
# raw_dataset = raw_dataset['train'].train_test_split(test_size=0.05, seed=1234)
# train_dataset, val_dataset = raw_dataset['train'], raw_dataset['test']
# return train_dataset, val_dataset
# dataset = split_raw_dataset(dataset)
# `dataset` is keyed by split name, so give the DataLoader the actual "train"
# split. Handing it the whole dict makes the loader index it with integers,
# which fails with a KeyError.
train_dataset = dataset["train"]
dataloader = DataLoader(train_dataset, batch_size=32, num_workers=4)
# Walk the whole loader once to count how many batches it produces.
count = sum(1 for _ in dataloader)
# Iterating the dict itself would only yield the split name; iterate the
# split to get the first record.
print(next(iter(train_dataset)))  # get the first line
# {
# "文件名": "cleaned/zhwiki-20230320/folder_0/486404.txt",
# "是否待查文件": False,
# "是否重复文件": False,
# "文件大小": 388,
# "simhash": "5350070205913746051",
# "最长段落长度": 150,
# "段落数": 4,
# "去重段落数": 4,
# "低质量段落数": 0,
# "段落": [
# {
# "行号": 0,
# "是否重复": False,
# "是否跨文件重复": False,
# "md5": "b687acdad842a48d260f709e93e7df9b",
# "内容": "【昂斯尼区】",
# },
# {
# "行号": 2,
# "是否重复": False,
# "是否跨文件重复": False,
# "md5": "275fffb0a600559e5c37ee01f2f3cea3",
# "内容": "昂斯尼区arrondissement de Ancenisarondisamant Ankiniz是法国大西洋卢瓦尔省所辖的一个旧区。总面积785平方公里总人口61457人口密度78人/平方公里2012年。主要城镇为昂斯尼。2017年1月该区与沙托布里扬区合并为沙托布里扬-昂斯尼区。",
# },
# {
# "行号": 4,
# "是否重复": False,
# "是否跨文件重复": False,
# "md5": "3cf5a8dd2ea5a3feeafec5c5abeeea07",
# "内容": "== 辖区 ==",
# },
# {
# "行号": 5,
# "是否重复": False,
# "是否跨文件重复": False,
# "md5": "45ff2c3113b4cf9b4b7f0fe1e22131a7",
# "内容": "昂斯尼区曾辖有24个市镇。",
# },
# ],
# }