init dataset repo.
This commit is contained in:
commit cf6fe659c8

@@ -0,0 +1 @@
*.jsonl.gz

@@ -0,0 +1,17 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: Current File",
            "type": "debugpy",
            "request": "launch",
            "cwd": "${fileDirname}",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": false
        }
    ]
}

@@ -0,0 +1,6 @@
## Download

```
# auto-download the dataset from Hugging Face
load_dataset("./liwu/MNBVC/MNBVC.py", "wikipedia", split="train")
```
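
Streaming is a lighter-weight option for the larger sub-datasets: records are fetched lazily instead of downloading every shard up front. A minimal sketch of standard `datasets` usage (any config name defined in `meta.py` works in place of `wikipedia`):

```
# stream records lazily instead of materializing the whole split
dataset = load_dataset("./liwu/MNBVC/MNBVC.py", "wikipedia", split="train", streaming=True)
print(next(iter(dataset)))  # first record
```
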
@@ -0,0 +1,101 @@
import numpy as np
import datasets
from typing import Dict, Tuple
from datasets import Dataset, load_from_disk
from torch.utils.data import DataLoader

from datasets import load_dataset

# dataset = load_dataset("liwu/MNBVC", "wikipedia", split="train", streaming=True)
# dataset = load_dataset("liwu/MNBVC", "wikipedia", split="train")

# dataset = load_dataset("./liwu/MNBVC/MNBVC.py", "wikipedia", split="train")
dataset = load_dataset("json", data_files="./liwu/MNBVC/wiki/20230197/0.jsonl.gz")

# dataset = load_dataset("./liwu/MNBVC/wiki", split="train", streaming=True)
# dataset = load_dataset("./liwu/MNBVC/wiki", split="train")


from functools import partial


def format_inputs(examples):
    # Concatenate the 内容 (content) of every 段落 (paragraph) into one text field.
    paragraphs = examples["段落"]
    merged = ""
    for line in paragraphs:
        merged += line["内容"] + "\n"
    return {"text": merged}


dataset = dataset.map(
    partial(format_inputs),
    batched=False,
    num_proc=12,
    remove_columns="段落",
)

d = dataset["train"][0]

# def split_raw_dataset(
#     raw_dataset: datasets.DatasetDict,
# ) -> Tuple[datasets.Dataset, datasets.Dataset]:
#     if 'validation' in raw_dataset:
#         train_dataset, val_dataset = raw_dataset['train'], raw_dataset['validation']
#     else:
#         raw_dataset = raw_dataset['train'].train_test_split(test_size=0.05, seed=1234)
#         train_dataset, val_dataset = raw_dataset['train'], raw_dataset['test']
#     return train_dataset, val_dataset

# dataset = split_raw_dataset(dataset)

# load_dataset("json", ...) returns a DatasetDict, so index the split
# before handing it to a DataLoader.
dataloader = DataLoader(dataset["train"], batch_size=32, num_workers=4)


count = 0
for _ in dataloader:
    count += 1

print(next(iter(dataset["train"])))  # get the first record


# A raw record from the wiki shards looks like this:
# {
#     "文件名": "cleaned/zhwiki-20230320/folder_0/486404.txt",
#     "是否待查文件": False,
#     "是否重复文件": False,
#     "文件大小": 388,
#     "simhash": "5350070205913746051",
#     "最长段落长度": 150,
#     "段落数": 4,
#     "去重段落数": 4,
#     "低质量段落数": 0,
#     "段落": [
#         {
#             "行号": 0,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "b687acdad842a48d260f709e93e7df9b",
#             "内容": "【昂斯尼区】",
#         },
#         {
#             "行号": 2,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "275fffb0a600559e5c37ee01f2f3cea3",
#             "内容": "昂斯尼区(arrondissement de Ancenis;arondisamant Ankiniz)是法国大西洋卢瓦尔省所辖的一个旧区。总面积785平方公里,总人口61457,人口密度78人/平方公里(2012年)。主要城镇为昂斯尼。2017年1月,该区与沙托布里扬区合并为沙托布里扬-昂斯尼区。",
#         },
#         {
#             "行号": 4,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "3cf5a8dd2ea5a3feeafec5c5abeeea07",
#             "内容": "== 辖区 ==",
#         },
#         {
#             "行号": 5,
#             "是否重复": False,
#             "是否跨文件重复": False,
#             "md5": "45ff2c3113b4cf9b4b7f0fe1e22131a7",
#             "内容": "昂斯尼区曾辖有24个市镇。",
#         },
#     ],
# }

@@ -0,0 +1,115 @@
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
MNBVC: Massive Never-ending BT Vast Chinese corpus
"""

import json
import datasets
import numpy as np
import traceback
from .meta import MNBVC_META
from .features import Features


_CITATION = """\
"""

_DESCRIPTION = """\
MNBVC: Massive Never-ending BT Vast Chinese corpus
"""

_HOMEPAGE = "https://github.com/esbatmop/MNBVC"

_LICENSE = "MIT"


class MNBVC(datasets.GeneratorBasedBuilder):
    """Massive Never-ending BT Vast Chinese corpus."""
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name=key, version=datasets.Version("0.0.1"), description=value['description'])
        for key, value in MNBVC_META.items()
    ]

    def _info(self):
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types;
            # the specs live in features.py because they differ across configurations.
            features=Features[MNBVC_META[self.config.name]['feature_type']],
            # If there's a common (input, target) tuple from the features, uncomment the
            # supervised_keys line below and specify them. They'll be used if
            # as_supervised=True in builder.as_dataset.
            # supervised_keys=("sentence", "label"),
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        data_dir = dl_manager.download_and_extract(MNBVC_META[self.config.name]['files'])

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_files": data_dir,
                },
            ),
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, data_files):
        id = 0
        features = self._info().features
        feature_keys = set(features.keys())

        def _drop_unused_keys(data):
            # Discard any JSON field that is not declared in the feature spec.
            rm_keys = []
            for key in data.keys():
                if key not in feature_keys:
                    rm_keys.append(key)
            for key in rm_keys:
                del data[key]
            return data

        try:
            for file_i, data_file in enumerate(data_files):
                with open(data_file, encoding="utf-8") as f:
                    for line_i, line in enumerate(f):
                        id += 1
                        data = json.loads(line)
                        if self.config.name == 'law_judgement':
                            # map the '详情' (details) field onto the legacy text/meta schema
                            text = data['详情']
                            del data['详情']
                            yield id, {
                                "text": text,
                                "meta": json.dumps(data, ensure_ascii=False),
                            }
                        else:
                            data = _drop_unused_keys(data)
                            if 'simhash' in data:  # for issue https://github.com/huggingface/datasets/issues/6007
                                data['simhash'] = str(data['simhash'])

                            yield id, data
        except Exception as e:
            error_msg = 'oops, we found an error when loading the dataset\n'
            error_msg += f'Dataset: {self.config.name}\n'
            error_msg += f'Data File: {file_i} {data_file}\n'
            error_msg += f'Row: {line_i}'
            print(error_msg)
            traceback.print_exc()

            raise e
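
# Sketch: one way to inspect a config's schema without downloading any data,
# assuming the standard `datasets` builder API ("wikipedia" stands for any
# config name defined in meta.py):
#
#     builder = datasets.load_dataset_builder("./liwu/MNBVC/MNBVC.py", "wikipedia")
#     print(builder.info.features)
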
@@ -0,0 +1,92 @@
import datasets

Features = {}

# Legacy schema from the early stage of the project; it will be phased out gradually in future updates
Features['TEXT_CORPUS_LEGACY'] = datasets.Features(
    {
        "text": datasets.Value("string"),
        "meta": datasets.Value("string")
    }
)

# Schema used by text corpora
Features['TEXT_CORPUS'] = datasets.Features(
    {
        "文件名": datasets.Value("string"),
        "是否待查文件": datasets.Value("bool"),
        "是否重复文件": datasets.Value("bool"),
        "文件大小": datasets.Value("int64"),
        "simhash": datasets.Value("string"),  # for issue https://github.com/huggingface/datasets/issues/6007
        "最长段落长度": datasets.Value("int64"),
        "段落数": datasets.Value("int64"),
        "去重段落数": datasets.Value("int64"),
        "低质量段落数": datasets.Value("int64"),
        "段落": [
            datasets.Features(
                {
                    "行号": datasets.Value("int64"),
                    "是否重复": datasets.Value("bool"),
                    "是否跨文件重复": datasets.Value("bool"),
                    "md5": datasets.Value("string"),
                    "内容": datasets.Value("string"),
                }
            )
        ]
    }
)

# Schema used by QA data
Features['QA_CORPUS'] = datasets.Features(
    {
        "id": datasets.Value("string"),
        "问": datasets.Value("string"),
        "答": datasets.Value("string"),
        "来源": datasets.Value("string"),
        "元数据": {
            "create_time": datasets.Value("string"),
            "问题明细": datasets.Value("string"),
            "回答明细": datasets.Value("string"),
            "扩展字段": datasets.Value("string"),
        }
    }
)

# Schema used by code data
Features['CODE_CORPUS'] = datasets.Features(
    {
        "来源": datasets.Value("string"),
        "仓库名": datasets.Value("string"),
        "path": datasets.Value("string"),
        "文件名": datasets.Value("string"),
        "ext": datasets.Value("string"),
        "size": datasets.Value("int64"),
        "原始编码": datasets.Value("string"),
        "md5": datasets.Value("string"),
        "text": datasets.Value("string")
    }
)

# Forum dialogue corpus
Features['FORUM_DIALOGUE'] = datasets.Features(
    {
        "ID": datasets.Value('int32'),
        "主题": datasets.Value("string"),
        "来源": datasets.Value("string"),
        "回复": [
            datasets.Features(
                {
                    "楼ID": datasets.Value("string"),
                    "回复": datasets.Value("string"),
                    "扩展字段": datasets.Value("string"),
                }
            )
        ],
        "元数据": {
            "发帖时间": datasets.Value("string"),
            "回复数": datasets.Value("int32"),
            "扩展字段": datasets.Value("string")
        }
    }
)
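
# Sketch: round-trip one record through the QA schema, assuming the standard
# `datasets.Dataset.from_list` API (the sample values below are illustrative only).
if __name__ == '__main__':
    _sample = {
        "id": "0",
        "问": "What is MNBVC?",
        "答": "A massive Chinese corpus.",
        "来源": "example",
        "元数据": {
            "create_time": "",
            "问题明细": "",
            "回答明细": "",
            "扩展字段": "",
        },
    }
    print(datasets.Dataset.from_list([_sample], features=Features['QA_CORPUS'])[0])
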
@@ -0,0 +1,445 @@
# This dict is all that matters in this file.
# Each key in this dict corresponds to a sub-dataset.
# Each key maps to a dict that specifies the config info for this sub-dataset,
# something like (a small self-check sketch is appended at the end of this file):
# {
#     "law_judgement": {
#         "files": [...],
#         "feature_type": "TEXT_CORPUS_LEGACY",
#         "description": "Texts from open law suits",
#     },
#     ...
# }
MNBVC_META = {}

base_url = 'https://huggingface.co/datasets/liwu/MNBVC/resolve/main'

def _flatten_list(items):
    # Recursively flatten nested lists of file URLs into a single flat list.
    res = []
    for i in items:
        if isinstance(i, list):
            res += _flatten_list(i)
        else:
            res.append(i)
    return res

# =============================
# law_judgement
MNBVC_META['law_judgement'] = {}
MNBVC_META['law_judgement']['files'] = [
    [f'{base_url}/law/judgement/{folder}/{i}.jsonl.gz' for i in range(1, count+1)]
    for folder, count in [
        ('20230134', 49),
        ('20230135', 44),
        ('20230136', 44),
        ('20230137', 49),
        ('20230138', 41),
        ('20230139', 46),
        ('20230140', 46),
        ('20230141', 50),
    ]
]
MNBVC_META['law_judgement']['files'] = _flatten_list(MNBVC_META['law_judgement']['files'])
MNBVC_META['law_judgement']['feature_type'] = "TEXT_CORPUS_LEGACY"
MNBVC_META['law_judgement']['description'] = "Texts from open law suits"

_CONFIG2FORMAT = {
    "law_judgement": "TEXT_CORPUS_LEGACY",
    "gov_xuexiqiangguo": "TEXT_CORPUS",
    "gov_report": "TEXT_CORPUS_LEGACY",
    "co_ann_report": "TEXT_CORPUS_LEGACY",
    "code_metadata": "TEXT_CORPUS_LEGACY",
}


# =============================
# gov_xuexiqiangguo
MNBVC_META['gov_xuexiqiangguo'] = {}
MNBVC_META['gov_xuexiqiangguo']['files'] = ['https://huggingface.co/datasets/liwu/MNBVC/resolve/main/gov/20230172/XueXiQiangGuo.jsonl.gz']
MNBVC_META['gov_xuexiqiangguo']['feature_type'] = "TEXT_CORPUS"
MNBVC_META['gov_xuexiqiangguo']['description'] = "Texts from government files (XueXiQiangGuo)"


# =============================
# gov_report
MNBVC_META['gov_report'] = {}
MNBVC_META['gov_report']['files'] = ['https://huggingface.co/datasets/liwu/MNBVC/resolve/main/gov/20230172/GovReport.jsonl.gz']
MNBVC_META['gov_report']['feature_type'] = "TEXT_CORPUS_LEGACY"
MNBVC_META['gov_report']['description'] = "Texts from government files (Gov Report)"


# =============================
# co_ann_report
MNBVC_META['co_ann_report'] = {}
MNBVC_META['co_ann_report']['files'] = [
    [f'{base_url}/co_ann_report/{folder}/{i}.jsonl.gz' for i in range(count)]
    for folder, count in [
        ('20230125', 49)
    ]
]
MNBVC_META['co_ann_report']['files'] = _flatten_list(MNBVC_META['co_ann_report']['files'])
MNBVC_META['co_ann_report']['feature_type'] = "TEXT_CORPUS_LEGACY"
MNBVC_META['co_ann_report']['description'] = "Texts from company annual reports"

# =============================
# code_metadata
MNBVC_META['code_metadata'] = {}
MNBVC_META['code_metadata']['files'] = [
    # range(count) with (start+i) yields consecutive million-row windows
    # (start, start+1, ..., start+count-1); range(start, start+count) would
    # double-count start and skip ahead for every folder after the first.
    [f'{base_url}/code/metadata/{folder}/{(start+i)*1000000}-{(start+i+1)*1000000}.jsonl.gz' for i in range(count)]
    for folder, start, count in [
        ('20230301', 0, 10),
        ('20230302', 10, 10),
        ('20230303', 20, 10),
        ('20230304', 30, 10),
        ('20230305', 40, 10),
        ('20230306', 50, 10),
        ('20230307', 60, 10),
        ('20230308', 70, 10),
        ('20230309', 80, 10),
        ('20230310', 90, 10),
        ('20230311', 100, 10),
        ('20230312', 110, 10),
        ('20230313', 120, 10),
        ('20230314', 130, 10),
        ('20230315', 140, 10),
        ('20230316', 150, 10),
        ('20230317', 160, 10),
        ('20230318', 170, 10),
        ('20230319', 180, 10),
        ('20230320', 190, 10),
    ]
]
MNBVC_META['code_metadata']['files'] = _flatten_list(MNBVC_META['code_metadata']['files'])
MNBVC_META['code_metadata']['feature_type'] = "TEXT_CORPUS_LEGACY"
MNBVC_META['code_metadata']['description'] = "Meta data for the code corpus"

# =============================
# qa_zhihu
MNBVC_META['qa_zhihu'] = {}
MNBVC_META['qa_zhihu']['files'] = [
    [f'{base_url}/qa/{folder}/zhihu/{i}.jsonl.gz' for i in range(count)]
    for folder, count in [('20230196', 5)]
]
MNBVC_META['qa_zhihu']['files'] = _flatten_list(MNBVC_META['qa_zhihu']['files'])
MNBVC_META['qa_zhihu']['feature_type'] = "QA_CORPUS"
MNBVC_META['qa_zhihu']['description'] = "QA data from zhihu"


# =============================
# qa_wikihow
MNBVC_META['qa_wikihow'] = {}
MNBVC_META['qa_wikihow']['files'] = [
    [f'{base_url}/qa/{folder}/wikihow/wikihow_en.{i}.jsonl.gz' for i in range(count)]
    for folder, count in [('20230196', 4)]
]
MNBVC_META['qa_wikihow']['files'] += [
    [f'{base_url}/qa/{folder}/wikihow/wikihow_zh.{i}.jsonl.gz' for i in range(count)]
    for folder, count in [('20230196', 1)]
]
MNBVC_META['qa_wikihow']['files'] = _flatten_list(MNBVC_META['qa_wikihow']['files'])
MNBVC_META['qa_wikihow']['feature_type'] = "QA_CORPUS"
MNBVC_META['qa_wikihow']['description'] = "QA data from wikihow (both in zh and en)"


# =============================
# qa_mfa
MNBVC_META['qa_mfa'] = {}
MNBVC_META['qa_mfa']['files'] = [
    [f'{base_url}/qa/{folder}/mfa/{i}.jsonl.gz' for i in range(count)]
    for folder, count in [('20230196', 1)]
]
MNBVC_META['qa_mfa']['files'] = _flatten_list(MNBVC_META['qa_mfa']['files'])
MNBVC_META['qa_mfa']['feature_type'] = "QA_CORPUS"
MNBVC_META['qa_mfa']['description'] = "QA data from Ministry of Foreign Affairs"


# =============================
# news_peoples_daily
MNBVC_META['news_peoples_daily'] = {}
MNBVC_META['news_peoples_daily']['files'] = [
    [f'{base_url}/news/{folder}/{i}.jsonl.gz' for i in range(count)]
    for folder, count in [('20230196', 12)]
]
MNBVC_META['news_peoples_daily']['files'] = _flatten_list(MNBVC_META['news_peoples_daily']['files'])
MNBVC_META['news_peoples_daily']['feature_type'] = "TEXT_CORPUS"
MNBVC_META['news_peoples_daily']['description'] = "News data from people's daily"


# =============================
# wikipedia
MNBVC_META['wikipedia'] = {}
MNBVC_META['wikipedia']['files'] = [
    [f'{base_url}/wiki/{folder}/{i}.jsonl.gz' for i in range(count)]
    for folder, count in [('20230197', 46)]
]
MNBVC_META['wikipedia']['files'] += [
    [f'{base_url}/wiki/{folder}/{i}.jsonl.gz' for i in range(beg, end)]
    for folder, beg, end in [('20230198', 46, 78)]
]
MNBVC_META['wikipedia']['files'] = _flatten_list(MNBVC_META['wikipedia']['files'])
MNBVC_META['wikipedia']['feature_type'] = "TEXT_CORPUS"
MNBVC_META['wikipedia']['description'] = "Texts from wikipedia"

# =============================
# qa_stackexchange
MNBVC_META['qa_stackexchange'] = {}
MNBVC_META['qa_stackexchange']['files'] = []

MNBVC_META['qa_stackexchange']['files'] += [
    [f'{base_url}/qa/stackexchange/{folder}/{name}/{i}.jsonl.gz' for i in range(1, count+1)]
    for folder, name, count in [
        ("20230204", "conlang.20230204.33.问答", 1),
        ("20230204", "aviation.20230204.10.问答", 1),
        ("20230204", "drupal.20230204.46.问答", 1),
        ("20230204", "gamedev.20230204.66.问答", 1),
        ("20230204", "linguistics.20230204.95.问答", 1),
        ("20230204", "datascience.20230204.41.问答", 1),
        ("20230204", "coffee.20230204.30.问答", 1),
        ("20230204", "german.20230204.70.问答", 1),
        ("20230204", "italian.20230204.85.问答", 1),
        ("20230204", "health.20230204.75.问答", 1),
        ("20230204", "gaming.20230204.67.问答", 2),
        ("20230204", "dba.20230204.42.问答", 2),
        ("20230204", "bitcoin.20230204.17.问答", 1),
        ("20230204", "anime.20230204.5.问答", 1),
        ("20230204", "iot.20230204.82.问答", 1),
        ("20230204", "avpaviation.20230204.11.问答", 1),
        ("20230204", "hsm.20230204.80.问答", 1),
        ("20230204", "english.20230204.56.问答", 3),
        ("20230204", "eosio.20230204.57.问答", 1),
        ("20230204", "astronomy.20230204.9.问答", 1),
        ("20230204", "materials.20230204.99.问答", 1),
        ("20230204", "homebrew.20230204.79.问答", 1),
        ("20230204", "elementaryos.20230204.52.问答", 1),
        ("20230204", "korean.20230204.90.问答", 1),
        ("20230204", "academia.20230204.2.问答", 2),
        ("20230204", "expatriates.20230204.61.问答", 1),
        ("20230204", "islam.20230204.84.问答", 1),
        ("20230204", "cogsci.20230204.31.问答", 1),
        ("20230204", "ethereum.20230204.60.问答", 1),
        ("20230204", "cardano.20230204.22.问答", 1),
        ("20230204", "bioinformatics.20230204.15.问答", 1),
        ("20230204", "magento.20230204.97.问答", 2),
        ("20230204", "literature.20230204.96.问答", 1),
        ("20230204", "ebooks.20230204.49.问答", 1),
        ("20230204", "devops.20230204.43.问答", 1),
        ("20230204", "economics.20230204.50.问答", 1),
        ("20230204", "chemistry.20230204.23.问答", 1),
        ("20230204", "judaism.20230204.89.问答", 1),
        ("20230204", "iota.20230204.83.问答", 1),
        ("20230204", "ell.20230204.53.问答", 2),
        ("20230204", "diy.20230204.44.问答", 2),
        ("20230204", "chess.20230204.24.问答", 1),
        ("20230204", "boardgames.20230204.19.问答", 1),
        ("20230204", "freelancing.20230204.64.问答", 1),
        ("20230204", "computergraphics.20230204.32.问答", 1),
        ("20230204", "engineering.20230204.55.问答", 1),
        ("20230204", "hermeneutics.20230204.76.问答", 1),
        ("20230204", "codereview.20230204.29.问答", 4),
        ("20230204", "civicrm.20230204.27.问答", 1),
        ("20230204", "expressionengine.20230204.62.问答", 1),
        ("20230204", "hinduism.20230204.77.问答", 1),
        ("20230204", "lifehacks.20230204.94.问答", 1),
        ("20230204", "chinese.20230204.25.问答", 1),
        ("20230204", "interpersonal.20230204.81.问答", 1),
        ("20230204", "blender.20230204.18.问答", 1),
        ("20230204", "emacs.20230204.54.问答", 1),
        ("20230204", "cstheory.20230204.40.问答", 1),
        ("20230204", "history.20230204.78.问答", 1),
        ("20230204", "earthscience.20230204.48.问答", 1),
        ("20230204", "askubuntu.20230204.8.问答", 5),
        ("20230204", "gis.20230204.71.问答", 2),
        ("20230204", "japanese.20230204.87.问答", 1),
        ("20230204", "gardening.20230204.68.问答", 1),
        ("20230204", "biology.20230204.16.问答", 1),
        ("20230204", "joomla.20230204.88.问答", 1),
        ("20230204", "drones.20230204.45.问答", 1),
        ("20230204", "latin.20230204.92.问答", 1),
        ("20230204", "buddhism.20230204.21.问答", 1),
        ("20230204", "apple.20230204.6.问答", 2),
        ("20230204", "ai.20230204.3.问答", 1),
        ("20230204", "es.20230204.58.问答", 4),
        ("20230204", "cooking.20230204.34.问答", 1),
        ("20230204", "crypto.20230204.37.问答", 1),
        ("20230204", "christianity.20230204.26.问答", 1),
        ("20230204", "bricks.20230204.20.问答", 1),
        ("20230204", "bicycles.20230204.13.问答", 1),
        ("20230204", "codegolf.20230204.28.问答", 4),
        ("20230204", "martialarts.20230204.98.问答", 1),
        ("20230204", "dsp.20230204.47.问答", 1),
        ("20230204", "bioacoustics.20230204.14.问答", 1),
        ("20230204", "electronics.20230204.51.问答", 4),
        ("20230204", "crafts.20230204.36.问答", 1),
        ("20230204", "ham.20230204.73.问答", 1),
        ("20230204", "cs.20230204.38.问答", 1),
        ("20230204", "beer.20230204.12.问答", 1),
        ("20230204", "graphicdesign.20230204.72.问答", 1),
        ("20230204", "french.20230204.65.问答", 1),
        ("20230204", "languagelearning.20230204.91.问答", 1),
        ("20230204", "genealogy.20230204.69.问答", 1),
        ("20230204", "3dprinting.20230204.1.问答", 1),
        ("20230204", "hardwarerecs.20230204.74.问答", 1),
        ("20230204", "law.20230204.93.问答", 1),
        ("20230204", "fitness.20230204.63.问答", 1),
        ("20230204", "esperanto.20230204.59.问答", 1),
        ("20230204", "craftcms.20230204.35.问答", 1),
        ("20230204", "android.20230204.4.问答", 1),
        ("20230204", "arduino.20230204.7.问答", 1),
        ("20230204", "ja.20230204.86.问答", 1),
        ("20230204", "cseducators.20230204.39.问答", 1),
        ("20230205", "writers.20230205.80.问答", 1),
        ("20230205", "scifi.20230205.43.问答", 2),
        ("20230205", "opensource.20230205.15.问答", 1),
        ("20230205", "outdoors.20230205.17.问答", 1),
        ("20230205", "quantumcomputing.20230205.32.问答", 1),
        ("20230205", "salesforce.20230205.41.问答", 2),
        ("20230205", "philosophy.20230205.21.问答", 1),
        ("20230205", "or.20230205.16.问答", 1),
        ("20230205", "tex.20230205.63.问答", 5),
        ("20230205", "music.20230205.10.问答", 1),
        ("20230205", "russian.20230205.40.问答", 1),
        ("20230205", "superuser.20230205.61.问答", 6),
        ("20230205", "tridion.20230205.67.问答", 1),
        ("20230205", "stackapps.20230205.57.问答", 1),
        ("20230205", "webmasters.20230205.74.问答", 1),
        ("20230205", "musicfans.20230205.11.问答", 1),
        ("20230205", "pm.20230205.24.问答", 1),
        ("20230205", "sitecore.20230205.47.问答", 1),
        ("20230205", "space.20230205.53.问答", 1),
        ("20230205", "sharepoint.20230205.46.问答", 2),
        ("20230205", "windowsphone.20230205.75.问答", 1),
        ("20230205", "unix.20230205.69.问答", 4),
        ("20230205", "mathematica.20230205.3.问答", 2),
        ("20230205", "ux.20230205.70.问答", 1),
        ("20230205", "moderators.20230205.6.问答", 1),
        ("20230205", "vi.20230205.72.问答", 1),
        ("20230205", "sustainability.20230205.62.问答", 1),
        ("20230205", "rus.20230205.39.问答", 1),
        ("20230205", "money.20230205.8.问答", 1),
        ("20230205", "travel.20230205.66.问答", 1),
        ("20230205", "tezos.20230205.64.问答", 1),
        ("20230205", "movies.20230205.9.问答", 1),
        ("20230205", "retrocomputing.20230205.34.问答", 1),
        ("20230205", "robotics.20230205.36.问答", 1),
        ("20230205", "mathoverflow.20230205.4.问答", 3),
        ("20230205", "rpg.20230205.37.问答", 2),
        ("20230205", "networkengineering.20230205.13.问答", 1),
        ("20230205", "monero.20230205.7.问答", 1),
        ("20230205", "math.20230205.1.问答", 20),
        ("20230205", "quant.20230205.31.问答", 1),
        ("20230205", "photo.20230205.22.问答", 1),
        ("20230205", "scicomp.20230205.42.问答", 1),
        ("20230205", "softwarerecs.20230205.50.问答", 1),
        ("20230205", "patents.20230205.19.问答", 1),
        ("20230205", "workplace.20230205.78.问答", 2),
        ("20230205", "sqa.20230205.56.问答", 1),
        ("20230205", "portuguese.20230205.27.问答", 1),
        ("20230205", "woodworking.20230205.76.问答", 1),
        ("20230205", "tor.20230205.65.问答", 1),
        ("20230205", "worldbuilding.20230205.79.问答", 3),
        ("20230205", "reverseengineering.20230205.35.问答", 1),
        ("20230205", "proofassistants.20230205.28.问答", 1),
        ("20230205", "spanish.20230205.54.问答", 1),
        ("20230205", "security.20230205.44.问答", 2),
        ("20230205", "skeptics.20230205.48.问答", 1),
        ("20230205", "sound.20230205.52.问答", 1),
        ("20230205", "wordpress.20230205.77.问答", 2),
        ("20230205", "mythology.20230205.12.问答", 1),
        ("20230205", "serverfault.20230205.45.问答", 5),
        ("20230205", "pets.20230205.20.问答", 1),
        ("20230205", "solana.20230205.51.问答", 1),
        ("20230205", "webapps.20230205.73.问答", 1),
        ("20230205", "vegetarianism.20230205.71.问答", 1),
        ("20230205", "puzzling.20230205.30.问答", 1),
        ("20230205", "politics.20230205.26.问答", 1),
        ("20230205", "pt.20230205.29.问答", 3),
        ("20230205", "matheducators.20230205.2.问答", 1),
        ("20230205", "ru.20230205.38.问答", 6),
        ("20230205", "physics.20230205.23.问答", 4),
        ("20230205", "substrate.20230205.60.问答", 1),
        ("20230205", "parenting.20230205.18.问答", 1),
        ("20230205", "stats.20230205.58.问答", 3),
        ("20230205", "sports.20230205.55.问答", 1),
        ("20230205", "mechanics.20230205.5.问答", 1),
        ("20230205", "raspberrypi.20230205.33.问答", 1),
        ("20230205", "stellar.20230205.59.问答", 1),
        ("20230205", "ukrainian.20230205.68.问答", 1),
        ("20230205", "opendata.20230205.14.问答", 1),
        ("20230205", "poker.20230205.25.问答", 1),
        ("20230205", "softwareengineering.20230205.49.问答", 3),
    ]
]

MNBVC_META['qa_stackexchange']['files'] += [
    [f'{base_url}/qa/stackexchange/{folder}/{i}.jsonl.gz' for i in range(beg, end)]
    for folder, beg, end in [('20230199', 1, 81)]
]

MNBVC_META['qa_stackexchange']['files'] = _flatten_list(MNBVC_META['qa_stackexchange']['files'])
MNBVC_META['qa_stackexchange']['feature_type'] = "QA_CORPUS"
MNBVC_META['qa_stackexchange']['description'] = "QA data from StackExchange"

# =============================
# qa_chatgpt
MNBVC_META['qa_chatgpt'] = {}
MNBVC_META['qa_chatgpt']['files'] = [
    f'{base_url}/qa/chatgpt/20230211/chatgpt_baiduzhidao_2023-10-04-08-35-40_01.jsonl.gz',
    f'{base_url}/qa/chatgpt/20230211/chatgpt_baiduzhidao_2023-10-04-08-35-40_02.jsonl.gz',
    f'{base_url}/qa/chatgpt/20230211/chatgpt_baiduzhidao_2023-10-04-08-35-40_03.jsonl.gz',
    f'{base_url}/qa/chatgpt/20230211/chatgpt_baiduzhidao_2023-10-04-08-35-40_04.jsonl.gz',
    f'{base_url}/qa/chatgpt/20230211/chatgpt_baiduzhidao_2023-10-04-08-35-40_05.jsonl.gz',
    f'{base_url}/qa/chatgpt/20230211/chatgpt_baiduzhidao_2023-10-04-08-35-40_06.jsonl.gz',
    f'{base_url}/qa/chatgpt/20230211/chatgpt_baiduzhidao_2023-10-04-08-35-40_07.jsonl.gz',
    f'{base_url}/qa/chatgpt/20230211/chatgpt_baiduzhidao_2023-10-04-08-35-40_08.jsonl.gz',
    f'{base_url}/qa/chatgpt/20230211/chatgpt_baiduzhidao_2023-10-04-08-35-40_09.jsonl.gz',
    f'{base_url}/qa/chatgpt/20230211/chatgpt_baiduzhidao_2023-10-04-08-35-40_10.jsonl.gz',
    f'{base_url}/qa/chatgpt/20230211/chatgpt_baiduzhidao_2023-10-04-08-35-40_11.jsonl.gz',
]

MNBVC_META['qa_chatgpt']['feature_type'] = "QA_CORPUS"
MNBVC_META['qa_chatgpt']['description'] = "QA data made by ChatGPT"


# =============================
# math_qa
MNBVC_META['math_qa'] = {}
MNBVC_META['math_qa']['files'] = [
    f'{base_url}/math/qa/20230206/math_qa.jsonl.gz'
]

MNBVC_META['math_qa']['feature_type'] = "QA_CORPUS"
MNBVC_META['math_qa']['description'] = "QA data in the math domain"


# =============================
# math_chat
MNBVC_META['math_chat'] = {}
MNBVC_META['math_chat']['files'] = [
    f'{base_url}/math/conv/20230206/math_cot.jsonl.gz'
]

MNBVC_META['math_chat']['feature_type'] = "FORUM_DIALOGUE"
MNBVC_META['math_chat']['description'] = "Chain-of-thought conversation data in the math domain"


# =============================
# crawler_oscar
MNBVC_META['crawler_oscar'] = {}
MNBVC_META['crawler_oscar']['files'] = []
MNBVC_META['crawler_oscar']['files'] += [
    f'{base_url}/crawler/oscar/20230207/oscar_202201.part_00{i:02}.jsonl.gz' for i in range(25)
]
MNBVC_META['crawler_oscar']['files'] += [
    f'{base_url}/crawler/oscar/20230208/oscar_202201.part_00{i:02}.jsonl.gz' for i in range(25, 50)
]
MNBVC_META['crawler_oscar']['files'] += [
    f'{base_url}/crawler/oscar/20230209/oscar_202201.part_00{i:02}.jsonl.gz' for i in range(50, 75)
]
MNBVC_META['crawler_oscar']['files'] += [
    f'{base_url}/crawler/oscar/20230210/oscar_202201.part_00{i:02}.jsonl.gz' for i in range(75, 100)
]

MNBVC_META['crawler_oscar']['feature_type'] = "TEXT_CORPUS"
MNBVC_META['crawler_oscar']['description'] = "General text corpus from common crawl"
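
# Sketch: a self-check of the table above, assuming every entry carries the
# three keys that MNBVC.py consumes ('files', 'feature_type', 'description')
# and that every file URL resolves under base_url.
if __name__ == '__main__':
    for name, meta in MNBVC_META.items():
        assert {'files', 'feature_type', 'description'} <= set(meta), name
        assert all(url.startswith(base_url) for url in meta['files']), name
    print(f'{len(MNBVC_META)} sub-datasets configured')
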
@@ -0,0 +1,11 @@
import os

# for i in range(46):
#     os.system("wget https://huggingface.co/datasets/liwu/MNBVC/resolve/main/wiki/20230197/" + str(i) + ".jsonl.gz")


# for i in range(46, 78):
#     os.system("wget https://huggingface.co/datasets/liwu/MNBVC/resolve/main/wiki/20230198/" + str(i) + ".jsonl.gz")

os.system("wget https://huggingface.co/datasets/liwu/MNBVC/resolve/main/wiki/20230197/36.jsonl.gz")
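
# Sketch: the same shard can be fetched through huggingface_hub instead of
# shelling out to wget, which adds local caching (assumes the huggingface_hub
# package is installed; kept commented out so the script's behavior is unchanged).
# from huggingface_hub import hf_hub_download
# path = hf_hub_download(repo_id="liwu/MNBVC", repo_type="dataset",
#                        filename="wiki/20230197/36.jsonl.gz")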