# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
MNBVC: Massive Never-ending BT Vast Chinese corpus
"""

import json
import datasets
import numpy as np
import traceback

from .meta import MNBVC_META
from .features import Features


_CITATION = """\
"""

_DESCRIPTION = """\
MNBVC: Massive Never-ending BT Vast Chinese corpus
"""

_HOMEPAGE = "https://github.com/esbatmop/MNBVC"

_LICENSE = "MIT"


class MNBVC(datasets.GeneratorBasedBuilder):
    """Massive Never-ending BT Vast Chinese corpus."""

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name=key, version=datasets.Version("0.0.1"), description=value['description'])
        for key, value in MNBVC_META.items()
    ]
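
    # For orientation only: judging from how MNBVC_META is used in this script
    # (value['description'] above, and ['files'] / ['feature_type'] below), each
    # entry is assumed to look roughly like the sketch here. The real keys and
    # values live in the sibling `meta.py` module; the URL and feature type
    # below are purely illustrative.
    #
    #     MNBVC_META = {
    #         "law_judgement": {
    #             "description": "...",
    #             "files": ["https://example.com/law_judgement.0.jsonl.gz"],
    #             "feature_type": "TEXT_CORPUS_LEGACY",
    #         },
    #         ...
    #     }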

    def _info(self):
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types.
            # The columns differ between configurations, so they are looked up in
            # the imported `Features` mapping by the configuration's feature type.
            features=Features[MNBVC_META[self.config.name]['feature_type']],
            # If there's a common (input, target) tuple from the features, uncomment
            # the supervised_keys line below and specify them. They'll be used if
            # as_supervised=True in builder.as_dataset.
            # supervised_keys=("sentence", "label"),
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )
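
    # A minimal sketch of what the imported `Features` mapping is assumed to
    # provide, inferred from how it is indexed in `_info` above (feature type
    # name -> `datasets.Features`). The real definitions live in the sibling
    # `features.py` module; the key name here is illustrative, while the "text"
    # and "meta" columns match what the 'law_judgement' branch yields below.
    #
    #     Features = {
    #         "TEXT_CORPUS_LEGACY": datasets.Features({
    #             "text": datasets.Value("string"),
    #             "meta": datasets.Value("string"),
    #         }),
    #         ...
    #     }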

    def _split_generators(self, dl_manager):
        data_dir = dl_manager.download_and_extract(MNBVC_META[self.config.name]['files'])

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_files": data_dir,
                },
            ),
        ]
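
    # Note on data flow: `download_and_extract` receives the list of remote files
    # from MNBVC_META and, for a list input, returns the matching list of local
    # paths. That list is what `_generate_examples` receives below as `data_files`.
    # (This describes standard `datasets.DownloadManager` behaviour, not anything
    # specific to this script.)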

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, data_files):
        id = 0
        features = self._info().features
        feature_keys = set(features.keys())

        def _drop_unused_keys(data):
            # Remove any top-level keys that are not declared in the features,
            # so that every yielded example matches the schema exactly.
            rm_keys = []
            for key in data.keys():
                if key not in feature_keys:
                    rm_keys.append(key)
            for key in rm_keys:
                del data[key]
            return data
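
        # For the 'law_judgement' config, each JSONL line is assumed to be one
        # JSON object whose '详情' ("details") field holds the full judgement
        # text; every other field is kept and serialized into "meta". Shape
        # sketch (illustrative only):
        #
        #     {"详情": "<full judgement text>", ...other metadata fields...}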

        try:
            for file_i, data_file in enumerate(data_files):
                with open(data_file, encoding="utf-8") as f:
                    for line_i, line in enumerate(f):
                        id += 1
                        data = json.loads(line)

                        if self.config.name == 'law_judgement':
                            text = data['详情']
                            del data['详情']

                            yield id, {
                                "text": text,
                                "meta": json.dumps(data, ensure_ascii=False),
                            }
                        else:
                            data = _drop_unused_keys(data)
                            if 'simhash' in data:  # for issue https://github.com/huggingface/datasets/issues/6007
                                data['simhash'] = str(data['simhash'])

                            yield id, data
        except Exception as e:
            error_msg = 'oops, we found an error while loading the dataset\n'
            error_msg += f'Dataset: {self.config.name}\n'
            error_msg += f'Data File: {file_i} {data_file}\n'
            error_msg += f'Row: {line_i}'
            print(error_msg)
            traceback.print_exc()

            raise e
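

# Usage sketch (not executed by the loading script itself): assuming this script
# is published as a dataset repository on the Hugging Face Hub, a single config
# can be loaded as below. The repository id "liwu/MNBVC" is an assumption;
# substitute the actual repo id, and "law_judgement" with any key in MNBVC_META.
#
#     from datasets import load_dataset
#
#     dataset = load_dataset("liwu/MNBVC", "law_judgement", split="train", streaming=True)
#     for example in dataset.take(2):
#         print(example["text"][:80], example["meta"])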