# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
MNBVC: Massive Never-ending BT Vast Chinese corpus
"""

import json
import datasets
import numpy as np
import traceback

from .meta import MNBVC_META
from .features import Features


_CITATION = """\
"""

_DESCRIPTION = """\
MNBVC: Massive Never-ending BT Vast Chinese corpus
"""

_HOMEPAGE = "https://github.com/esbatmop/MNBVC"

_LICENSE = "MIT"


class MNBVC(datasets.GeneratorBasedBuilder):
    """Massive Never-ending BT Vast Chinese corpus."""

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name=key, version=datasets.Version("0.0.1"), description=value['description'])
        for key, value in MNBVC_META.items()
    ]
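
    # For orientation only: judging from how MNBVC_META is used in this script
    # (value['description'] above, and ['files'] / ['feature_type'] below), each
    # entry is assumed to look roughly like the sketch here. The real keys and
    # values live in the sibling `meta.py` module; the URL and feature type
    # below are purely illustrative.
    #
    #     MNBVC_META = {
    #         "law_judgement": {
    #             "description": "...",
    #             "files": ["https://example.com/law_judgement.0.jsonl.gz"],
    #             "feature_type": "TEXT_CORPUS_LEGACY",
    #         },
    #         ...
    #     }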

    def _info(self):
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types.
            # The columns differ between configurations, so they are looked up in
            # the imported `Features` mapping by the configuration's feature type.
            features=Features[MNBVC_META[self.config.name]['feature_type']],
            # If there's a common (input, target) tuple from the features, uncomment
            # the supervised_keys line below and specify them. They'll be used if
            # as_supervised=True in builder.as_dataset.
            # supervised_keys=("sentence", "label"),
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )
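
    # A minimal sketch of what the imported `Features` mapping is assumed to
    # provide, inferred from how it is indexed in `_info` above (feature type
    # name -> `datasets.Features`). The real definitions live in the sibling
    # `features.py` module; the key name here is illustrative, while the "text"
    # and "meta" columns match what the 'law_judgement' branch yields below.
    #
    #     Features = {
    #         "TEXT_CORPUS_LEGACY": datasets.Features({
    #             "text": datasets.Value("string"),
    #             "meta": datasets.Value("string"),
    #         }),
    #         ...
    #     }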

    def _split_generators(self, dl_manager):
        data_dir = dl_manager.download_and_extract(MNBVC_META[self.config.name]['files'])

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_files": data_dir,
                },
            ),
        ]
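
    # Note on data flow: `download_and_extract` receives the list of remote files
    # from MNBVC_META and, for a list input, returns the matching list of local
    # paths. That list is what `_generate_examples` receives below as `data_files`.
    # (This describes standard `datasets.DownloadManager` behaviour, not anything
    # specific to this script.)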

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, data_files):
        id = 0
        features = self._info().features
        feature_keys = set(features.keys())

        def _drop_unused_keys(data):
            # Remove any top-level keys that are not declared in the features,
            # so that every yielded example matches the schema exactly.
            rm_keys = []
            for key in data.keys():
                if key not in feature_keys:
                    rm_keys.append(key)
            for key in rm_keys:
                del data[key]
            return data
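
        # For the 'law_judgement' config, each JSONL line is assumed to be one
        # JSON object whose '详情' ("details") field holds the full judgement
        # text; every other field is kept and serialized into "meta". Shape
        # sketch (illustrative only):
        #
        #     {"详情": "<full judgement text>", ...other metadata fields...}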

        try:
            for file_i, data_file in enumerate(data_files):
                with open(data_file, encoding="utf-8") as f:
                    for line_i, line in enumerate(f):
                        id += 1
                        data = json.loads(line)

                        if self.config.name == 'law_judgement':
                            text = data['详情']
                            del data['详情']

                            yield id, {
                                "text": text,
                                "meta": json.dumps(data, ensure_ascii=False),
                            }
                        else:
                            data = _drop_unused_keys(data)
                            if 'simhash' in data:  # for issue https://github.com/huggingface/datasets/issues/6007
                                data['simhash'] = str(data['simhash'])

                            yield id, data
        except Exception as e:
            error_msg = 'oops, we found an error while loading the dataset\n'
            error_msg += f'Dataset: {self.config.name}\n'
            error_msg += f'Data File: {file_i} {data_file}\n'
            error_msg += f'Row: {line_i}'
            print(error_msg)
            traceback.print_exc()

            raise e
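

# Usage sketch (not executed by the loading script itself): assuming this script
# is published as a dataset repository on the Hugging Face Hub, a single config
# can be loaded as below. The repository id "liwu/MNBVC" is an assumption;
# substitute the actual repo id, and "law_judgement" with any key in MNBVC_META.
#
#     from datasets import load_dataset
#
#     dataset = load_dataset("liwu/MNBVC", "law_judgement", split="train", streaming=True)
#     for example in dataset.take(2):
#         print(example["text"][:80], example["meta"])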