From b62444a9dc8058b869406a563933baecc362fbfe Mon Sep 17 00:00:00 2001 From: Colin <> Date: Sun, 10 Aug 2025 14:30:51 +0800 Subject: [PATCH] Refine meaning dataset import. --- wit/meaning/__init__.py | 0 wit/meaning/meaning_dataset.py | 21 ++++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 wit/meaning/__init__.py diff --git a/wit/meaning/__init__.py b/wit/meaning/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wit/meaning/meaning_dataset.py b/wit/meaning/meaning_dataset.py index fbb3c0d..f80a09c 100644 --- a/wit/meaning/meaning_dataset.py +++ b/wit/meaning/meaning_dataset.py @@ -1,12 +1,13 @@ import os import torch, datasets +import tracemalloc import math, gc, time, random, copy from itertools import chain from typing import Dict, Tuple from torch.utils.data import ConcatDataset, DataLoader, Dataset, random_split import numpy as np from torch.utils.data import BatchSampler -from meaning.node_tree import NodeTree +from node_tree import NodeTree class MeaningMap: @@ -518,6 +519,24 @@ class BatchGroupMeaningDataloader(Dataset): if __name__ == "__main__": + tracemalloc.start() + + md = MeaningDataset( + 100000, + 300000, + min_subitem=2, + max_subitem=6, + vocab_size=32, + size=1024, + stride=2, + with_tree=False, + use_cache=True, + ) + current, peak = tracemalloc.get_traced_memory() + print(f"当前内存使用: {current / 1024 / 1024 / 1024:.4f} GB") + print(f"峰值内存使用: {peak / 1024 / 1024 / 1024:.4f} GB") + tracemalloc.stop() + md = MeaningDataset(100000, 115200, vocab_size=32, size=1024, stride=2, with_tree=False, use_cache=False) item = md.__getitem__(920) mm = md.get_meaning_map()