Remove weight and height from the meaning dataset.

This commit is contained in:
Colin 2025-08-17 14:43:13 +08:00
parent b2fe00c157
commit ee30eb4aab
1 changed file with 1 addition and 17 deletions

View File

@@ -73,15 +73,13 @@ class MeaningMap:
):
print("Mapping Load from disk cache: " + file)
slhwm = np.load(file_prop)
self.ms_map = slhwm[:, 4:]
self.ms_map = slhwm[:, 2:]
self.ms_data = np.load(file_data)
self.ms_start = slhwm[:, 0]
self.ms_len = slhwm[:, 1]
self.ms_level = np.load(file_level)
self.ms_rank_idx = np.load(file_rank_idx)
self.ms_rank_all = np.load(file_rank_all)
self.ms_height = slhwm[:, 2]
self.ms_weight = slhwm[:, 3]
print("Mapping Load end, elapsed:" + str(time.time() - start_time) + "s")
else:
print("Mapping Disk cache miss, build new one. size:" + str(size))
@@ -112,8 +110,6 @@ class MeaningMap:
ms_start = np.zeros((size), dtype=np.int32) # meaning sequence start
ms_end = np.zeros((size), dtype=np.int32) # meaning sequence end
ms_len = np.zeros((size), dtype=np.int32) # meaning sequence len
ms_height = np.zeros((size), dtype=np.int32) # meaning tree height
ms_weight = np.zeros((size), dtype=np.int32) # meaning tree weight
ms_data = np.zeros((datastep), dtype=np.int32) # meaning sequence
ms_level = np.zeros((datastep), dtype=np.uint32) # meaning level, vocab's level is 0
ms_rank_idx = np.zeros((datastep), dtype=np.uint32) # meaning index of all level
@@ -134,8 +130,6 @@ class MeaningMap:
ms_start[i] = index
ms_end[i] = index + stride
ms_len[i] = stride
ms_height[i] = 0
ms_weight[i] = 1
index = index + stride
for i in range(self.normal_vocab, size):
@@ -185,8 +179,6 @@ class MeaningMap:
ms_start[i] = index
ms_end[i] = end
ms_len[i] = len_ma
ms_height[i] = max(ms_height[m_list]) + 1
ms_weight[i] = sum(ms_weight[m_list])
index = index + len_ma
if i % 10000 == 0:
print(i)
@@ -199,15 +191,11 @@ class MeaningMap:
np.save(file_rank_all, ms_rank_all)
ms_start = np.array(ms_start).astype(np.int32)
ms_height = np.array(ms_height).astype(np.int32)
ms_weight = np.array(ms_weight).astype(np.int32)
ms_len = np.array(ms_len).astype(np.int32)
slhwm = np.concatenate(
(
ms_start.reshape((-1, 1)),
ms_len.reshape((-1, 1)),
ms_height.reshape((-1, 1)),
ms_weight.reshape((-1, 1)),
map,
),
axis=1,
@@ -222,8 +210,6 @@ class MeaningMap:
self.ms_map = map # ms_map[i] = [sub(i),sub(i),sub(i),sub(i)...sub(i)]
self.ms_start = ms_start
self.ms_len = ms_len
self.ms_height = ms_height
self.ms_weight = ms_weight
print("Mapping Disk cache build end, elapsed:" + str(time.time() - start_time) + "s")
def get_sequence(self, meaning): # return sequence[meaning]
@@ -352,8 +338,6 @@ class MeaningDataset(Dataset):
self.rank_all = []
self.seq_meaning = []
map = self.get_meaning_map()
self.m_height = map.ms_height
self.m_weight = map.ms_weight
if size:
meanings = np.random.randint(start, end, size=(size))
else: