Remove weight and height from the meaning dataset.

This commit is contained in:
Colin 2025-08-17 14:43:13 +08:00
parent b2fe00c157
commit ee30eb4aab
1 changed file with 1 addition and 17 deletions

View File

@ -73,15 +73,13 @@ class MeaningMap:
): ):
print("Mapping Load from disk cache: " + file) print("Mapping Load from disk cache: " + file)
slhwm = np.load(file_prop) slhwm = np.load(file_prop)
self.ms_map = slhwm[:, 4:] self.ms_map = slhwm[:, 2:]
self.ms_data = np.load(file_data) self.ms_data = np.load(file_data)
self.ms_start = slhwm[:, 0] self.ms_start = slhwm[:, 0]
self.ms_len = slhwm[:, 1] self.ms_len = slhwm[:, 1]
self.ms_level = np.load(file_level) self.ms_level = np.load(file_level)
self.ms_rank_idx = np.load(file_rank_idx) self.ms_rank_idx = np.load(file_rank_idx)
self.ms_rank_all = np.load(file_rank_all) self.ms_rank_all = np.load(file_rank_all)
self.ms_height = slhwm[:, 2]
self.ms_weight = slhwm[:, 3]
print("Mapping Load end, elapsed:" + str(time.time() - start_time) + "s") print("Mapping Load end, elapsed:" + str(time.time() - start_time) + "s")
else: else:
print("Mapping Disk cache miss, build new one. size:" + str(size)) print("Mapping Disk cache miss, build new one. size:" + str(size))
@ -112,8 +110,6 @@ class MeaningMap:
ms_start = np.zeros((size), dtype=np.int32) # meaning sequence start ms_start = np.zeros((size), dtype=np.int32) # meaning sequence start
ms_end = np.zeros((size), dtype=np.int32) # meaning sequence end ms_end = np.zeros((size), dtype=np.int32) # meaning sequence end
ms_len = np.zeros((size), dtype=np.int32) # meaning sequence len ms_len = np.zeros((size), dtype=np.int32) # meaning sequence len
ms_height = np.zeros((size), dtype=np.int32) # meaning tree height
ms_weight = np.zeros((size), dtype=np.int32) # meaning tree weight
ms_data = np.zeros((datastep), dtype=np.int32) # meaning sequence ms_data = np.zeros((datastep), dtype=np.int32) # meaning sequence
ms_level = np.zeros((datastep), dtype=np.uint32) # meaning level, vocab's level is 0 ms_level = np.zeros((datastep), dtype=np.uint32) # meaning level, vocab's level is 0
ms_rank_idx = np.zeros((datastep), dtype=np.uint32) # meaning index of all level ms_rank_idx = np.zeros((datastep), dtype=np.uint32) # meaning index of all level
@ -134,8 +130,6 @@ class MeaningMap:
ms_start[i] = index ms_start[i] = index
ms_end[i] = index + stride ms_end[i] = index + stride
ms_len[i] = stride ms_len[i] = stride
ms_height[i] = 0
ms_weight[i] = 1
index = index + stride index = index + stride
for i in range(self.normal_vocab, size): for i in range(self.normal_vocab, size):
@ -185,8 +179,6 @@ class MeaningMap:
ms_start[i] = index ms_start[i] = index
ms_end[i] = end ms_end[i] = end
ms_len[i] = len_ma ms_len[i] = len_ma
ms_height[i] = max(ms_height[m_list]) + 1
ms_weight[i] = sum(ms_weight[m_list])
index = index + len_ma index = index + len_ma
if i % 10000 == 0: if i % 10000 == 0:
print(i) print(i)
@ -199,15 +191,11 @@ class MeaningMap:
np.save(file_rank_all, ms_rank_all) np.save(file_rank_all, ms_rank_all)
ms_start = np.array(ms_start).astype(np.int32) ms_start = np.array(ms_start).astype(np.int32)
ms_height = np.array(ms_height).astype(np.int32)
ms_weight = np.array(ms_weight).astype(np.int32)
ms_len = np.array(ms_len).astype(np.int32) ms_len = np.array(ms_len).astype(np.int32)
slhwm = np.concatenate( slhwm = np.concatenate(
( (
ms_start.reshape((-1, 1)), ms_start.reshape((-1, 1)),
ms_len.reshape((-1, 1)), ms_len.reshape((-1, 1)),
ms_height.reshape((-1, 1)),
ms_weight.reshape((-1, 1)),
map, map,
), ),
axis=1, axis=1,
@ -222,8 +210,6 @@ class MeaningMap:
self.ms_map = map # ms_map[i] = [sub(i),sub(i),sub(i),sub(i)...sub(i)] self.ms_map = map # ms_map[i] = [sub(i),sub(i),sub(i),sub(i)...sub(i)]
self.ms_start = ms_start self.ms_start = ms_start
self.ms_len = ms_len self.ms_len = ms_len
self.ms_height = ms_height
self.ms_weight = ms_weight
print("Mapping Disk cache build end, elapsed:" + str(time.time() - start_time) + "s") print("Mapping Disk cache build end, elapsed:" + str(time.time() - start_time) + "s")
def get_sequence(self, meaning): # return sequence[meaning] def get_sequence(self, meaning): # return sequence[meaning]
@ -352,8 +338,6 @@ class MeaningDataset(Dataset):
self.rank_all = [] self.rank_all = []
self.seq_meaning = [] self.seq_meaning = []
map = self.get_meaning_map() map = self.get_meaning_map()
self.m_height = map.ms_height
self.m_weight = map.ms_weight
if size: if size:
meanings = np.random.randint(start, end, size=(size)) meanings = np.random.randint(start, end, size=(size))
else: else: