# ChatGLM3 tokenizer inspection script
import sys

sys.path.append("..")

import json

import torch

from tools import show
from chatglm import ChatGLMTokenizer
from modelscope import snapshot_download
# Download the ChatGLM3-6B checkpoint via modelscope (cached locally) so the
# tokenizer assets referenced below exist on disk.
pretrained_model_name_or_path = snapshot_download("ZhipuAI/chatglm3-6b")

tokenizer_config_file = "./tokenizer_config.json"

# Rebuild the tokenizer from its JSON config by hand, mirroring what
# transformers' from_pretrained does, but forcing local file paths.
# NOTE: the original guarded this with `if tokenizer_config_file is not None:`,
# which is always true for a literal path — the dead guard is removed.
with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
    init_kwargs = json.load(tokenizer_config_handle)

# Strip keys that are metadata rather than constructor arguments.
init_kwargs.pop("tokenizer_class", None)
init_kwargs.pop("tokenizer_file", None)
init_inputs = init_kwargs.pop("init_inputs", ())

# Point the tokenizer at the local SentencePiece model and disable the
# auxiliary token files the config might otherwise reference.
init_kwargs["vocab_file"] = "./tokenizer.model"
init_kwargs["added_tokens_file"] = None
init_kwargs["special_tokens_map_file"] = None
init_kwargs["tokenizer_file"] = None
init_kwargs["name_or_path"] = pretrained_model_name_or_path

tokenizer = ChatGLMTokenizer(*init_inputs, **init_kwargs)
# Sanity probes for the freshly built tokenizer:
# encode a rare CJK character, and decode a fixed id triple.
# NOTE(review): these ids presumably correspond to a byte-level fallback
# sequence for a multi-byte UTF-8 character — confirm against the vocab.
a = tokenizer.encode("骉")
b = tokenizer.decode([236, 173, 140])
# Dump the decoded surface form of every token id to a log file for manual
# inspection.  64798 is the id range dumped here — presumably the ChatGLM3
# vocabulary size; TODO confirm against tokenizer.vocab_size.
VOCAB_DUMP_SIZE = 64798
# Comprehension replaces the original append loop (same output strings).
token = [str(i) + " : " + tokenizer.decode(i) for i in range(VOCAB_DUMP_SIZE)]
show.DumpListToFile(token, "../generated/token.log")
# print("=======================")
# for i in range(hidden_states_en.shape[0]):
#     hidden_states = hidden_states_en[i : i + 1]
#     lm_logits = self.output_layer(hidden_states)
#     lm_logits = lm_logits.transpose(0, 1).contiguous()
#
#     next_token_logits = lm_logits[:, -1, :]
#     probss = nn.functional.softmax(next_token_logits, dim=-1)
#     next_t = torch.multinomial(probss, num_samples=1).squeeze(1)
#
#     response = tokenizer.decode(next_t)
#     print(response)
#     # name = "generated/next_tokens" + str(token_count) + "_" + response + "_.png"
#     # show.DumpTensorToImage(next_token_logits[0], name)
#     # token_count = token_count + 1