import sys

sys.path.append("..")

import json

import torch
from tools import show
from chatglm import ChatGLMTokenizer
from modelscope import snapshot_download

# Download the ChatGLM3-6B checkpoint (cached locally by modelscope); the
# path is only recorded in the tokenizer's metadata below.
pretrained_model_name_or_path = snapshot_download("ZhipuAI/chatglm3-6b")

# Build the tokenizer by hand from the local config instead of calling
# from_pretrained(), so every file it loads is explicit.
tokenizer_config_file = "./tokenizer_config.json"
with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
    init_kwargs = json.load(tokenizer_config_handle)

init_kwargs.pop("tokenizer_class", None)
init_inputs = init_kwargs.pop("init_inputs", ())
# Point the tokenizer at the local SentencePiece model and disable all
# auxiliary files so nothing is fetched implicitly.
init_kwargs["vocab_file"] = "./tokenizer.model"
init_kwargs["added_tokens_file"] = None
init_kwargs["special_tokens_map_file"] = None
init_kwargs["tokenizer_file"] = None
init_kwargs["name_or_path"] = pretrained_model_name_or_path
tokenizer = ChatGLMTokenizer(*init_inputs, **init_kwargs)

# Round-trip experiments: encode a rare CJK character and decode a raw id
# sequence back to text.
a = tokenizer.encode("骉")
b = tokenizer.decode([236, 173, 140])

# Dump the whole vocabulary ("id : decoded text", one entry per id) to a log
# file for inspection.
token = []
for i in range(64798):
    token.append(str(i) + " : " + tokenizer.decode(i))
show.DumpListToFile(token, "../generated/token.log")

# Scratch from a generation loop (needs real model hidden states to run):
# print("=======================")
# for i in range(hidden_states_en.shape[0]):
#     hidden_states = hidden_states_en[i : i + 1]
#     lm_logits = self.output_layer(hidden_states)
#     lm_logits = lm_logits.transpose(0, 1).contiguous()
#     next_token_logits = lm_logits[:, -1, :]
#     probs = nn.functional.softmax(next_token_logits, dim=-1)
#     next_t = torch.multinomial(probs, num_samples=1).squeeze(1)
#     response = tokenizer.decode(next_t)
#     print(response)
#     # name = "generated/next_tokens" + str(token_count) + "_" + response + "_.png"
#     # show.DumpTensorToImage(next_token_logits[0], name)
#     # token_count = token_count + 1
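
# ---------------------------------------------------------------------------
# Sketch (not part of the original script): a self-contained version of the
# next-token sampling logic in the commented-out block above, driven by
# random tensors so it runs without the model.  hidden_size, seq_len and the
# Linear output layer are illustrative assumptions, not ChatGLM3's real
# configuration; only the tensor manipulation mirrors the block above.
# ---------------------------------------------------------------------------
from torch import nn

hidden_size, seq_len, vocab_size = 8, 4, 64798
output_layer = nn.Linear(hidden_size, vocab_size, bias=False)  # stand-in for the model's output head
# Fake final hidden states, shaped [seq_len, batch, hidden] to match the
# transpose(0, 1) in the block above.
fake_hidden_states = torch.randn(seq_len, 1, hidden_size)
lm_logits = output_layer(fake_hidden_states).transpose(0, 1).contiguous()
next_token_logits = lm_logits[:, -1, :]  # keep logits for the last position only
probs = nn.functional.softmax(next_token_logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1).squeeze(1)  # one sampled id per batch row
print(tokenizer.decode(next_token.tolist()))  # decode the sampled id back to text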