Use local tokenizer.

Colin 2024-02-24 14:14:12 +08:00
parent ac61c4d925
commit 122cbd9ff8
2 changed files with 2 additions and 1 deletion


@@ -0,0 +1 @@
+{"model_max_length": 1024}


@@ -182,7 +182,7 @@ if __name__ == "__main__":
     lit_module = LitModule(args.model_name, "./custom_models/gpt2", args.learning_rate, args.use_tril_attention_mask)
 
     # datasets
-    tokenizer = load_tokenizer(args.tokenizer_name_or_path)
+    tokenizer = load_tokenizer("./custom_models/gpt2")
     train_dataset_list = []
     val_dataset_list = []
     for dataset_name in args.dataset_name:
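The change pins the tokenizer to the local checkout instead of the --tokenizer_name_or_path argument, matching the hardcoded model path passed to LitModule. The body of load_tokenizer is not part of this diff; a plausible sketch, assuming it wraps AutoTokenizer and patches the pad token that GPT-2 tokenizers lack:

    # Hypothetical implementation of load_tokenizer, for illustration only.
    from transformers import AutoTokenizer

    def load_tokenizer(tokenizer_name_or_path: str):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
        if tokenizer.pad_token is None:
            # GPT-2 tokenizers ship without a pad token; reuse EOS so
            # batched training with padding works.
            tokenizer.pad_token = tokenizer.eos_token
        return tokenizer

With the path hardcoded, args.tokenizer_name_or_path no longer affects which tokenizer is used.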