From 122cbd9ff8bcc211bbfaebb20499f58aee111080 Mon Sep 17 00:00:00 2001 From: Colin Date: Sat, 24 Feb 2024 14:14:12 +0800 Subject: [PATCH] Use local tokenizer. --- custom_models/gpt2/tokenizer_config.json | 1 + lit_train.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 custom_models/gpt2/tokenizer_config.json diff --git a/custom_models/gpt2/tokenizer_config.json b/custom_models/gpt2/tokenizer_config.json new file mode 100644 index 0000000..be4d21d --- /dev/null +++ b/custom_models/gpt2/tokenizer_config.json @@ -0,0 +1 @@ +{"model_max_length": 1024} \ No newline at end of file diff --git a/lit_train.py b/lit_train.py index c7e9873..c412c1b 100644 --- a/lit_train.py +++ b/lit_train.py @@ -182,7 +182,7 @@ if __name__ == "__main__": lit_module = LitModule(args.model_name, "./custom_models/gpt2", args.learning_rate, args.use_tril_attention_mask) # datasets - tokenizer = load_tokenizer(args.tokenizer_name_or_path) + tokenizer = load_tokenizer("./custom_models/gpt2") train_dataset_list = [] val_dataset_list = [] for dataset_name in args.dataset_name: