diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/chatglm/demo.py b/chatglm/demo.py
index 4392766..e20d978 100644
--- a/chatglm/demo.py
+++ b/chatglm/demo.py
@@ -1,5 +1,4 @@
 import sys
-
 sys.path.append("..")
 
 import json
diff --git a/chatglm/test_tokenizer.py b/chatglm/test_tokenizer.py
index 4242e53..145b125 100644
--- a/chatglm/test_tokenizer.py
+++ b/chatglm/test_tokenizer.py
@@ -1,5 +1,4 @@
 import sys
-
 sys.path.append("..")
 
 import json
diff --git a/qwen/finetune/.gitignore b/qwen/.gitignore
similarity index 100%
rename from qwen/finetune/.gitignore
rename to qwen/.gitignore
diff --git a/qwen/__init__.py b/qwen/__init__.py
new file mode 100644
index 0000000..59e4a5d
--- /dev/null
+++ b/qwen/__init__.py
@@ -0,0 +1,2 @@
+from qwen.modeling_qwen import QWenLMHeadModel
+from qwen.configuration_qwen import QWenConfig
\ No newline at end of file
diff --git a/qwen/finetune/data.json b/qwen/data.json
similarity index 100%
rename from qwen/finetune/data.json
rename to qwen/data.json
diff --git a/qwen/finetune/ds_config_zero2.json b/qwen/ds_config_zero2.json
similarity index 100%
rename from qwen/finetune/ds_config_zero2.json
rename to qwen/ds_config_zero2.json
diff --git a/qwen/finetune/ds_config_zero3.json b/qwen/ds_config_zero3.json
similarity index 100%
rename from qwen/finetune/ds_config_zero3.json
rename to qwen/ds_config_zero3.json
diff --git a/qwen/finetune/finetune.py b/qwen/finetune.py
similarity index 80%
rename from qwen/finetune/finetune.py
rename to qwen/finetune.py
index fecbbde..35e80a9 100644
--- a/qwen/finetune/finetune.py
+++ b/qwen/finetune.py
@@ -18,13 +18,16 @@ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 from accelerate.utils import DistributedType
 from modelscope import snapshot_download
 
+from modeling_qwen import QWenLMHeadModel
+
+
 IGNORE_TOKEN_ID = LabelSmoother.ignore_index
 
 
 @dataclass
 class ModelArguments:
     model_name_or_path: Optional[str] = field(default="qwen/Qwen-1_8B-Chat")
-    
+
 
 @dataclass
 class DataArguments:
@@ -101,12 +104,15 @@ def get_peft_state_maybe_zero_3(named_params, bias):
 
 local_rank = None
 
+
 def rank0_print(*args):
     if local_rank == 0:
         print(*args)
 
 
-def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str, bias="none"):
+def safe_save_model_for_hf_trainer(
+    trainer: transformers.Trainer, output_dir: str, bias="none"
+):
     """Collects the state dict and dump to disk."""
     # check if zero3 mode enabled
     if deepspeed.is_deepspeed_zero3_enabled():
@@ -126,16 +132,16 @@ def preprocess(
     sources,
     tokenizer: transformers.PreTrainedTokenizer,
     max_len: int,
-    system_message: str = "You are a helpful assistant."
+    system_message: str = "You are a helpful assistant.",
 ) -> Dict:
     roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}
 
     im_start = tokenizer.im_start_id
     im_end = tokenizer.im_end_id
-    nl_tokens = tokenizer('\n').input_ids
-    _system = tokenizer('system').input_ids + nl_tokens
-    _user = tokenizer('user').input_ids + nl_tokens
-    _assistant = tokenizer('assistant').input_ids + nl_tokens
+    nl_tokens = tokenizer("\n").input_ids
+    _system = tokenizer("system").input_ids + nl_tokens
+    _user = tokenizer("user").input_ids + nl_tokens
+    _assistant = tokenizer("assistant").input_ids + nl_tokens
 
     # Apply prompt templates
     input_ids, targets = [], []
@@ -144,20 +150,43 @@ def preprocess(
         source = source[1:]
 
         input_id, target = [], []
-        system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens
+        system = (
+            [im_start]
+            + _system
+            + tokenizer(system_message).input_ids
+            + [im_end]
+            + nl_tokens
+        )
         input_id += system
-        target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens
+        target += (
+            [im_start] + [IGNORE_TOKEN_ID] * (len(system) - 3) + [im_end] + nl_tokens
+        )
         assert len(input_id) == len(target)
         for j, sentence in enumerate(source):
             role = roles[sentence["from"]]
-            _input_id = tokenizer(role).input_ids + nl_tokens + \
-                tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens
+            _input_id = (
+                tokenizer(role).input_ids
+                + nl_tokens
+                + tokenizer(sentence["value"]).input_ids
+                + [im_end]
+                + nl_tokens
+            )
             input_id += _input_id
-            if role == '<|im_start|>user':
-                _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens
-            elif role == '<|im_start|>assistant':
-                _target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \
-                    _input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens
+            if role == "<|im_start|>user":
+                _target = (
+                    [im_start]
+                    + [IGNORE_TOKEN_ID] * (len(_input_id) - 3)
+                    + [im_end]
+                    + nl_tokens
+                )
+            elif role == "<|im_start|>assistant":
+                _target = (
+                    [im_start]
+                    + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids)
+                    + _input_id[len(tokenizer(role).input_ids) + 1 : -2]
+                    + [im_end]
+                    + nl_tokens
+                )
             else:
                 raise NotImplementedError
             target += _target
@@ -179,7 +208,9 @@ def preprocess(
 class SupervisedDataset(Dataset):
     """Dataset for supervised fine-tuning."""
 
-    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int):
+    def __init__(
+        self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
+    ):
         super(SupervisedDataset, self).__init__()
 
         rank0_print("Formatting inputs...")
@@ -204,7 +235,9 @@ class SupervisedDataset(Dataset):
 class LazySupervisedDataset(Dataset):
     """Dataset for supervised fine-tuning."""
 
-    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int):
+    def __init__(
+        self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
+    ):
         super(LazySupervisedDataset, self).__init__()
         self.tokenizer = tokenizer
         self.max_len = max_len
@@ -221,7 +254,9 @@ class LazySupervisedDataset(Dataset):
         if i in self.cached_data_dict:
             return self.cached_data_dict[i]
 
-        ret = preprocess([self.raw_data[i]["conversations"]], self.tokenizer, self.max_len)
+        ret = preprocess(
+            [self.raw_data[i]["conversations"]], self.tokenizer, self.max_len
+        )
         ret = dict(
             input_ids=ret["input_ids"][0],
             labels=ret["labels"][0],
@@ -233,7 +268,9 @@ class LazySupervisedDataset(Dataset):
 
 
 def make_supervised_data_module(
-    tokenizer: transformers.PreTrainedTokenizer, data_args, max_len,
+    tokenizer: transformers.PreTrainedTokenizer,
+    data_args,
+    max_len,
 ) -> Dict:
     """Make dataset and collator for supervised fine-tuning."""
     dataset_cls = (
@@ -267,7 +304,10 @@ def train():
     ) = parser.parse_args_into_dataclasses()
 
     # This serves for single-gpu qlora.
-    if getattr(training_args, 'deepspeed', None) and int(os.environ.get("WORLD_SIZE", 1))==1:
+    if (
+        getattr(training_args, "deepspeed", None)
+        and int(os.environ.get("WORLD_SIZE", 1)) == 1
+    ):
         training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED
 
     local_rank = training_args.local_rank
@@ -278,9 +318,7 @@ def train():
     if lora_args.q_lora:
         device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else "auto"
         if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled():
-            logging.warning(
-                "FSDP or ZeRO3 are incompatible with QLoRA."
-            )
+            logging.warning("FSDP or ZeRO3 are incompatible with QLoRA.")
 
     model_dir = snapshot_download(model_args.model_name_or_path)
 
@@ -294,19 +332,17 @@ def train():
 
 
     # Load model and tokenizer
-
-    model = transformers.AutoModelForCausalLM.from_pretrained(
+    model = QWenLMHeadModel.from_pretrained(
         model_dir,
         config=config,
         cache_dir=training_args.cache_dir,
         device_map=device_map,
         trust_remote_code=True,
-        quantization_config=GPTQConfig(
-            bits=4, disable_exllama=True
-        )
+        quantization_config=GPTQConfig(bits=4, disable_exllama=True)
         if training_args.use_lora and lora_args.q_lora
         else None,
     )
+
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         model_dir,
         cache_dir=training_args.cache_dir,
@@ -318,7 +354,7 @@ def train():
         tokenizer.pad_token_id = tokenizer.eod_id
 
     if training_args.use_lora:
-        if lora_args.q_lora or 'chat' in model_dir.lower():
+        if lora_args.q_lora or "chat" in model_dir.lower():
             modules_to_save = None
         else:
             modules_to_save = ["wte", "lm_head"]
@@ -329,7 +365,7 @@ def train():
             lora_dropout=lora_args.lora_dropout,
             bias=lora_args.lora_bias,
             task_type="CAUSAL_LM",
-            modules_to_save=modules_to_save  # This argument serves for adding new tokens.
+            modules_to_save=modules_to_save,  # This argument serves for adding new tokens.
         )
         if lora_args.q_lora:
             model = prepare_model_for_kbit_training(
@@ -357,7 +393,9 @@ def train():
     trainer.train()
     trainer.save_state()
 
-    safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias)
+    safe_save_model_for_hf_trainer(
+        trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias
+    )
 
 
 if __name__ == "__main__":
diff --git a/qwen/finetune/finetune_ds.sh b/qwen/finetune_ds.sh
old mode 100644
new mode 100755
similarity index 100%
rename from qwen/finetune/finetune_ds.sh
rename to qwen/finetune_ds.sh
diff --git a/qwen/finetune/finetune_lora_ds.sh b/qwen/finetune_lora_ds.sh
old mode 100644
new mode 100755
similarity index 100%
rename from qwen/finetune/finetune_lora_ds.sh
rename to qwen/finetune_lora_ds.sh
diff --git a/qwen/finetune/finetune_lora_single_gpu.sh b/qwen/finetune_lora_single_gpu.sh
old mode 100644
new mode 100755
similarity index 100%
rename from qwen/finetune/finetune_lora_single_gpu.sh
rename to qwen/finetune_lora_single_gpu.sh
diff --git a/qwen/finetune/finetune_qlora_ds.sh b/qwen/finetune_qlora_ds.sh
old mode 100644
new mode 100755
similarity index 100%
rename from qwen/finetune/finetune_qlora_ds.sh
rename to qwen/finetune_qlora_ds.sh
diff --git a/qwen/finetune/finetune_qlora_single_gpu.sh b/qwen/finetune_qlora_single_gpu.sh
old mode 100644
new mode 100755
similarity index 100%
rename from qwen/finetune/finetune_qlora_single_gpu.sh
rename to qwen/finetune_qlora_single_gpu.sh
diff --git a/tools/__init__.py b/tools/__init__.py
index e69de29..65d088a 100644
--- a/tools/__init__.py
+++ b/tools/__init__.py
@@ -0,0 +1 @@
+import show
\ No newline at end of file
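
Usage note (not part of the patch): with the finetune files flattened from qwen/finetune/ into qwen/ and the new qwen/__init__.py re-exporting the model classes, the package can be imported directly. Below is a minimal sketch, under assumptions: that modeling_qwen.py and configuration_qwen.py live in qwen/ and follow the standard transformers PreTrainedModel/PretrainedConfig API, and that the repo root is on sys.path. The checkpoint id is the default from ModelArguments in finetune.py.

    # Resolve the checkpoint locally via modelscope, as finetune.py does.
    from modelscope import snapshot_download

    from qwen import QWenConfig, QWenLMHeadModel

    model_dir = snapshot_download("qwen/Qwen-1_8B-Chat")

    # from_pretrained is a classmethod: it builds the model and loads the
    # weights in one step, so no bare QWenLMHeadModel(config) is needed first.
    config = QWenConfig.from_pretrained(model_dir)
    model = QWenLMHeadModel.from_pretrained(model_dir, config=config)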