diff --git a/qwen/demo.py b/qwen/demo.py
index 189e462..93fe06f 100644
--- a/qwen/demo.py
+++ b/qwen/demo.py
@@ -1,4 +1,3 @@
-
 import torch
 from modelscope import snapshot_download
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -11,7 +10,8 @@
 seed = 4321
 torch.manual_seed(seed)
 torch.cuda.manual_seed_all(seed)
-model_dir = snapshot_download("qwen/Qwen-1_8B-Chat")
+# model_dir = snapshot_download("qwen/Qwen-1_8B-Chat")
+model_dir = "/home/colin/.cache/modelscope/hub/qwen/Qwen-1_8B-Chat"
 
 config, kwargs = AutoConfig.from_pretrained(
     model_dir,
diff --git a/qwen/finetune/.gitignore b/qwen/finetune/.gitignore
new file mode 100644
index 0000000..cde80e8
--- /dev/null
+++ b/qwen/finetune/.gitignore
@@ -0,0 +1 @@
+output_qwen
\ No newline at end of file
diff --git a/qwen/finetune/data.json b/qwen/finetune/data.json
new file mode 100644
index 0000000..94e0d31
--- /dev/null
+++ b/qwen/finetune/data.json
@@ -0,0 +1,15 @@
+[
+  {
+    "id": "identity_0",
+    "conversations": [
+      {
+        "from": "user",
+        "value": "你好"
+      },
+      {
+        "from": "assistant",
+        "value": "我是一个语言模型,我叫通义千问。"
+      }
+    ]
+  }
+]
\ No newline at end of file
diff --git a/qwen/finetune/ds_config_zero2.json b/qwen/finetune/ds_config_zero2.json
new file mode 100644
index 0000000..4be2c0b
--- /dev/null
+++ b/qwen/finetune/ds_config_zero2.json
@@ -0,0 +1,52 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+
+    "zero_optimization": {
+        "stage": 2,
+        "offload_optimizer": {
+            "device": "none",
+            "pin_memory": true
+        },
+        "allgather_partitions": true,
+        "allgather_bucket_size": 2e8,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "reduce_bucket_size": 2e8,
+        "contiguous_gradients": true
+    },
+
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 100,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
\ No newline at end of file
diff --git a/qwen/finetune/ds_config_zero3.json b/qwen/finetune/ds_config_zero3.json
new file mode 100644
index 0000000..e30fe94
--- /dev/null
+++ b/qwen/finetune/ds_config_zero3.json
@@ -0,0 +1,59 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "none",
+            "pin_memory": true
+        },
+        "offload_param": {
+            "device": "none",
+            "pin_memory": true
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 100,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
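
A note on the "auto" values in both DeepSpeed configs: they are placeholders that HuggingFace's Trainer fills in from its own TrainingArguments at launch, so the learning rate, batch sizes and precision stay defined in one place. A minimal sketch of that mapping (the argument values here are copied from finetune_ds.sh below, not fixed by the configs themselves):

    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="output_qwen",
        learning_rate=1e-5,              # fills optimizer.params.lr ("auto")
        per_device_train_batch_size=1,   # fills train_micro_batch_size_per_gpu
        gradient_accumulation_steps=16,  # fills gradient_accumulation_steps
        bf16=True,                       # fills bf16.enabled
        deepspeed="finetune/ds_config_zero3.json",
    )
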
"gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 100, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/qwen/finetune/finetune.py b/qwen/finetune/finetune.py new file mode 100644 index 0000000..fecbbde --- /dev/null +++ b/qwen/finetune/finetune.py @@ -0,0 +1,364 @@ +# This code is based on the revised code from fastchat based on tatsu-lab/stanford_alpaca. + + +from dataclasses import dataclass, field +import json +import math +import logging +import os +from typing import Dict, Optional, List +import torch +from torch.utils.data import Dataset +from deepspeed import zero +from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +import transformers +from transformers import Trainer, GPTQConfig, deepspeed +from transformers.trainer_pt_utils import LabelSmoother +from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training +from accelerate.utils import DistributedType +from modelscope import snapshot_download + +IGNORE_TOKEN_ID = LabelSmoother.ignore_index + + +@dataclass +class ModelArguments: + model_name_or_path: Optional[str] = field(default="qwen/Qwen-1_8B-Chat") + + +@dataclass +class DataArguments: + data_path: str = field( + default=None, metadata={"help": "Path to the training data."} + ) + eval_data_path: str = field( + default=None, metadata={"help": "Path to the evaluation data."} + ) + lazy_preprocess: bool = False + + +@dataclass +class TrainingArguments(transformers.TrainingArguments): + cache_dir: Optional[str] = field(default=None) + optim: str = field(default="adamw_torch") + model_max_length: int = field( + default=8192, + metadata={ + "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)." 
+
+
+def maybe_zero_3(param):
+    if hasattr(param, "ds_id"):
+        assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE
+        with zero.GatheredParameters([param]):
+            param = param.data.detach().cpu().clone()
+    else:
+        param = param.detach().cpu().clone()
+    return param
+
+
+# Borrowed from peft.utils.get_peft_model_state_dict
+def get_peft_state_maybe_zero_3(named_params, bias):
+    if bias == "none":
+        to_return = {k: t for k, t in named_params if "lora_" in k}
+    elif bias == "all":
+        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
+    elif bias == "lora_only":
+        to_return = {}
+        maybe_lora_bias = {}
+        lora_bias_names = set()
+        for k, t in named_params:
+            if "lora_" in k:
+                to_return[k] = t
+                bias_name = k.split("lora_")[0] + "bias"
+                lora_bias_names.add(bias_name)
+            elif "bias" in k:
+                maybe_lora_bias[k] = t
+        for k, t in maybe_lora_bias.items():
+            if k in lora_bias_names:
+                to_return[k] = t
+    else:
+        raise NotImplementedError
+    to_return = {k: maybe_zero_3(v) for k, v in to_return.items()}
+    return to_return
+
+
+local_rank = None
+
+def rank0_print(*args):
+    if local_rank == 0:
+        print(*args)
+
+
+def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str, bias="none"):
+    """Collects the state dict and dumps it to disk."""
+    # check if zero3 mode enabled
+    if deepspeed.is_deepspeed_zero3_enabled():
+        state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict()
+    else:
+        if trainer.args.use_lora:
+            state_dict = get_peft_state_maybe_zero_3(
+                trainer.model.named_parameters(), bias
+            )
+        else:
+            state_dict = trainer.model.state_dict()
+    if trainer.args.should_save and trainer.args.local_rank == 0:
+        trainer._save(output_dir, state_dict=state_dict)
+
+
+def preprocess(
+    sources,
+    tokenizer: transformers.PreTrainedTokenizer,
+    max_len: int,
+    system_message: str = "You are a helpful assistant."
+) -> Dict:
+    roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}
+
+    im_start = tokenizer.im_start_id
+    im_end = tokenizer.im_end_id
+    nl_tokens = tokenizer('\n').input_ids
+    _system = tokenizer('system').input_ids + nl_tokens
+    _user = tokenizer('user').input_ids + nl_tokens
+    _assistant = tokenizer('assistant').input_ids + nl_tokens
+
+    # Apply prompt templates
+    input_ids, targets = [], []
+    for i, source in enumerate(sources):
+        if roles[source[0]["from"]] != roles["user"]:
+            source = source[1:]
+
+        input_id, target = [], []
+        system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens
+        input_id += system
+        target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens
+        assert len(input_id) == len(target)
+        for j, sentence in enumerate(source):
+            role = roles[sentence["from"]]
+            _input_id = tokenizer(role).input_ids + nl_tokens + \
+                tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens
+            input_id += _input_id
+            if role == '<|im_start|>user':
+                _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens
+            elif role == '<|im_start|>assistant':
+                _target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \
+                    _input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens
+            else:
+                raise NotImplementedError
+            target += _target
+        assert len(input_id) == len(target)
+        input_id += [tokenizer.pad_token_id] * (max_len - len(input_id))
+        target += [IGNORE_TOKEN_ID] * (max_len - len(target))
+        input_ids.append(input_id[:max_len])
+        targets.append(target[:max_len])
+    input_ids = torch.tensor(input_ids, dtype=torch.int)
+    targets = torch.tensor(targets, dtype=torch.int)
+
+    return dict(
+        input_ids=input_ids,
+        labels=targets,
+        attention_mask=input_ids.ne(tokenizer.pad_token_id),
+    )
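
For reference, preprocess() above tokenizes each conversation into the ChatML layout used by Qwen chat models, and sets labels to IGNORE_TOKEN_ID everywhere except the assistant replies and end-of-turn markers, so the loss is only computed on the model's side of the dialogue. For the single sample in data.json, the rendered text looks like this sketch (an illustration, not part of the patch):

    # What the tokenizer effectively sees for data.json's "identity_0" sample:
    text = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        "<|im_start|>user\n你好<|im_end|>\n"
        "<|im_start|>assistant\n我是一个语言模型,我叫通义千问。<|im_end|>\n"
    )
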
+
+
+class SupervisedDataset(Dataset):
+    """Dataset for supervised fine-tuning."""
+
+    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int):
+        super(SupervisedDataset, self).__init__()
+
+        rank0_print("Formatting inputs...")
+        sources = [example["conversations"] for example in raw_data]
+        data_dict = preprocess(sources, tokenizer, max_len)
+
+        self.input_ids = data_dict["input_ids"]
+        self.labels = data_dict["labels"]
+        self.attention_mask = data_dict["attention_mask"]
+
+    def __len__(self):
+        return len(self.input_ids)
+
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        return dict(
+            input_ids=self.input_ids[i],
+            labels=self.labels[i],
+            attention_mask=self.attention_mask[i],
+        )
+
+
+class LazySupervisedDataset(Dataset):
+    """Dataset for supervised fine-tuning."""
+
+    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int):
+        super(LazySupervisedDataset, self).__init__()
+        self.tokenizer = tokenizer
+        self.max_len = max_len
+
+        rank0_print("Formatting inputs...Skip in lazy mode")
+        self.tokenizer = tokenizer
+        self.raw_data = raw_data
+        self.cached_data_dict = {}
+
+    def __len__(self):
+        return len(self.raw_data)
+
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        if i in self.cached_data_dict:
+            return self.cached_data_dict[i]
+
+        ret = preprocess([self.raw_data[i]["conversations"]], self.tokenizer, self.max_len)
+        ret = dict(
+            input_ids=ret["input_ids"][0],
+            labels=ret["labels"][0],
+            attention_mask=ret["attention_mask"][0],
+        )
+        self.cached_data_dict[i] = ret
+
+        return ret
+
+
+def make_supervised_data_module(
+    tokenizer: transformers.PreTrainedTokenizer, data_args, max_len,
+) -> Dict:
+    """Make dataset and collator for supervised fine-tuning."""
+    dataset_cls = (
+        LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset
+    )
+    rank0_print("Loading data...")
+
+    train_json = json.load(open(data_args.data_path, "r"))
+    train_dataset = dataset_cls(train_json, tokenizer=tokenizer, max_len=max_len)
+
+    if data_args.eval_data_path:
+        eval_json = json.load(open(data_args.eval_data_path, "r"))
+        eval_dataset = dataset_cls(eval_json, tokenizer=tokenizer, max_len=max_len)
+    else:
+        eval_dataset = None
+
+    return dict(train_dataset=train_dataset, eval_dataset=eval_dataset)
+
+
+def train():
+    global local_rank
+
+    parser = transformers.HfArgumentParser(
+        (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
+    )
+    (
+        model_args,
+        data_args,
+        training_args,
+        lora_args,
+    ) = parser.parse_args_into_dataclasses()
+
+    # This serves for single-gpu qlora.
+    if getattr(training_args, 'deepspeed', None) and int(os.environ.get("WORLD_SIZE", 1))==1:
+        training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED
+
+    local_rank = training_args.local_rank
+
+    device_map = "auto"
+    world_size = int(os.environ.get("WORLD_SIZE", 1))
+    ddp = world_size != 1
+    if lora_args.q_lora:
+        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else "auto"
+        if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled():
+            logging.warning(
+                "FSDP or ZeRO3 is incompatible with QLoRA."
+            )
+
+    model_dir = snapshot_download(model_args.model_name_or_path)
+
+    # Set RoPE scaling factor
+    config = transformers.AutoConfig.from_pretrained(
+        model_dir,
+        cache_dir=training_args.cache_dir,
+        trust_remote_code=True,
+    )
+    config.use_cache = False
+
+    # Load model and tokenizer
+
+
+    model = transformers.AutoModelForCausalLM.from_pretrained(
+        model_dir,
+        config=config,
+        cache_dir=training_args.cache_dir,
+        device_map=device_map,
+        trust_remote_code=True,
+        quantization_config=GPTQConfig(
+            bits=4, disable_exllama=True
+        )
+        if training_args.use_lora and lora_args.q_lora
+        else None,
+    )
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        model_dir,
+        cache_dir=training_args.cache_dir,
+        model_max_length=training_args.model_max_length,
+        padding_side="right",
+        use_fast=False,
+        trust_remote_code=True,
+    )
+    tokenizer.pad_token_id = tokenizer.eod_id
+
+    if training_args.use_lora:
+        if lora_args.q_lora or 'chat' in model_dir.lower():
+            modules_to_save = None
+        else:
+            modules_to_save = ["wte", "lm_head"]
+        lora_config = LoraConfig(
+            r=lora_args.lora_r,
+            lora_alpha=lora_args.lora_alpha,
+            target_modules=lora_args.lora_target_modules,
+            lora_dropout=lora_args.lora_dropout,
+            bias=lora_args.lora_bias,
+            task_type="CAUSAL_LM",
+            modules_to_save=modules_to_save  # This argument serves for adding new tokens.
+        )
+        if lora_args.q_lora:
+            model = prepare_model_for_kbit_training(
+                model, use_gradient_checkpointing=training_args.gradient_checkpointing
+            )
+
+        model = get_peft_model(model, lora_config)
+
+        # Print peft trainable params
+        model.print_trainable_parameters()
+
+        if training_args.gradient_checkpointing:
+            model.enable_input_require_grads()
+
+    # Load data
+    data_module = make_supervised_data_module(
+        tokenizer=tokenizer, data_args=data_args, max_len=training_args.model_max_length
+    )
+
+    # Start trainer
+    trainer = Trainer(
+        model=model, tokenizer=tokenizer, args=training_args, **data_module
+    )
+
+    trainer.train()
+    trainer.save_state()
+
+    safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias)
+
+
+if __name__ == "__main__":
+    train()
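
Once training finishes, safe_save_model_for_hf_trainer() leaves a checkpoint in output_qwen/. For a LoRA/QLoRA run that checkpoint is a peft adapter; a sketch of loading it for inference (this assumes the tokenizer was saved alongside by the Trainer, and model.chat is the Qwen remote-code API):

    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer

    model = AutoPeftModelForCausalLM.from_pretrained(
        "output_qwen", device_map="auto", trust_remote_code=True
    ).eval()
    tokenizer = AutoTokenizer.from_pretrained("output_qwen", trust_remote_code=True)
    response, history = model.chat(tokenizer, "你好", history=None)
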
diff --git a/qwen/finetune/finetune_ds.sh b/qwen/finetune/finetune_ds.sh
new file mode 100644
index 0000000..ad28d9e
--- /dev/null
+++ b/qwen/finetune/finetune_ds.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+DIR=`pwd`
+
+# Guide:
+# This script supports distributed training on multi-GPU workers (as well as single-worker training).
+# Please set the options below according to the comments.
+# For multi-GPU worker training, these options should be manually set for each worker.
+# After setting the options, please run the script on each worker.
+
+# Number of GPUs per GPU worker
+GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())')
+
+# Number of GPU workers, for single-worker training, please set to 1
+NNODES=${NNODES:-1}
+
+# The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0
+NODE_RANK=${NODE_RANK:-0}
+
+# The IP address of the rank-0 worker, for single-worker training, please set to localhost
+MASTER_ADDR=${MASTER_ADDR:-localhost}
+
+# The port for communication
+MASTER_PORT=${MASTER_PORT:-6001}
+
+MODEL="Qwen/Qwen-7B" # Set the path if you do not want to load from huggingface directly
+# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
+# See the section for finetuning in README for more information.
+DATA="path_to_data"
+
+function usage() {
+    echo '
+Usage: bash finetune/finetune_ds.sh [-m MODEL_PATH] [-d DATA_PATH]
+'
+}
+
+while [[ "$1" != "" ]]; do
+    case $1 in
+        -m | --model )
+            shift
+            MODEL=$1
+            ;;
+        -d | --data )
+            shift
+            DATA=$1
+            ;;
+        -h | --help )
+            usage
+            exit 0
+            ;;
+        * )
+            echo "Unknown argument ${1}"
+            exit 1
+            ;;
+    esac
+    shift
+done
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+torchrun $DISTRIBUTED_ARGS finetune.py \
+    --model_name_or_path $MODEL \
+    --data_path $DATA \
+    --bf16 True \
+    --output_dir output_qwen \
+    --num_train_epochs 5 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 16 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 1000 \
+    --save_total_limit 10 \
+    --learning_rate 1e-5 \
+    --weight_decay 0.1 \
+    --adam_beta2 0.95 \
+    --warmup_ratio 0.01 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --report_to "none" \
+    --model_max_length 512 \
+    --gradient_checkpointing True \
+    --lazy_preprocess True \
+    --deepspeed finetune/ds_config_zero3.json
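
finetune_ds.sh launches a full-parameter fine-tune through torchrun; the environment variables at the top control multi-node runs. Example invocations (hypothetical paths, assuming the script is run from the qwen/ directory on each worker):

    # single node:
    bash finetune/finetune_ds.sh -m Qwen/Qwen-7B -d /path/to/data.json
    # two nodes, executed once per node with its own NODE_RANK:
    NNODES=2 NODE_RANK=0 MASTER_ADDR=10.0.0.1 MASTER_PORT=6001 \
        bash finetune/finetune_ds.sh -m Qwen/Qwen-7B -d /path/to/data.json
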
+DATA="path_to_data" + +function usage() { + echo ' +Usage: bash finetune/finetune_ds.sh [-m MODEL_PATH] [-d DATA_PATH] +' +} + +while [[ "$1" != "" ]]; do + case $1 in + -m | --model ) + shift + MODEL=$1 + ;; + -d | --data ) + shift + DATA=$1 + ;; + -h | --help ) + usage + exit 0 + ;; + * ) + echo "Unknown argument ${1}" + exit 1 + ;; + esac + shift +done + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +torchrun $DISTRIBUTED_ARGS finetune.py \ + --model_name_or_path $MODEL \ + --data_path $DATA \ + --bf16 True \ + --output_dir output_qwen \ + --num_train_epochs 5 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 16 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 1000 \ + --save_total_limit 10 \ + --learning_rate 1e-5 \ + --weight_decay 0.1 \ + --adam_beta2 0.95 \ + --warmup_ratio 0.01 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --report_to "none" \ + --model_max_length 512 \ + --gradient_checkpointing True \ + --lazy_preprocess True \ + --deepspeed finetune/ds_config_zero3.json diff --git a/qwen/finetune/finetune_lora_ds.sh b/qwen/finetune/finetune_lora_ds.sh new file mode 100644 index 0000000..7b7d514 --- /dev/null +++ b/qwen/finetune/finetune_lora_ds.sh @@ -0,0 +1,96 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 +DIR=`pwd` + +# Guide: +# This script supports distributed training on multi-gpu workers (as well as single-worker training). +# Please set the options below according to the comments. +# For multi-gpu workers training, these options should be manually set for each worker. +# After setting the options, please run the script on each worker. + +# Number of GPUs per GPU worker +GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())') + +# Number of GPU workers, for single-worker training, please set to 1 +NNODES=${NNODES:-1} + +# The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0 +NODE_RANK=${NODE_RANK:-0} + +# The ip address of the rank-0 worker, for single-worker training, please set to localhost +MASTER_ADDR=${MASTER_ADDR:localhost} + +# The port for communication +MASTER_PORT=${MASTER_PORT:-6001} + +MODEL="Qwen/Qwen-7B" # Set the path if you do not want to load from huggingface directly +# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. +# See the section for finetuning in README for more information. 
+DATA="path_to_data" +DS_CONFIG_PATH="finetune/ds_config_zero2.json" + +function usage() { + echo ' +Usage: bash finetune/finetune_lora_ds.sh [-m MODEL_PATH] [-d DATA_PATH] [--deepspeed DS_CONFIG_PATH] +' +} + +while [[ "$1" != "" ]]; do + case $1 in + -m | --model ) + shift + MODEL=$1 + ;; + -d | --data ) + shift + DATA=$1 + ;; + --deepspeed ) + shift + DS_CONFIG_PATH=$1 + ;; + -h | --help ) + usage + exit 0 + ;; + * ) + echo "Unknown argument ${1}" + exit 1 + ;; + esac + shift +done + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +torchrun $DISTRIBUTED_ARGS finetune.py \ + --model_name_or_path $MODEL \ + --data_path $DATA \ + --bf16 True \ + --output_dir output_qwen \ + --num_train_epochs 5 \ + --per_device_train_batch_size 2 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 1000 \ + --save_total_limit 10 \ + --learning_rate 3e-4 \ + --weight_decay 0.1 \ + --adam_beta2 0.95 \ + --warmup_ratio 0.01 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --report_to "none" \ + --model_max_length 512 \ + --lazy_preprocess True \ + --use_lora \ + --gradient_checkpointing \ + --deepspeed ${DS_CONFIG_PATH} diff --git a/qwen/finetune/finetune_lora_single_gpu.sh b/qwen/finetune/finetune_lora_single_gpu.sh new file mode 100644 index 0000000..877e5c4 --- /dev/null +++ b/qwen/finetune/finetune_lora_single_gpu.sh @@ -0,0 +1,65 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +MODEL="qwen/Qwen-1_8B-Chat" # Set the path if you do not want to load from huggingface directly +# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. +# See the section for finetuning in README for more information. +DATA="data.json" + +function usage() { + echo ' +Usage: bash finetune/finetune_lora_single_gpu.sh [-m MODEL_PATH] [-d DATA_PATH] +' +} + +while [[ "$1" != "" ]]; do + case $1 in + -m | --model ) + shift + MODEL=$1 + ;; + -d | --data ) + shift + DATA=$1 + ;; + -h | --help ) + usage + exit 0 + ;; + * ) + echo "Unknown argument ${1}" + exit 1 + ;; + esac + shift +done + +export CUDA_VISIBLE_DEVICES=0 + +python finetune.py \ + --model_name_or_path $MODEL \ + --data_path $DATA \ + --bf16 False \ + --output_dir output_qwen \ + --num_train_epochs 5 \ + --per_device_train_batch_size 2 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 1000 \ + --save_total_limit 10 \ + --learning_rate 3e-4 \ + --weight_decay 0.1 \ + --adam_beta2 0.95 \ + --warmup_ratio 0.01 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --report_to "none" \ + --model_max_length 512 \ + --lazy_preprocess True \ + --gradient_checkpointing \ + --use_lora + +# If you use fp16 instead of bf16, you should use deepspeed +# --fp16 True --deepspeed finetune/ds_config_zero2.json diff --git a/qwen/finetune/finetune_qlora_ds.sh b/qwen/finetune/finetune_qlora_ds.sh new file mode 100644 index 0000000..1bd4698 --- /dev/null +++ b/qwen/finetune/finetune_qlora_ds.sh @@ -0,0 +1,93 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 +DIR=`pwd` + +# Guide: +# This script supports distributed training on multi-gpu workers (as well as single-worker training). +# Please set the options below according to the comments. 
diff --git a/qwen/finetune/finetune_qlora_ds.sh b/qwen/finetune/finetune_qlora_ds.sh
new file mode 100644
index 0000000..1bd4698
--- /dev/null
+++ b/qwen/finetune/finetune_qlora_ds.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+DIR=`pwd`
+
+# Guide:
+# This script supports distributed training on multi-GPU workers (as well as single-worker training).
+# Please set the options below according to the comments.
+# For multi-GPU worker training, these options should be manually set for each worker.
+# After setting the options, please run the script on each worker.
+
+# Number of GPUs per GPU worker
+GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())')
+
+# Number of GPU workers, for single-worker training, please set to 1
+NNODES=${NNODES:-1}
+
+# The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0
+NODE_RANK=${NODE_RANK:-0}
+
+# The IP address of the rank-0 worker, for single-worker training, please set to localhost
+MASTER_ADDR=${MASTER_ADDR:-localhost}
+
+# The port for communication
+MASTER_PORT=${MASTER_PORT:-6001}
+
+MODEL="Qwen/Qwen-7B-Chat-Int4" # Set the path if you do not want to load from huggingface directly
+# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
+# See the section for finetuning in README for more information.
+DATA="path_to_data"
+
+function usage() {
+    echo '
+Usage: bash finetune/finetune_qlora_ds.sh [-m MODEL_PATH] [-d DATA_PATH]
+'
+}
+
+while [[ "$1" != "" ]]; do
+    case $1 in
+        -m | --model )
+            shift
+            MODEL=$1
+            ;;
+        -d | --data )
+            shift
+            DATA=$1
+            ;;
+        -h | --help )
+            usage
+            exit 0
+            ;;
+        * )
+            echo "Unknown argument ${1}"
+            exit 1
+            ;;
+    esac
+    shift
+done
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+# Remember to use --fp16 instead of --bf16 due to autogptq
+torchrun $DISTRIBUTED_ARGS finetune.py \
+    --model_name_or_path $MODEL \
+    --data_path $DATA \
+    --fp16 True \
+    --output_dir output_qwen \
+    --num_train_epochs 5 \
+    --per_device_train_batch_size 2 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 8 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 1000 \
+    --save_total_limit 10 \
+    --learning_rate 3e-4 \
+    --weight_decay 0.1 \
+    --adam_beta2 0.95 \
+    --warmup_ratio 0.01 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --report_to "none" \
+    --model_max_length 512 \
+    --lazy_preprocess True \
+    --use_lora \
+    --q_lora \
+    --gradient_checkpointing \
+    --deepspeed finetune/ds_config_zero2.json
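
QLoRA trains adapters on top of a GPTQ-quantized base, so the model must be an Int4 checkpoint (Qwen-7B-Chat-Int4 by default) and, as the comment notes, precision has to be fp16 rather than bf16 because of auto-gptq. A usage sketch (hypothetical data path):

    bash finetune/finetune_qlora_ds.sh -m Qwen/Qwen-7B-Chat-Int4 -d /path/to/data.json
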
+DATA="path_to_data" + +function usage() { + echo ' +Usage: bash finetune/finetune_qlora_single_gpu.sh [-m MODEL_PATH] [-d DATA_PATH] +' +} + +while [[ "$1" != "" ]]; do + case $1 in + -m | --model ) + shift + MODEL=$1 + ;; + -d | --data ) + shift + DATA=$1 + ;; + -h | --help ) + usage + exit 0 + ;; + * ) + echo "Unknown argument ${1}" + exit 1 + ;; + esac + shift +done + +export CUDA_VISIBLE_DEVICES=0 + +# Remember to use --fp16 instead of --bf16 due to autogptq +python finetune.py \ + --model_name_or_path $MODEL \ + --data_path $DATA \ + --fp16 True \ + --output_dir output_qwen \ + --num_train_epochs 5 \ + --per_device_train_batch_size 2 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 1000 \ + --save_total_limit 10 \ + --learning_rate 3e-4 \ + --weight_decay 0.1 \ + --adam_beta2 0.95 \ + --warmup_ratio 0.01 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --report_to "none" \ + --model_max_length 512 \ + --lazy_preprocess True \ + --gradient_checkpointing \ + --use_lora \ + --q_lora \ + --deepspeed finetune/ds_config_zero2.json diff --git a/test/dataset.py b/test/dataset.py new file mode 100644 index 0000000..165aae3 --- /dev/null +++ b/test/dataset.py @@ -0,0 +1,7 @@ +from datasets import load_dataset + + +dataset = load_dataset("BAAI/COIG") + +d = dataset["Default"][0] +dataset \ No newline at end of file