Remove return_dict config. Remove unused files.

parent 90cb0fe236
commit a8f2fbbff5

qwen/demo.py | 10
qwen/demo.py
@@ -14,7 +14,7 @@ model_dir = snapshot_download("qwen/Qwen-1_8B-Chat")
 # model_dir = "/home/colin/.cache/modelscope/hub/qwen/Qwen-1_8B-Chat"
 
 config, kwargs = AutoConfig.from_pretrained(
-    model_dir,
+    "./",
     return_unused_kwargs=True,
     trust_remote_code=True,
     code_revision=None,
@@ -25,15 +25,15 @@ model = QWenLMHeadModel(config)
 
 tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
 model = model.from_pretrained(
-    model_dir, device_map="auto", trust_remote_code=True
+    model_dir, config=config, device_map="auto", trust_remote_code=True
 ).train()
 # model.train()
 # model.zero_grad()
 
 # You can specify different generation lengths, top_p, and other related hyperparameters
-model.generation_config = GenerationConfig.from_pretrained(
-    model_dir, trust_remote_code=True
-)
+# model.generation_config = GenerationConfig.from_pretrained(
+#     model_dir, trust_remote_code=True
+# )
 
 # First round of dialogue
 response, history = model.chat(tokenizer, "你好", history=None)
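Note on the first hunk: with return_unused_kwargs=True, AutoConfig.from_pretrained returns a (config, unused_kwargs) pair rather than a bare config, which is why demo.py unpacks two values. A minimal sketch of the pattern, assuming "./" holds a local copy of the qwen/Qwen-1_8B-Chat config.json and its remote modeling code:

    from transformers import AutoConfig

    # return_unused_kwargs=True makes from_pretrained hand back any kwargs
    # the config class did not consume, instead of dropping them silently.
    config, unused = AutoConfig.from_pretrained(
        "./",                    # assumed: local directory with config + remote code
        return_unused_kwargs=True,
        trust_remote_code=True,  # QWen is a custom architecture, so remote code must be trusted
        code_revision=None,
    )
    print(type(config).__name__, unused)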
finetune/ds_config_zero2.json (deleted)
@@ -1,52 +0,0 @@
-{
-    "fp16": {
-        "enabled": "auto",
-        "loss_scale": 0,
-        "loss_scale_window": 1000,
-        "initial_scale_power": 16,
-        "hysteresis": 2,
-        "min_loss_scale": 1
-    },
-    "bf16": {
-        "enabled": "auto"
-    },
-    "optimizer": {
-        "type": "AdamW",
-        "params": {
-            "lr": "auto",
-            "betas": "auto",
-            "eps": "auto",
-            "weight_decay": "auto"
-        }
-    },
-
-    "scheduler": {
-        "type": "WarmupLR",
-        "params": {
-            "warmup_min_lr": "auto",
-            "warmup_max_lr": "auto",
-            "warmup_num_steps": "auto"
-        }
-    },
-
-    "zero_optimization": {
-        "stage": 2,
-        "offload_optimizer": {
-            "device": "none",
-            "pin_memory": true
-        },
-        "allgather_partitions": true,
-        "allgather_bucket_size": 2e8,
-        "overlap_comm": true,
-        "reduce_scatter": true,
-        "reduce_bucket_size": 2e8,
-        "contiguous_gradients": true
-    },
-
-    "gradient_accumulation_steps": "auto",
-    "gradient_clipping": "auto",
-    "steps_per_print": 100,
-    "train_batch_size": "auto",
-    "train_micro_batch_size_per_gpu": "auto",
-    "wall_clock_breakdown": false
-}
finetune/ds_config_zero3.json (deleted)
@@ -1,59 +0,0 @@
-{
-    "fp16": {
-        "enabled": "auto",
-        "loss_scale": 0,
-        "loss_scale_window": 1000,
-        "initial_scale_power": 16,
-        "hysteresis": 2,
-        "min_loss_scale": 1
-    },
-    "bf16": {
-        "enabled": "auto"
-    },
-    "optimizer": {
-        "type": "AdamW",
-        "params": {
-            "lr": "auto",
-            "betas": "auto",
-            "eps": "auto",
-            "weight_decay": "auto"
-        }
-    },
-
-    "scheduler": {
-        "type": "WarmupLR",
-        "params": {
-            "warmup_min_lr": "auto",
-            "warmup_max_lr": "auto",
-            "warmup_num_steps": "auto"
-        }
-    },
-
-    "zero_optimization": {
-        "stage": 3,
-        "offload_optimizer": {
-            "device": "none",
-            "pin_memory": true
-        },
-        "offload_param": {
-            "device": "none",
-            "pin_memory": true
-        },
-        "overlap_comm": true,
-        "contiguous_gradients": true,
-        "sub_group_size": 1e9,
-        "reduce_bucket_size": "auto",
-        "stage3_prefetch_bucket_size": "auto",
-        "stage3_param_persistence_threshold": "auto",
-        "stage3_max_live_parameters": 1e9,
-        "stage3_max_reuse_distance": 1e9,
-        "stage3_gather_16bit_weights_on_model_save": true
-    },
-
-    "gradient_accumulation_steps": "auto",
-    "gradient_clipping": "auto",
-    "steps_per_print": 100,
-    "train_batch_size": "auto",
-    "train_micro_batch_size_per_gpu": "auto",
-    "wall_clock_breakdown": false
-}
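The "auto" placeholders in both deleted configs are not tuning values; they are resolved by HuggingFace's Trainer from its TrainingArguments when the JSON is passed through the deepspeed option, so the files only pin down the ZeRO stage, offload targets, and bucket sizes. A minimal sketch of that wiring, with the model and dataset assumed to exist:

    from transformers import Trainer, TrainingArguments

    # Each "auto" field (lr, betas, batch sizes, warmup steps, ...) is filled
    # in from these arguments when the Trainer initializes DeepSpeed.
    args = TrainingArguments(
        output_dir="output_qwen",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16,
        learning_rate=1e-5,
        bf16=True,
        deepspeed="finetune/ds_config_zero3.json",  # one of the files removed above
    )
    Trainer(model=model, args=args, train_dataset=train_dataset).train()  # model/dataset assumed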
finetune/finetune_ds.sh (deleted)
@@ -1,90 +0,0 @@
-#!/bin/bash
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-DIR=`pwd`
-
-# Guide:
-# This script supports distributed training on multi-gpu workers (as well as single-worker training).
-# Please set the options below according to the comments.
-# For multi-gpu workers training, these options should be manually set for each worker.
-# After setting the options, please run the script on each worker.
-
-# Number of GPUs per GPU worker
-GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())')
-
-# Number of GPU workers, for single-worker training, please set to 1
-NNODES=${NNODES:-1}
-
-# The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0
-NODE_RANK=${NODE_RANK:-0}
-
-# The ip address of the rank-0 worker, for single-worker training, please set to localhost
-MASTER_ADDR=${MASTER_ADDR:localhost}
-
-# The port for communication
-MASTER_PORT=${MASTER_PORT:-6001}
-
-MODEL="Qwen/Qwen-7B" # Set the path if you do not want to load from huggingface directly
-# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
-# See the section for finetuning in README for more information.
-DATA="path_to_data"
-
-function usage() {
-    echo '
-Usage: bash finetune/finetune_ds.sh [-m MODEL_PATH] [-d DATA_PATH]
-'
-}
-
-while [[ "$1" != "" ]]; do
-    case $1 in
-        -m | --model )
-            shift
-            MODEL=$1
-            ;;
-        -d | --data )
-            shift
-            DATA=$1
-            ;;
-        -h | --help )
-            usage
-            exit 0
-            ;;
-        * )
-            echo "Unknown argument ${1}"
-            exit 1
-            ;;
-    esac
-    shift
-done
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-torchrun $DISTRIBUTED_ARGS finetune.py \
-    --model_name_or_path $MODEL \
-    --data_path $DATA \
-    --bf16 True \
-    --output_dir output_qwen \
-    --num_train_epochs 5 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 16 \
-    --evaluation_strategy "no" \
-    --save_strategy "steps" \
-    --save_steps 1000 \
-    --save_total_limit 10 \
-    --learning_rate 1e-5 \
-    --weight_decay 0.1 \
-    --adam_beta2 0.95 \
-    --warmup_ratio 0.01 \
-    --lr_scheduler_type "cosine" \
-    --logging_steps 1 \
-    --report_to "none" \
-    --model_max_length 512 \
-    --gradient_checkpointing True \
-    --lazy_preprocess True \
-    --deepspeed finetune/ds_config_zero3.json
finetune/finetune_lora_ds.sh (deleted)
@@ -1,96 +0,0 @@
-#!/bin/bash
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-DIR=`pwd`
-
-# Guide:
-# This script supports distributed training on multi-gpu workers (as well as single-worker training).
-# Please set the options below according to the comments.
-# For multi-gpu workers training, these options should be manually set for each worker.
-# After setting the options, please run the script on each worker.
-
-# Number of GPUs per GPU worker
-GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())')
-
-# Number of GPU workers, for single-worker training, please set to 1
-NNODES=${NNODES:-1}
-
-# The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0
-NODE_RANK=${NODE_RANK:-0}
-
-# The ip address of the rank-0 worker, for single-worker training, please set to localhost
-MASTER_ADDR=${MASTER_ADDR:localhost}
-
-# The port for communication
-MASTER_PORT=${MASTER_PORT:-6001}
-
-MODEL="Qwen/Qwen-7B" # Set the path if you do not want to load from huggingface directly
-# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
-# See the section for finetuning in README for more information.
-DATA="path_to_data"
-DS_CONFIG_PATH="finetune/ds_config_zero2.json"
-
-function usage() {
-    echo '
-Usage: bash finetune/finetune_lora_ds.sh [-m MODEL_PATH] [-d DATA_PATH] [--deepspeed DS_CONFIG_PATH]
-'
-}
-
-while [[ "$1" != "" ]]; do
-    case $1 in
-        -m | --model )
-            shift
-            MODEL=$1
-            ;;
-        -d | --data )
-            shift
-            DATA=$1
-            ;;
-        --deepspeed )
-            shift
-            DS_CONFIG_PATH=$1
-            ;;
-        -h | --help )
-            usage
-            exit 0
-            ;;
-        * )
-            echo "Unknown argument ${1}"
-            exit 1
-            ;;
-    esac
-    shift
-done
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-torchrun $DISTRIBUTED_ARGS finetune.py \
-    --model_name_or_path $MODEL \
-    --data_path $DATA \
-    --bf16 True \
-    --output_dir output_qwen \
-    --num_train_epochs 5 \
-    --per_device_train_batch_size 2 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --evaluation_strategy "no" \
-    --save_strategy "steps" \
-    --save_steps 1000 \
-    --save_total_limit 10 \
-    --learning_rate 3e-4 \
-    --weight_decay 0.1 \
-    --adam_beta2 0.95 \
-    --warmup_ratio 0.01 \
-    --lr_scheduler_type "cosine" \
-    --logging_steps 1 \
-    --report_to "none" \
-    --model_max_length 512 \
-    --lazy_preprocess True \
-    --use_lora \
-    --gradient_checkpointing \
-    --deepspeed ${DS_CONFIG_PATH}
finetune/finetune_qlora_ds.sh (deleted)
@@ -1,93 +0,0 @@
-#!/bin/bash
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-DIR=`pwd`
-
-# Guide:
-# This script supports distributed training on multi-gpu workers (as well as single-worker training).
-# Please set the options below according to the comments.
-# For multi-gpu workers training, these options should be manually set for each worker.
-# After setting the options, please run the script on each worker.
-
-# Number of GPUs per GPU worker
-GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())')
-
-# Number of GPU workers, for single-worker training, please set to 1
-NNODES=${NNODES:-1}
-
-# The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0
-NODE_RANK=${NODE_RANK:-0}
-
-# The ip address of the rank-0 worker, for single-worker training, please set to localhost
-MASTER_ADDR=${MASTER_ADDR:localhost}
-
-# The port for communication
-MASTER_PORT=${MASTER_PORT:-6001}
-
-MODEL="Qwen/Qwen-7B-Chat-Int4" # Set the path if you do not want to load from huggingface directly
-# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
-# See the section for finetuning in README for more information.
-DATA="path_to_data"
-
-function usage() {
-    echo '
-Usage: bash finetune/finetune_qlora_ds.sh [-m MODEL_PATH] [-d DATA_PATH]
-'
-}
-
-while [[ "$1" != "" ]]; do
-    case $1 in
-        -m | --model )
-            shift
-            MODEL=$1
-            ;;
-        -d | --data )
-            shift
-            DATA=$1
-            ;;
-        -h | --help )
-            usage
-            exit 0
-            ;;
-        * )
-            echo "Unknown argument ${1}"
-            exit 1
-            ;;
-    esac
-    shift
-done
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-# Remember to use --fp16 instead of --bf16 due to autogptq
-torchrun $DISTRIBUTED_ARGS finetune.py \
-    --model_name_or_path $MODEL \
-    --data_path $DATA \
-    --fp16 True \
-    --output_dir output_qwen \
-    --num_train_epochs 5 \
-    --per_device_train_batch_size 2 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --evaluation_strategy "no" \
-    --save_strategy "steps" \
-    --save_steps 1000 \
-    --save_total_limit 10 \
-    --learning_rate 3e-4 \
-    --weight_decay 0.1 \
-    --adam_beta2 0.95 \
-    --warmup_ratio 0.01 \
-    --lr_scheduler_type "cosine" \
-    --logging_steps 1 \
-    --report_to "none" \
-    --model_max_length 512 \
-    --lazy_preprocess True \
-    --use_lora \
-    --q_lora \
-    --gradient_checkpointing \
-    --deepspeed finetune/ds_config_zero2.json
finetune/finetune_qlora_single_gpu.sh (deleted)
@@ -1,66 +0,0 @@
-#!/bin/bash
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-DIR=`pwd`
-
-MODEL="Qwen/Qwen-7B-Chat-Int4" # Set the path if you do not want to load from huggingface directly
-# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
-# See the section for finetuning in README for more information.
-DATA="path_to_data"
-
-function usage() {
-    echo '
-Usage: bash finetune/finetune_qlora_single_gpu.sh [-m MODEL_PATH] [-d DATA_PATH]
-'
-}
-
-while [[ "$1" != "" ]]; do
-    case $1 in
-        -m | --model )
-            shift
-            MODEL=$1
-            ;;
-        -d | --data )
-            shift
-            DATA=$1
-            ;;
-        -h | --help )
-            usage
-            exit 0
-            ;;
-        * )
-            echo "Unknown argument ${1}"
-            exit 1
-            ;;
-    esac
-    shift
-done
-
-export CUDA_VISIBLE_DEVICES=0
-
-# Remember to use --fp16 instead of --bf16 due to autogptq
-python finetune.py \
-  --model_name_or_path $MODEL \
-  --data_path $DATA \
-  --fp16 True \
-  --output_dir output_qwen \
-  --num_train_epochs 5 \
-  --per_device_train_batch_size 2 \
-  --per_device_eval_batch_size 1 \
-  --gradient_accumulation_steps 8 \
-  --evaluation_strategy "no" \
-  --save_strategy "steps" \
-  --save_steps 1000 \
-  --save_total_limit 10 \
-  --learning_rate 3e-4 \
-  --weight_decay 0.1 \
-  --adam_beta2 0.95 \
-  --warmup_ratio 0.01 \
-  --lr_scheduler_type "cosine" \
-  --logging_steps 1 \
-  --report_to "none" \
-  --model_max_length 512 \
-  --lazy_preprocess True \
-  --gradient_checkpointing \
-  --use_lora \
-  --q_lora \
-  --deepspeed finetune/ds_config_zero2.json
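A quirk preserved verbatim in the three distributed launch scripts above: MASTER_ADDR=${MASTER_ADDR:localhost} omits the dash in the bash default-value expansion, so it never actually falls back to localhost; the intended form is ${MASTER_ADDR:-localhost}, matching the correct ${MASTER_PORT:-6001} on the following line.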
qwen/modeling_qwen.py
@@ -406,8 +406,7 @@ class QWenModel(QWenPreTrainedModel):
         encoder_attention_mask: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None
     ):
         output_attentions = (
             output_attentions
@@ -420,9 +419,6 @@ class QWenModel(QWenPreTrainedModel):
             else self.config.output_hidden_states
         )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
 
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError(
@@ -569,11 +565,6 @@ class QWenModel(QWenPreTrainedModel):
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
 
-        if not return_dict:
-            return tuple(
-                v for v in [hidden_states, presents, all_hidden_states] if v is not None
-            )
-
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
             past_key_values=presents,
@@ -639,11 +630,8 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
+
 
         transformer_outputs = self.transformer(
             input_ids,
@@ -657,8 +645,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             encoder_attention_mask=encoder_attention_mask,
             use_cache=use_cache,
             output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
+            output_hidden_states=output_hidden_states
         )
         hidden_states = transformer_outputs[0]
 
@@ -674,17 +661,13 @@ class QWenLMHeadModel(QWenPreTrainedModel):
                 shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
             )
 
-        shift_labels = torch.ones([1,19]).to(lm_logits.device).to(torch.int64)
-        shift_logits = lm_logits[..., :-1, :].contiguous()
-        loss_fct = CrossEntropyLoss()
-        loss = loss_fct(
-            shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
-        )
-        loss.backward()
-
-        if not return_dict:
-            output = (lm_logits,) + transformer_outputs[1:]
-            return ((loss,) + output) if loss is not None else output
+        # shift_labels = torch.ones([1,19]).to(lm_logits.device).to(torch.int64)
+        # shift_logits = lm_logits[..., :-1, :].contiguous()
+        # loss_fct = CrossEntropyLoss()
+        # loss = loss_fct(
+        #     shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
+        # )
+        # loss.backward()
 
         return CausalLMOutputWithPast(
             loss=loss,
@@ -1197,7 +1180,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             # forward pass to get next token
             outputs = self(
                 **model_inputs,
-                return_dict=True,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
             )
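With return_dict removed from both forward signatures, QWenModel and QWenLMHeadModel now always return BaseModelOutputWithPast / CausalLMOutputWithPast objects and the tuple fallback path is gone. A minimal sketch of what a caller sees after this change, assuming the model and tokenizer built in qwen/demo.py:

    # forward() now unconditionally returns a CausalLMOutputWithPast,
    # so results are read by attribute rather than by tuple index.
    inputs = tokenizer("你好", return_tensors="pt")
    outputs = model(input_ids=inputs["input_ids"])
    logits = outputs.logits            # next-token scores, shape (batch, seq, vocab)
    past = outputs.past_key_values     # cache reusable for incremental decoding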