Remove return_dict config. Remove unused files.
parent 90cb0fe236
commit a8f2fbbff5
					
				
							
								
								
									
Changed: qwen/demo.py (10 lines). The DeepSpeed ZeRO configs and the finetune launcher scripts were deleted outright; their full contents appear in the deletion hunks below.
				
			
qwen/demo.py:

@@ -14,7 +14,7 @@ model_dir = snapshot_download("qwen/Qwen-1_8B-Chat")
 # model_dir = "/home/colin/.cache/modelscope/hub/qwen/Qwen-1_8B-Chat"
 
 config, kwargs = AutoConfig.from_pretrained(
-    model_dir,
+    "./",
     return_unused_kwargs=True,
     trust_remote_code=True,
     code_revision=None,
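The hunk above switches AutoConfig.from_pretrained from the downloaded model_dir to the local directory "./". A minimal sketch of that loading pattern, assuming the working directory holds the checkpoint's config.json and its remote-code files (anything outside the diff is an assumption):

from transformers import AutoConfig

# return_unused_kwargs=True returns the config object plus any kwargs that
# AutoConfig did not consume; trust_remote_code executes the repo's custom
# configuration class.
config, unused_kwargs = AutoConfig.from_pretrained(
    "./",                       # local directory, as in the change above
    return_unused_kwargs=True,
    trust_remote_code=True,
    code_revision=None,
)
print(type(config).__name__, unused_kwargs)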
				
			
@@ -25,15 +25,15 @@ model = QWenLMHeadModel(config)
 
 tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
 model = model.from_pretrained(
-    model_dir, device_map="auto", trust_remote_code=True
+    model_dir, config=config, device_map="auto", trust_remote_code=True
 ).train()
 # model.train()
 # model.zero_grad()
 
 # Different generation lengths, top_p, and other related hyperparameters can be specified
-model.generation_config = GenerationConfig.from_pretrained(
-    model_dir, trust_remote_code=True
-)
+# model.generation_config = GenerationConfig.from_pretrained(
+#     model_dir, trust_remote_code=True
+# )
 
 # First round of dialogue
 response, history = model.chat(tokenizer, "你好", history=None)
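Passing config=config loads the checkpoint weights under a config object the script has already built (and possibly modified), instead of the one shipped with the checkpoint. A rough sketch of the same pattern with the standard Auto classes; the demo itself calls from_pretrained on a QWenLMHeadModel instance, and the config tweak shown here is purely an assumption:

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

model_dir = "qwen/Qwen-1_8B-Chat"   # or a local snapshot path, as in the demo
config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
# Example of editing a field before loading; the exact field is an assumption.
config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_dir, config=config, device_map="auto", trust_remote_code=True
).train()   # .train() switches on training-mode behaviour, matching the demo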
				
			
Deleted file: DeepSpeed ZeRO stage-2 config (referenced by the scripts below as finetune/ds_config_zero2.json).

@@ -1,52 +0,0 @@
-{
-    "fp16": {
-        "enabled": "auto",
-        "loss_scale": 0,
-        "loss_scale_window": 1000,
-        "initial_scale_power": 16,
-        "hysteresis": 2,
-        "min_loss_scale": 1
-    },
-    "bf16": {
-        "enabled": "auto"
-    },
-    "optimizer": {
-        "type": "AdamW",
-        "params": {
-            "lr": "auto",
-            "betas": "auto",
-            "eps": "auto",
-            "weight_decay": "auto"
-        }
-    },
-
-    "scheduler": {
-        "type": "WarmupLR",
-        "params": {
-            "warmup_min_lr": "auto",
-            "warmup_max_lr": "auto",
-            "warmup_num_steps": "auto"
-        }
-    },
-
-    "zero_optimization": {
-        "stage": 2,
-        "offload_optimizer": {
-            "device": "none",
-            "pin_memory": true
-        },
-        "allgather_partitions": true,
-        "allgather_bucket_size": 2e8,
-        "overlap_comm": true,
-        "reduce_scatter": true,
-        "reduce_bucket_size": 2e8,
-        "contiguous_gradients": true
-    },
-
-    "gradient_accumulation_steps": "auto",
-    "gradient_clipping": "auto",
-    "steps_per_print": 100,
-    "train_batch_size": "auto",
-    "train_micro_batch_size_per_gpu": "auto",
-    "wall_clock_breakdown": false
-}
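This deleted JSON is a standard DeepSpeed ZeRO stage-2 configuration in which most values are left as "auto" so that transformers' Trainer can fill them in from its own arguments. A hedged sketch of how such a file is usually wired in (model and dataset setup omitted and assumed to exist elsewhere; the numbers mirror the deleted finetune_ds.sh):

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="output_qwen",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=1e-5,
    bf16=True,
    deepspeed="finetune/ds_config_zero2.json",  # the "auto" fields resolve from these arguments
)
# Trainer(model=model, args=args, train_dataset=train_dataset).train()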
				
			
Deleted file: DeepSpeed ZeRO stage-3 config (referenced by finetune_ds.sh below as finetune/ds_config_zero3.json).

@@ -1,59 +0,0 @@
-{
-    "fp16": {
-        "enabled": "auto",
-        "loss_scale": 0,
-        "loss_scale_window": 1000,
-        "initial_scale_power": 16,
-        "hysteresis": 2,
-        "min_loss_scale": 1
-    },
-    "bf16": {
-        "enabled": "auto"
-    },
-    "optimizer": {
-        "type": "AdamW",
-        "params": {
-            "lr": "auto",
-            "betas": "auto",
-            "eps": "auto",
-            "weight_decay": "auto"
-        }
-    },
-
-    "scheduler": {
-        "type": "WarmupLR",
-        "params": {
-            "warmup_min_lr": "auto",
-            "warmup_max_lr": "auto",
-            "warmup_num_steps": "auto"
-        }
-    },
-
-    "zero_optimization": {
-        "stage": 3,
-        "offload_optimizer": {
-            "device": "none",
-            "pin_memory": true
-        },
-        "offload_param": {
-            "device": "none",
-            "pin_memory": true
-        },
-        "overlap_comm": true,
-        "contiguous_gradients": true,
-        "sub_group_size": 1e9,
-        "reduce_bucket_size": "auto",
-        "stage3_prefetch_bucket_size": "auto",
-        "stage3_param_persistence_threshold": "auto",
-        "stage3_max_live_parameters": 1e9,
-        "stage3_max_reuse_distance": 1e9,
-        "stage3_gather_16bit_weights_on_model_save": true
-    },
-
-    "gradient_accumulation_steps": "auto",
-    "gradient_clipping": "auto",
-    "steps_per_print": 100,
-    "train_batch_size": "auto",
-    "train_micro_batch_size_per_gpu": "auto",
-    "wall_clock_breakdown": false
-}
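The stage-3 variant additionally shards parameters (offload_param, the stage3_* limits, and 16-bit weight gathering on save). A sketch of driving such a config through DeepSpeed directly rather than through Trainer; when used this way every "auto" placeholder has to be replaced with a concrete value, and only a few replacements are shown:

import json

import deepspeed  # assumed to be installed alongside torch

with open("finetune/ds_config_zero3.json") as f:
    ds_config = json.load(f)

# Concrete values below are assumptions standing in for the "auto" placeholders.
ds_config["train_micro_batch_size_per_gpu"] = 1
ds_config["gradient_accumulation_steps"] = 16
ds_config["train_batch_size"] = 1 * 16 * 1   # micro_batch * grad_accum * world_size
ds_config["fp16"]["enabled"] = False
ds_config["bf16"]["enabled"] = True
# ...the optimizer/scheduler params and the remaining "auto" fields need values too.

# engine, optimizer, _, scheduler = deepspeed.initialize(
#     model=model, model_parameters=model.parameters(), config=ds_config
# )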
				
			
Deleted file: finetune/finetune_ds.sh (path from its own usage string).

@@ -1,90 +0,0 @@
-#!/bin/bash
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-DIR=`pwd`
-
-# Guide:
-# This script supports distributed training on multi-gpu workers (as well as single-worker training).
-# Please set the options below according to the comments.
-# For multi-gpu workers training, these options should be manually set for each worker.
-# After setting the options, please run the script on each worker.
-
-# Number of GPUs per GPU worker
-GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())')
-
-# Number of GPU workers, for single-worker training, please set to 1
-NNODES=${NNODES:-1}
-
-# The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0
-NODE_RANK=${NODE_RANK:-0}
-
-# The ip address of the rank-0 worker, for single-worker training, please set to localhost
-MASTER_ADDR=${MASTER_ADDR:localhost}
-
-# The port for communication
-MASTER_PORT=${MASTER_PORT:-6001}
-
-MODEL="Qwen/Qwen-7B" # Set the path if you do not want to load from huggingface directly
-# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
-# See the section for finetuning in README for more information.
-DATA="path_to_data"
-
-function usage() {
-    echo '
-Usage: bash finetune/finetune_ds.sh [-m MODEL_PATH] [-d DATA_PATH]
-'
-}
-
-while [[ "$1" != "" ]]; do
-    case $1 in
-        -m | --model )
-            shift
-            MODEL=$1
-            ;;
-        -d | --data )
-            shift
-            DATA=$1
-            ;;
-        -h | --help )
-            usage
-            exit 0
-            ;;
-        * )
-            echo "Unknown argument ${1}"
-            exit 1
-            ;;
-    esac
-    shift
-done
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-torchrun $DISTRIBUTED_ARGS finetune.py \
-    --model_name_or_path $MODEL \
-    --data_path $DATA \
-    --bf16 True \
-    --output_dir output_qwen \
-    --num_train_epochs 5 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 16 \
-    --evaluation_strategy "no" \
-    --save_strategy "steps" \
-    --save_steps 1000 \
-    --save_total_limit 10 \
-    --learning_rate 1e-5 \
-    --weight_decay 0.1 \
-    --adam_beta2 0.95 \
-    --warmup_ratio 0.01 \
-    --lr_scheduler_type "cosine" \
-    --logging_steps 1 \
-    --report_to "none" \
-    --model_max_length 512 \
-    --gradient_checkpointing True \
-    --lazy_preprocess True \
-    --deepspeed finetune/ds_config_zero3.json
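The deleted launcher builds DISTRIBUTED_ARGS and hands them to torchrun, which exports RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT to every worker process. A small sketch of what the launched training script (finetune.py, not part of this commit) can rely on:

import os

import torch
import torch.distributed as dist

local_rank = int(os.environ.get("LOCAL_RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))

torch.cuda.set_device(local_rank)
dist.init_process_group(backend="nccl")  # MASTER_ADDR/MASTER_PORT are read from the environment

if dist.get_rank() == 0:
    print(f"{world_size} processes, this one on cuda:{local_rank}")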
				
			
Deleted file: finetune/finetune_lora_ds.sh (path from its usage string).

@@ -1,96 +0,0 @@
-#!/bin/bash
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-DIR=`pwd`
-
-# Guide:
-# This script supports distributed training on multi-gpu workers (as well as single-worker training).
-# Please set the options below according to the comments.
-# For multi-gpu workers training, these options should be manually set for each worker.
-# After setting the options, please run the script on each worker.
-
-# Number of GPUs per GPU worker
-GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())')
-
-# Number of GPU workers, for single-worker training, please set to 1
-NNODES=${NNODES:-1}
-
-# The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0
-NODE_RANK=${NODE_RANK:-0}
-
-# The ip address of the rank-0 worker, for single-worker training, please set to localhost
-MASTER_ADDR=${MASTER_ADDR:localhost}
-
-# The port for communication
-MASTER_PORT=${MASTER_PORT:-6001}
-
-MODEL="Qwen/Qwen-7B" # Set the path if you do not want to load from huggingface directly
-# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
-# See the section for finetuning in README for more information.
-DATA="path_to_data"
-DS_CONFIG_PATH="finetune/ds_config_zero2.json"
-
-function usage() {
-    echo '
-Usage: bash finetune/finetune_lora_ds.sh [-m MODEL_PATH] [-d DATA_PATH] [--deepspeed DS_CONFIG_PATH]
-'
-}
-
-while [[ "$1" != "" ]]; do
-    case $1 in
-        -m | --model )
-            shift
-            MODEL=$1
-            ;;
-        -d | --data )
-            shift
-            DATA=$1
-            ;;
-        --deepspeed )
-            shift
-            DS_CONFIG_PATH=$1
-            ;;
-        -h | --help )
-            usage
-            exit 0
-            ;;
-        * )
-            echo "Unknown argument ${1}"
-            exit 1
-            ;;
-    esac
-    shift
-done
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-torchrun $DISTRIBUTED_ARGS finetune.py \
-    --model_name_or_path $MODEL \
-    --data_path $DATA \
-    --bf16 True \
-    --output_dir output_qwen \
-    --num_train_epochs 5 \
-    --per_device_train_batch_size 2 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --evaluation_strategy "no" \
-    --save_strategy "steps" \
-    --save_steps 1000 \
-    --save_total_limit 10 \
-    --learning_rate 3e-4 \
-    --weight_decay 0.1 \
-    --adam_beta2 0.95 \
-    --warmup_ratio 0.01 \
-    --lr_scheduler_type "cosine" \
-    --logging_steps 1 \
-    --report_to "none" \
-    --model_max_length 512 \
-    --lazy_preprocess True \
-    --use_lora \
-    --gradient_checkpointing \
-    --deepspeed ${DS_CONFIG_PATH}
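The LoRA variant differs from the full finetune mainly in the --use_lora flag, the higher learning rate, and the configurable DeepSpeed file. A hedged sketch of the kind of adapter setup that flag points at, using the peft library; the target_modules names are assumptions and must be checked against Qwen's actual module names:

from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)

lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["c_attn", "c_proj"],  # assumption; verify against the checkpoint
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the adapter weights stay trainable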
				
			
Deleted file: finetune/finetune_qlora_ds.sh (path from its usage string).

@@ -1,93 +0,0 @@
-#!/bin/bash
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-DIR=`pwd`
-
-# Guide:
-# This script supports distributed training on multi-gpu workers (as well as single-worker training).
-# Please set the options below according to the comments.
-# For multi-gpu workers training, these options should be manually set for each worker.
-# After setting the options, please run the script on each worker.
-
-# Number of GPUs per GPU worker
-GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())')
-
-# Number of GPU workers, for single-worker training, please set to 1
-NNODES=${NNODES:-1}
-
-# The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0
-NODE_RANK=${NODE_RANK:-0}
-
-# The ip address of the rank-0 worker, for single-worker training, please set to localhost
-MASTER_ADDR=${MASTER_ADDR:localhost}
-
-# The port for communication
-MASTER_PORT=${MASTER_PORT:-6001}
-
-MODEL="Qwen/Qwen-7B-Chat-Int4" # Set the path if you do not want to load from huggingface directly
-# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
-# See the section for finetuning in README for more information.
-DATA="path_to_data"
-
-function usage() {
-    echo '
-Usage: bash finetune/finetune_qlora_ds.sh [-m MODEL_PATH] [-d DATA_PATH]
-'
-}
-
-while [[ "$1" != "" ]]; do
-    case $1 in
-        -m | --model )
-            shift
-            MODEL=$1
-            ;;
-        -d | --data )
-            shift
-            DATA=$1
-            ;;
-        -h | --help )
-            usage
-            exit 0
-            ;;
-        * )
-            echo "Unknown argument ${1}"
-            exit 1
-            ;;
-    esac
-    shift
-done
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-# Remember to use --fp16 instead of --bf16 due to autogptq
-torchrun $DISTRIBUTED_ARGS finetune.py \
-    --model_name_or_path $MODEL \
-    --data_path $DATA \
-    --fp16 True \
-    --output_dir output_qwen \
-    --num_train_epochs 5 \
-    --per_device_train_batch_size 2 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --evaluation_strategy "no" \
-    --save_strategy "steps" \
-    --save_steps 1000 \
-    --save_total_limit 10 \
-    --learning_rate 3e-4 \
-    --weight_decay 0.1 \
-    --adam_beta2 0.95 \
-    --warmup_ratio 0.01 \
-    --lr_scheduler_type "cosine" \
-    --logging_steps 1 \
-    --report_to "none" \
-    --model_max_length 512 \
-    --lazy_preprocess True \
-    --use_lora \
-    --q_lora \
-    --gradient_checkpointing \
-    --deepspeed finetune/ds_config_zero2.json
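The Q-LoRA script additionally points MODEL at the GPTQ-quantized Qwen-7B-Chat-Int4 checkpoint, trains in fp16 (per the autogptq note), and passes --q_lora. A rough sketch of that combination; peft's prepare_model_for_kbit_training is the usual preparation step, but the finetune.py it fed is not part of this commit, so details are assumptions:

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B-Chat-Int4", device_map="auto", trust_remote_code=True
)
model = prepare_model_for_kbit_training(model)  # upcasts norms, enables input grads

model = get_peft_model(
    model,
    LoraConfig(r=64, lora_alpha=16, lora_dropout=0.05,
               target_modules=["c_attn", "c_proj"],  # assumption
               task_type="CAUSAL_LM"),
)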
				
			
Deleted file: finetune/finetune_qlora_single_gpu.sh (path from its usage string).

@@ -1,66 +0,0 @@
-#!/bin/bash
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-DIR=`pwd`
-
-MODEL="Qwen/Qwen-7B-Chat-Int4" # Set the path if you do not want to load from huggingface directly
-# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
-# See the section for finetuning in README for more information.
-DATA="path_to_data"
-
-function usage() {
-    echo '
-Usage: bash finetune/finetune_qlora_single_gpu.sh [-m MODEL_PATH] [-d DATA_PATH]
-'
-}
-
-while [[ "$1" != "" ]]; do
-    case $1 in
-        -m | --model )
-            shift
-            MODEL=$1
-            ;;
-        -d | --data )
-            shift
-            DATA=$1
-            ;;
-        -h | --help )
-            usage
-            exit 0
-            ;;
-        * )
-            echo "Unknown argument ${1}"
-            exit 1
-            ;;
-    esac
-    shift
-done
-
-export CUDA_VISIBLE_DEVICES=0
-
-# Remember to use --fp16 instead of --bf16 due to autogptq
-python finetune.py \
-  --model_name_or_path $MODEL \
-  --data_path $DATA \
-  --fp16 True \
-  --output_dir output_qwen \
-  --num_train_epochs 5 \
-  --per_device_train_batch_size 2 \
-  --per_device_eval_batch_size 1 \
-  --gradient_accumulation_steps 8 \
-  --evaluation_strategy "no" \
-  --save_strategy "steps" \
-  --save_steps 1000 \
-  --save_total_limit 10 \
-  --learning_rate 3e-4 \
-  --weight_decay 0.1 \
-  --adam_beta2 0.95 \
-  --warmup_ratio 0.01 \
-  --lr_scheduler_type "cosine" \
-  --logging_steps 1 \
-  --report_to "none" \
-  --model_max_length 512 \
-  --lazy_preprocess True \
-  --gradient_checkpointing \
-  --use_lora \
-  --q_lora \
-  --deepspeed finetune/ds_config_zero2.json
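The single-GPU variant drops torchrun entirely: it exports CUDA_VISIBLE_DEVICES=0 and runs python finetune.py in one process. For reference, the Python-side equivalent of that device pinning:

import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # must be set before CUDA is initialized

import torch  # imported afterwards so only GPU 0 is visible to this process

assert torch.cuda.device_count() == 1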
				
			
Modeling file (QWenModel / QWenLMHeadModel; its path is not preserved in this extract):

@@ -406,8 +406,7 @@ class QWenModel(QWenPreTrainedModel):
         encoder_attention_mask: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None
     ):
         output_attentions = (
             output_attentions
				
			
@@ -420,9 +419,6 @@ class QWenModel(QWenPreTrainedModel):
             else self.config.output_hidden_states
         )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
 
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError(
				
			
@@ -569,11 +565,6 @@ class QWenModel(QWenPreTrainedModel):
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
 
-        if not return_dict:
-            return tuple(
-                v for v in [hidden_states, presents, all_hidden_states] if v is not None
-            )
-
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
             past_key_values=presents,
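With the return_dict branch gone, this forward always returns a BaseModelOutputWithPast, so callers read results by attribute instead of by tuple position. A hedged sketch of the caller-side view against the modified model in this repo (the loading details mirror the demo and are otherwise assumptions):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "qwen/Qwen-1_8B-Chat"  # or a local snapshot, as in the demo
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True)

inputs = tokenizer("你好", return_tensors="pt")
with torch.no_grad():
    outputs = model.transformer(inputs.input_ids)  # QWenModel.forward

hidden_states = outputs.last_hidden_state  # rather than outputs[0]
past = outputs.past_key_values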
				
			
@@ -639,11 +630,8 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
+
 
         transformer_outputs = self.transformer(
             input_ids,
				
			
@@ -657,8 +645,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             encoder_attention_mask=encoder_attention_mask,
             use_cache=use_cache,
             output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
+            output_hidden_states=output_hidden_states
         )
         hidden_states = transformer_outputs[0]
 
				
			
@@ -674,17 +661,13 @@ class QWenLMHeadModel(QWenPreTrainedModel):
                 shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
             )
 
-        shift_labels = torch.ones([1,19]).to(lm_logits.device).to(torch.int64)
-        shift_logits = lm_logits[..., :-1, :].contiguous()
-        loss_fct = CrossEntropyLoss()
-        loss = loss_fct(
-            shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
-        )
-        loss.backward()
-
-        if not return_dict:
-            output = (lm_logits,) + transformer_outputs[1:]
-            return ((loss,) + output) if loss is not None else output
+        # shift_labels = torch.ones([1,19]).to(lm_logits.device).to(torch.int64)
+        # shift_logits = lm_logits[..., :-1, :].contiguous()
+        # loss_fct = CrossEntropyLoss()
+        # loss = loss_fct(
+        #     shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
+        # )
+        # loss.backward()
 
         return CausalLMOutputWithPast(
             loss=loss,
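The block commented out above computed a cross-entropy loss against hard-coded labels (torch.ones of shape [1, 19]) and even called backward() inside forward. For comparison, the standard way to get the shifted-label language-modeling loss outside the model (a sketch, not code from this repo):

import torch
from torch.nn import CrossEntropyLoss


def causal_lm_loss(lm_logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    # Position t predicts token t+1, so drop the last logit and the first label.
    shift_logits = lm_logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    loss_fct = CrossEntropyLoss()
    return loss_fct(
        shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
    )


# loss = causal_lm_loss(outputs.logits, batch["input_ids"])
# loss.backward()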
				
			
@@ -1197,7 +1180,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             # forward pass to get next token
             outputs = self(
                 **model_inputs,
-                return_dict=True,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
             )