Witllm/qwen/finetune_qlora_ds.sh

#!/bin/bash
# Assigned first, then exported so every process started by this script
# inherits the setting.
CUDA_DEVICE_MAX_CONNECTIONS=1
export CUDA_DEVICE_MAX_CONNECTIONS
# Working directory at the moment the script starts.
DIR=$(pwd)

# Guide:
# This script supports distributed training on multi-gpu workers (as well as single-worker training).
# Please set the options below according to the comments.
# For multi-gpu workers training, these options should be manually set for each worker.
# After setting the options, please run the script on each worker.

# Number of GPUs per GPU worker, as reported by PyTorch.
# Abort right away when the probe fails or prints nothing: an empty value would
# otherwise produce a malformed torchrun command line further down.
if ! GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())') \
    || [[ -z "$GPUS_PER_NODE" ]]; then
    echo "Failed to detect the number of GPUs (is PyTorch available to 'python'?)" >&2
    exit 1
fi

# Number of GPU workers; for single-worker training, set to 1.
NNODES=${NNODES:-1}

# Rank of this worker, in {0, ..., NNODES-1}; for single-worker training, set to 0.
NODE_RANK=${NODE_RANK:-0}

# IP address of the rank-0 worker; for single-worker training, set to localhost.
# The ":-" operator is required here: "${MASTER_ADDR:localhost}" is a substring
# expansion and yields an empty string whenever MASTER_ADDR is unset.
MASTER_ADDR=${MASTER_ADDR:-localhost}

# The port for communication.
MASTER_PORT=${MASTER_PORT:-6001}

# ATTENTION: point DATA at your training data, a json file consisting of a list
# of conversations. See the section for finetuning in README for more information.
DATA='path_to_data'
# Default model identifier; replace it with a path if you do not want to load
# from huggingface directly.
MODEL='Qwen/Qwen-7B-Chat-Int4'

# Print the command-line synopsis to stdout, framed by a blank line above and
# below.
function usage() {
    printf '\n%s\n\n' 'Usage: bash finetune/finetune_qlora_ds.sh [-m MODEL_PATH] [-d DATA_PATH]'
}

# Parse the command-line options into the MODEL and DATA globals.
#   -m | --model VALUE   overrides MODEL
#   -d | --data  VALUE   overrides DATA
#   -h | --help          prints the usage text and exits 0
# Exits 1 on an unknown argument or when -m/-d is given without a value.
function parse_args() {
    # Loop on the argument count rather than on "$1" being non-empty, so an
    # empty argument cannot silently cut parsing short.
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -m | --model )
                if [[ -z "${2:-}" ]]; then
                    echo "Option ${1} requires a value" >&2
                    exit 1
                fi
                MODEL=$2
                shift
                ;;
            -d | --data )
                if [[ -z "${2:-}" ]]; then
                    echo "Option ${1} requires a value" >&2
                    exit 1
                fi
                DATA=$2
                shift
                ;;
            -h | --help )
                usage
                exit 0
                ;;
            "" )
                # Empty arguments are skipped; the options after them are
                # still processed.
                ;;
            * )
                echo "Unknown argument ${1}" >&2
                exit 1
                ;;
        esac
        shift
    done
}

parse_args "$@"

# Launcher options for torchrun, kept in an array so that every value is passed
# as exactly one argument regardless of its content.
DISTRIBUTED_ARGS=(
    --nproc_per_node "$GPUS_PER_NODE"
    --nnodes "$NNODES"
    --node_rank "$NODE_RANK"
    # Fall back to localhost so torchrun never receives a blank address.
    --master_addr "${MASTER_ADDR:-localhost}"
    --master_port "$MASTER_PORT"
)

# Remember to use --fp16 instead of --bf16 due to autogptq
# MODEL and DATA are quoted so paths containing spaces stay single arguments.
# torchrun is the last command, so its exit status becomes the script's.
torchrun "${DISTRIBUTED_ARGS[@]}" finetune.py \
    --model_name_or_path "$MODEL" \
    --data_path "$DATA" \
    --fp16 True \
    --output_dir output_qwen \
    --num_train_epochs 5 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 1000 \
    --save_total_limit 10 \
    --learning_rate 3e-4 \
    --weight_decay 0.1 \
    --adam_beta2 0.95 \
    --warmup_ratio 0.01 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --report_to "none" \
    --model_max_length 512 \
    --lazy_preprocess True \
    --use_lora \
    --q_lora \
    --gradient_checkpointing \
    --deepspeed finetune/ds_config_zero2.json
Add finetune 2024-01-04 17:36:41 +08:00			`#!/bin/bash`
			`export CUDA_DEVICE_MAX_CONNECTIONS=1`
			DIR=`pwd`

			`# Guide:`
			`# This script supports distributed training on multi-gpu workers (as well as single-worker training).`
			`# Please set the options below according to the comments.`
			`# For multi-gpu workers training, these options should be manually set for each worker.`
			`# After setting the options, please run the script on each worker.`

			`# Number of GPUs per GPU worker`
			`GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())')`

			`# Number of GPU workers, for single-worker training, please set to 1`
			`NNODES=${NNODES:-1}`

			`# The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0`
			`NODE_RANK=${NODE_RANK:-0}`

			`# The ip address of the rank-0 worker, for single-worker training, please set to localhost`
			`MASTER_ADDR=${MASTER_ADDR:localhost}`

			`# The port for communication`
			`MASTER_PORT=${MASTER_PORT:-6001}`

			`MODEL="Qwen/Qwen-7B-Chat-Int4" # Set the path if you do not want to load from huggingface directly`
			`# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.`
			`# See the section for finetuning in README for more information.`
			`DATA="path_to_data"`

			`function usage() {`
			`echo '`
			`Usage: bash finetune/finetune_qlora_ds.sh [-m MODEL_PATH] [-d DATA_PATH]`
			`'`
			`}`

			`while [[ "$1" != "" ]]; do`
			`case $1 in`
			`-m \| --model )`
			`shift`
			`MODEL=$1`
			`;;`
			`-d \| --data )`
			`shift`
			`DATA=$1`
			`;;`
			`-h \| --help )`
			`usage`
			`exit 0`
			`;;`
			`* )`
			`echo "Unknown argument ${1}"`
			`exit 1`
			`;;`
			`esac`
			`shift`
			`done`

			`DISTRIBUTED_ARGS="`
			`--nproc_per_node $GPUS_PER_NODE \`
			`--nnodes $NNODES \`
			`--node_rank $NODE_RANK \`
			`--master_addr $MASTER_ADDR \`
			`--master_port $MASTER_PORT`
			`"`

			`# Remember to use --fp16 instead of --bf16 due to autogptq`
			`torchrun $DISTRIBUTED_ARGS finetune.py \`
			`--model_name_or_path $MODEL \`
			`--data_path $DATA \`
			`--fp16 True \`
			`--output_dir output_qwen \`
			`--num_train_epochs 5 \`
			`--per_device_train_batch_size 2 \`
			`--per_device_eval_batch_size 1 \`
			`--gradient_accumulation_steps 8 \`
			`--evaluation_strategy "no" \`
			`--save_strategy "steps" \`
			`--save_steps 1000 \`
			`--save_total_limit 10 \`
			`--learning_rate 3e-4 \`
			`--weight_decay 0.1 \`
			`--adam_beta2 0.95 \`
			`--warmup_ratio 0.01 \`
			`--lr_scheduler_type "cosine" \`
			`--logging_steps 1 \`
			`--report_to "none" \`
			`--model_max_length 512 \`
			`--lazy_preprocess True \`
			`--use_lora \`
			`--q_lora \`
			`--gradient_checkpointing \`
			`--deepspeed finetune/ds_config_zero2.json`