diff --git a/finetune/llamafactory/speed_test b/finetune/llamafactory/speed_test
new file mode 100644
index 0000000..031f96e
--- /dev/null
+++ b/finetune/llamafactory/speed_test
@@ -0,0 +1,8 @@
+
+
+4070tisuper batch=1 bf16=false 1.79s/it
+4070tisuper batch=1 bf16=true 1.8s/it
+4070tisuper batch=4 bf16=true 4.2s/it
+
+v100_32G_PCIE batch=1 bf16=false 5.5s/it
+v100_32G_PCIE batch=1 bf16=true 8.7s/it
diff --git a/finetune/llamafactory/train_qwen3_lora_sft.yaml b/finetune/llamafactory/train_qwen3_lora_sft.yaml
index 8634ee0..a25c2b1 100644
--- a/finetune/llamafactory/train_qwen3_lora_sft.yaml
+++ b/finetune/llamafactory/train_qwen3_lora_sft.yaml
@@ -28,7 +28,7 @@ save_only_model: false
 report_to: tensorboard  # choices: [none, wandb, tensorboard, swanlab, mlflow]
 
 ### train
-per_device_train_batch_size: 4
+per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 1.0e-4
 num_train_epochs: 5.0
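
For reference, the effective batch size per optimizer step is per_device_train_batch_size x gradient_accumulation_steps x number of GPUs, so this change moves it from 4 x 8 to 1 x 8 per device while leaving gradient_accumulation_steps untouched. Below is a minimal sketch of launching the updated config with the standard LLaMA-Factory CLI; the single-GPU device selection and the repo-relative config path are assumptions, not part of the diff.

# Sketch: run LoRA SFT with the updated YAML on one GPU (adjust CUDA_VISIBLE_DEVICES as needed)
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train finetune/llamafactory/train_qwen3_lora_sft.yaml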