diff --git a/finetune/llamafactory/speed_test b/finetune/llamafactory/speed_test
new file mode 100644
index 0000000..031f96e
--- /dev/null
+++ b/finetune/llamafactory/speed_test
@@ -0,0 +1,8 @@
+
+
+4070tisuper batch=1 bf16=false 1.79s/it
+4070tisuper batch=1 bf16=true 1.8s/it
+4070tisuper batch=4 bf16=true 4.2s/it
+
+v100_32G_PCIE batch=1 bf16=false 5.5s/it
+v100_32G_PCIE batch=1 bf16=true 8.7s/it
diff --git a/finetune/llamafactory/train_qwen3_lora_sft.yaml b/finetune/llamafactory/train_qwen3_lora_sft.yaml
index 8634ee0..a25c2b1 100644
--- a/finetune/llamafactory/train_qwen3_lora_sft.yaml
+++ b/finetune/llamafactory/train_qwen3_lora_sft.yaml
@@ -28,7 +28,7 @@ save_only_model: false
 report_to: tensorboard  # choices: [none, wandb, tensorboard, swanlab, mlflow]
 
 ### train
-per_device_train_batch_size: 4
+per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 1.0e-4
 num_train_epochs: 5.0
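
For reference, the effective batch size per optimizer step is per_device_train_batch_size x gradient_accumulation_steps x number of GPUs, so this change moves it from 4 x 8 to 1 x 8 per device while leaving gradient_accumulation_steps untouched. Below is a minimal sketch of launching the updated config with the standard LLaMA-Factory CLI; the single-GPU device selection and the repo-relative config path are assumptions, not part of the diff.

# Sketch: run LoRA SFT with the updated YAML on one GPU (adjust CUDA_VISIBLE_DEVICES as needed)
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train finetune/llamafactory/train_qwen3_lora_sft.yaml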