Update finetune

parent ec72ee1141
commit 9deb809a88

@@ -1,5 +1,4 @@
 import sys
 
 sys.path.append("..")
-
 import json

@@ -0,0 +1,2 @@
+from qwen.modeling_qwen import QWenLMHeadModel
+from qwen.configuration_qwen import QWenConfig

@@ -18,13 +18,16 @@ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 from accelerate.utils import DistributedType
 from modelscope import snapshot_download
 
+from modeling_qwen import QWenLMHeadModel
+
+
 IGNORE_TOKEN_ID = LabelSmoother.ignore_index
 
 
 @dataclass
 class ModelArguments:
     model_name_or_path: Optional[str] = field(default="qwen/Qwen-1_8B-Chat")
 
 
 @dataclass
 class DataArguments:

@@ -101,12 +104,15 @@ def get_peft_state_maybe_zero_3(named_params, bias):
 
 local_rank = None
 
 
 def rank0_print(*args):
     if local_rank == 0:
         print(*args)
 
 
-def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str, bias="none"):
+def safe_save_model_for_hf_trainer(
+    trainer: transformers.Trainer, output_dir: str, bias="none"
+):
     """Collects the state dict and dump to disk."""
     # check if zero3 mode enabled
     if deepspeed.is_deepspeed_zero3_enabled():
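
Note: the hunk above ends at the ZeRO-3 branch. For orientation, a minimal sketch of the consolidation it leads into, assuming the script's existing imports (`deepspeed` as re-exported by transformers, `get_peft_state_maybe_zero_3` defined earlier in this file); treat the exact helper calls as an assumption from the upstream Qwen recipe, not this diff:

    def collect_state_dict(trainer, bias="none"):
        # Under ZeRO-3 each rank holds only a parameter shard, so a full
        # state dict must be gathered before writing a single checkpoint.
        if deepspeed.is_deepspeed_zero3_enabled():
            return trainer.model_wrapped._zero3_consolidated_16bit_state_dict()
        if trainer.args.use_lora:
            # Keep only LoRA weights (plus bias terms according to `bias`).
            return get_peft_state_maybe_zero_3(trainer.model.named_parameters(), bias)
        return trainer.model.state_dict()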

@@ -126,16 +132,16 @@ def preprocess(
     sources,
     tokenizer: transformers.PreTrainedTokenizer,
     max_len: int,
-    system_message: str = "You are a helpful assistant."
+    system_message: str = "You are a helpful assistant.",
 ) -> Dict:
     roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}
 
     im_start = tokenizer.im_start_id
     im_end = tokenizer.im_end_id
-    nl_tokens = tokenizer('\n').input_ids
-    _system = tokenizer('system').input_ids + nl_tokens
-    _user = tokenizer('user').input_ids + nl_tokens
-    _assistant = tokenizer('assistant').input_ids + nl_tokens
+    nl_tokens = tokenizer("\n").input_ids
+    _system = tokenizer("system").input_ids + nl_tokens
+    _user = tokenizer("user").input_ids + nl_tokens
+    _assistant = tokenizer("assistant").input_ids + nl_tokens
 
     # Apply prompt templates
     input_ids, targets = [], []

@@ -144,20 +150,43 @@ def preprocess(
             source = source[1:]
 
         input_id, target = [], []
-        system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens
+        system = (
+            [im_start]
+            + _system
+            + tokenizer(system_message).input_ids
+            + [im_end]
+            + nl_tokens
+        )
         input_id += system
-        target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens
+        target += (
+            [im_start] + [IGNORE_TOKEN_ID] * (len(system) - 3) + [im_end] + nl_tokens
+        )
         assert len(input_id) == len(target)
         for j, sentence in enumerate(source):
             role = roles[sentence["from"]]
-            _input_id = tokenizer(role).input_ids + nl_tokens + \
-                tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens
+            _input_id = (
+                tokenizer(role).input_ids
+                + nl_tokens
+                + tokenizer(sentence["value"]).input_ids
+                + [im_end]
+                + nl_tokens
+            )
             input_id += _input_id
-            if role == '<|im_start|>user':
-                _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens
-            elif role == '<|im_start|>assistant':
-                _target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \
-                    _input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens
+            if role == "<|im_start|>user":
+                _target = (
+                    [im_start]
+                    + [IGNORE_TOKEN_ID] * (len(_input_id) - 3)
+                    + [im_end]
+                    + nl_tokens
+                )
+            elif role == "<|im_start|>assistant":
+                _target = (
+                    [im_start]
+                    + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids)
+                    + _input_id[len(tokenizer(role).input_ids) + 1 : -2]
+                    + [im_end]
+                    + nl_tokens
+                )
             else:
                 raise NotImplementedError
             target += _target
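
To make the masking arithmetic above concrete: each turn is laid out as <|im_start|>{role}\n{text}<|im_end|>\n, user turns are fully masked with IGNORE_TOKEN_ID, and `len(...) - 3` means "everything except im_start, im_end and the trailing newline". A self-contained toy version (all token ids hypothetical):

    IGNORE_TOKEN_ID = -100                # LabelSmoother.ignore_index

    im_start, im_end, nl = 1, 2, 3        # stand-ins for the real special tokens
    role_ids = [im_start, 40]             # ~ tokenizer("<|im_start|>user").input_ids
    text_ids = [10, 11, 12]               # ~ tokenizer(message).input_ids

    _input_id = role_ids + [nl] + text_ids + [im_end, nl]
    # user turn: mask all but im_start, im_end and the final newline
    _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id) - 3) + [im_end, nl]
    assert len(_target) == len(_input_id)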

@@ -179,7 +208,9 @@ def preprocess(
 class SupervisedDataset(Dataset):
     """Dataset for supervised fine-tuning."""
 
-    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int):
+    def __init__(
+        self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
+    ):
         super(SupervisedDataset, self).__init__()
 
         rank0_print("Formatting inputs...")

@@ -204,7 +235,9 @@ class SupervisedDataset(Dataset):
 class LazySupervisedDataset(Dataset):
     """Dataset for supervised fine-tuning."""
 
-    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int):
+    def __init__(
+        self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
+    ):
         super(LazySupervisedDataset, self).__init__()
         self.tokenizer = tokenizer
         self.max_len = max_len

@@ -221,7 +254,9 @@ class LazySupervisedDataset(Dataset):
         if i in self.cached_data_dict:
             return self.cached_data_dict[i]
 
-        ret = preprocess([self.raw_data[i]["conversations"]], self.tokenizer, self.max_len)
+        ret = preprocess(
+            [self.raw_data[i]["conversations"]], self.tokenizer, self.max_len
+        )
         ret = dict(
             input_ids=ret["input_ids"][0],
             labels=ret["labels"][0],
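
The change above only re-wraps the call; the behavior stays the same: LazySupervisedDataset tokenizes a conversation the first time it is requested and memoizes the result. The pattern in isolation (hypothetical names, stand-in tokenize function):

    from torch.utils.data import Dataset

    class LazyCachedDataset(Dataset):
        def __init__(self, raw_data, tokenize_fn):
            self.raw_data = raw_data
            self.tokenize_fn = tokenize_fn   # e.g. preprocess(...) from above
            self.cache = {}

        def __len__(self):
            return len(self.raw_data)

        def __getitem__(self, i):
            if i not in self.cache:          # pay the tokenization cost only once
                self.cache[i] = self.tokenize_fn(self.raw_data[i])
            return self.cache[i]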

@@ -233,7 +268,9 @@ class LazySupervisedDataset(Dataset):
 
 
 def make_supervised_data_module(
-    tokenizer: transformers.PreTrainedTokenizer, data_args, max_len,
+    tokenizer: transformers.PreTrainedTokenizer,
+    data_args,
+    max_len,
 ) -> Dict:
     """Make dataset and collator for supervised fine-tuning."""
     dataset_cls = (

@@ -267,7 +304,10 @@ def train():
     ) = parser.parse_args_into_dataclasses()
 
     # This serves for single-gpu qlora.
-    if getattr(training_args, 'deepspeed', None) and int(os.environ.get("WORLD_SIZE", 1))==1:
+    if (
+        getattr(training_args, "deepspeed", None)
+        and int(os.environ.get("WORLD_SIZE", 1)) == 1
+    ):
         training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED
 
     local_rank = training_args.local_rank

@@ -278,9 +318,7 @@ def train():
     if lora_args.q_lora:
         device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else "auto"
         if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled():
-            logging.warning(
-                "FSDP or ZeRO3 are incompatible with QLoRA."
-            )
+            logging.warning("FSDP or ZeRO3 are incompatible with QLoRA.")
 
     model_dir = snapshot_download(model_args.model_name_or_path)
 

@@ -294,19 +332,18 @@ def train():
 
     # Load model and tokenizer
 
-    model = transformers.AutoModelForCausalLM.from_pretrained(
+    model = QWenLMHeadModel(config)
+    model = model.from_pretrained(
         model_dir,
         config=config,
         cache_dir=training_args.cache_dir,
         device_map=device_map,
         trust_remote_code=True,
-        quantization_config=GPTQConfig(
-            bits=4, disable_exllama=True
-        )
+        quantization_config=GPTQConfig(bits=4, disable_exllama=True)
         if training_args.use_lora and lora_args.q_lora
         else None,
     )
 
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         model_dir,
         cache_dir=training_args.cache_dir,
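
Two notes on the load path above. `from_pretrained` is a classmethod, so the instance built by `QWenLMHeadModel(config)` is discarded by the following call; invoking the classmethod directly is equivalent. And `disable_exllama=True` matters in the QLoRA branch because the ExLlama GPTQ kernels are inference-only. A hedged one-step sketch (assumes the repo's qwen package is importable; model_dir is a hypothetical local checkpoint path):

    from qwen.configuration_qwen import QWenConfig
    from qwen.modeling_qwen import QWenLMHeadModel

    model_dir = "./Qwen-1_8B-Chat"  # e.g. the directory snapshot_download returned
    config = QWenConfig.from_pretrained(model_dir)
    model = QWenLMHeadModel.from_pretrained(model_dir, config=config)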

@@ -318,7 +355,7 @@ def train():
     tokenizer.pad_token_id = tokenizer.eod_id
 
     if training_args.use_lora:
-        if lora_args.q_lora or 'chat' in model_dir.lower():
+        if lora_args.q_lora or "chat" in model_dir.lower():
             modules_to_save = None
         else:
             modules_to_save = ["wte", "lm_head"]

@@ -329,7 +366,7 @@ def train():
             lora_dropout=lora_args.lora_dropout,
             bias=lora_args.lora_bias,
             task_type="CAUSAL_LM",
-            modules_to_save=modules_to_save  # This argument serves for adding new tokens.
+            modules_to_save=modules_to_save,  # This argument serves for adding new tokens.
         )
         if lora_args.q_lora:
             model = prepare_model_for_kbit_training(
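
Read together with the earlier modules_to_save hunk: chat and QLoRA runs leave modules_to_save unset, while base-model runs keep the embedding ("wte") and "lm_head" fully trainable so newly added tokens can be learned. An illustrative config (r/alpha/dropout and target_modules are assumptions, not values from this diff; the script takes them from lora_args):

    from peft import LoraConfig

    lora_config = LoraConfig(
        r=64,
        lora_alpha=16,
        target_modules=["c_attn", "c_proj", "w1", "w2"],  # Qwen attn/MLP projections
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        modules_to_save=["wte", "lm_head"],  # only when training a base model
    )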

@@ -357,7 +394,9 @@ def train():
     trainer.train()
     trainer.save_state()
 
-    safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias)
+    safe_save_model_for_hf_trainer(
+        trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias
+    )
 
 
 if __name__ == "__main__":

qwen/finetune/finetune_lora_single_gpu.sh → qwen/finetune_lora_single_gpu.sh
Normal file → Executable file

qwen/finetune/finetune_qlora_single_gpu.sh → qwen/finetune_qlora_single_gpu.sh
Normal file → Executable file

@@ -0,0 +1 @@
+import show