Refine the Qwen model and add a runner.
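This commit moves chat-style generation out of QWenLMHeadModel and into a new QwenRunner class in modeling_qwen.py. A minimal usage sketch based on the updated demo script in this diff, assuming `model` (a QWenLMHeadModel) and `tokenizer` have already been loaded the way that script loads them:

    from modeling_qwen import QWenLMHeadModel, QwenRunner

    # `model` and `tokenizer` are assumed to be constructed as in the demo script below.
    runner = QwenRunner(model)

    # Chat(tokenizer, query, query_assistant) returns (response, history, decoded_text),
    # mirroring the calls in the updated demo script.
    response, history, decode_tokens = runner.Chat(tokenizer, "东南亚国家日本的首都是什么市", "")
    print(decode_tokens)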
This commit is contained in:
parent 7c047f0b32
commit 9d28280cb1
@@ -1,5 +0,0 @@
{
    "framework": "pytorch",
    "task": "chat",
    "allow_remote": true
}
@@ -1,55 +0,0 @@
from torch.utils import cpp_extension
import pathlib
import os
import subprocess

def _get_cuda_bare_metal_version(cuda_dir):
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
                                         universal_newlines=True)
    output = raw_output.split()
    release_idx = output.index("release") + 1
    release = output[release_idx].split(".")
    bare_metal_major = release[0]
    bare_metal_minor = release[1][0]

    return raw_output, bare_metal_major, bare_metal_minor

def _create_build_dir(buildpath):
    try:
        os.mkdir(buildpath)
    except OSError:
        if not os.path.isdir(buildpath):
            print(f"Creation of the build directory {buildpath} failed")

# Check if cuda 11 is installed for compute capability 8.0
cc_flag = []
_, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
if int(bare_metal_major) >= 11:
    cc_flag.append('-gencode')
    cc_flag.append('arch=compute_80,code=sm_80')
    if int(bare_metal_minor) >= 7:
        cc_flag.append('-gencode')
        cc_flag.append('arch=compute_90,code=sm_90')

# Build path
srcpath = pathlib.Path(__file__).parent.absolute()
buildpath = srcpath / 'build'
_create_build_dir(buildpath)

def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
    return cpp_extension.load(
        name=name,
        sources=sources,
        build_directory=buildpath,
        extra_cflags=['-O3', ],
        extra_cuda_cflags=['-O3',
                           '-gencode', 'arch=compute_70,code=sm_70',
                           '--use_fast_math'] + extra_cuda_flags + cc_flag,
        verbose=1
    )

extra_flags = []

cache_autogptq_cuda_256_sources = ["./cache_autogptq_cuda_256.cpp",
           "./cache_autogptq_cuda_kernel_256.cu"]
cache_autogptq_cuda_256 = _cpp_extention_load_helper("cache_autogptq_cuda_256", cache_autogptq_cuda_256_sources, extra_flags)
@@ -5,6 +5,7 @@ from transformers.generation import GenerationConfig
from transformers import AutoConfig

from modeling_qwen import QWenLMHeadModel
from modeling_qwen import QwenRunner

seed = 4321
torch.manual_seed(seed)
@@ -35,8 +36,10 @@ model = model.eval()
#     model_dir, trust_remote_code=True
# )

runner = QwenRunner(model)

# First round of dialogue
response, history, decode_tokens = model.chat(tokenizer, "东南亚国家日本的首都是什么市", "", history=None)
response, history, decode_tokens = runner.Chat(tokenizer, "东南亚国家日本的首都是什么市", "")
print(decode_tokens)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
@@ -46,7 +49,8 @@ print(decode_tokens)
# 日本的首都东京。<|im_end|><|endoftext|>

# Second round of dialogue
response, history, decode_tokens = model.chat(tokenizer, "给我讲一个年轻人奋斗创业最终取得成功的故事。", "", history=None)

response, history, decode_tokens = runner.Chat(tokenizer, "给我讲一个年轻人奋斗创业最终取得成功的故事。", "")
print(decode_tokens)

if decode_tokens.split("\n")[-2] != """这个故事告诉我们,只要我们有决心和毅力,就一定能够克服困难,实现我们的梦想。<|im_end|>""":
403	qwen/finetune.py
@@ -1,403 +0,0 @@
# This code is based on the revised code from fastchat based on tatsu-lab/stanford_alpaca.


from dataclasses import dataclass, field
import json
import math
import logging
import os
from typing import Dict, Optional, List
import torch
from torch.utils.data import Dataset
from deepspeed import zero
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
import transformers
from transformers import Trainer, GPTQConfig, deepspeed
from transformers.trainer_pt_utils import LabelSmoother
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from accelerate.utils import DistributedType
from modelscope import snapshot_download

from modeling_qwen import QWenLMHeadModel


IGNORE_TOKEN_ID = LabelSmoother.ignore_index


@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="qwen/Qwen-1_8B-Chat")


@dataclass
class DataArguments:
    data_path: str = field(
        default=None, metadata={"help": "Path to the training data."}
    )
    eval_data_path: str = field(
        default=None, metadata={"help": "Path to the evaluation data."}
    )
    lazy_preprocess: bool = False


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(
        default=8192,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    use_lora: bool = False


@dataclass
class LoraArguments:
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_target_modules: List[str] = field(
        default_factory=lambda: ["c_attn", "c_proj", "w1", "w2"]
    )
    lora_weight_path: str = ""
    lora_bias: str = "none"
    q_lora: bool = False


def maybe_zero_3(param):
    if hasattr(param, "ds_id"):
        assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE
        with zero.GatheredParameters([param]):
            param = param.data.detach().cpu().clone()
    else:
        param = param.detach().cpu().clone()
    return param


# Borrowed from peft.utils.get_peft_model_state_dict
def get_peft_state_maybe_zero_3(named_params, bias):
    if bias == "none":
        to_return = {k: t for k, t in named_params if "lora_" in k}
    elif bias == "all":
        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
    elif bias == "lora_only":
        to_return = {}
        maybe_lora_bias = {}
        lora_bias_names = set()
        for k, t in named_params:
            if "lora_" in k:
                to_return[k] = t
                bias_name = k.split("lora_")[0] + "bias"
                lora_bias_names.add(bias_name)
            elif "bias" in k:
                maybe_lora_bias[k] = t
        for k, t in maybe_lora_bias:
            if bias_name in lora_bias_names:
                to_return[bias_name] = t
    else:
        raise NotImplementedError
    to_return = {k: maybe_zero_3(v) for k, v in to_return.items()}
    return to_return


local_rank = None


def rank0_print(*args):
    if local_rank == 0:
        print(*args)


def safe_save_model_for_hf_trainer(
    trainer: transformers.Trainer, output_dir: str, bias="none"
):
    """Collects the state dict and dump to disk."""
    # check if zero3 mode enabled
    if deepspeed.is_deepspeed_zero3_enabled():
        state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict()
    else:
        if trainer.args.use_lora:
            state_dict = get_peft_state_maybe_zero_3(
                trainer.model.named_parameters(), bias
            )
        else:
            state_dict = trainer.model.state_dict()
    if trainer.args.should_save and trainer.args.local_rank == 0:
        trainer._save(output_dir, state_dict=state_dict)


def preprocess(
    sources,
    tokenizer: transformers.PreTrainedTokenizer,
    max_len: int,
    system_message: str = "You are a helpful assistant.",
) -> Dict:
    roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}

    im_start = tokenizer.im_start_id
    im_end = tokenizer.im_end_id
    nl_tokens = tokenizer("\n").input_ids
    _system = tokenizer("system").input_ids + nl_tokens
    _user = tokenizer("user").input_ids + nl_tokens
    _assistant = tokenizer("assistant").input_ids + nl_tokens

    # Apply prompt templates
    input_ids, targets = [], []
    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != roles["user"]:
            source = source[1:]

        input_id, target = [], []
        system = (
            [im_start]
            + _system
            + tokenizer(system_message).input_ids
            + [im_end]
            + nl_tokens
        )
        input_id += system
        target += (
            [im_start] + [IGNORE_TOKEN_ID] * (len(system) - 3) + [im_end] + nl_tokens
        )
        assert len(input_id) == len(target)
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            _input_id = (
                tokenizer(role).input_ids
                + nl_tokens
                + tokenizer(sentence["value"]).input_ids
                + [im_end]
                + nl_tokens
            )
            input_id += _input_id
            if role == "<|im_start|>user":
                _target = (
                    [im_start]
                    + [IGNORE_TOKEN_ID] * (len(_input_id) - 3)
                    + [im_end]
                    + nl_tokens
                )
            elif role == "<|im_start|>assistant":
                _target = (
                    [im_start]
                    + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids)
                    + _input_id[len(tokenizer(role).input_ids) + 1 : -2]
                    + [im_end]
                    + nl_tokens
                )
            else:
                raise NotImplementedError
            target += _target
        assert len(input_id) == len(target)
        input_id += [tokenizer.pad_token_id] * (max_len - len(input_id))
        target += [IGNORE_TOKEN_ID] * (max_len - len(target))
        input_ids.append(input_id[:max_len])
        targets.append(target[:max_len])
    input_ids = torch.tensor(input_ids, dtype=torch.int)
    targets = torch.tensor(targets, dtype=torch.int)

    return dict(
        input_ids=input_ids,
        labels=targets,
        attention_mask=input_ids.ne(tokenizer.pad_token_id),
    )


class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(
        self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
    ):
        super(SupervisedDataset, self).__init__()

        rank0_print("Formatting inputs...")
        sources = [example["conversations"] for example in raw_data]
        data_dict = preprocess(sources, tokenizer, max_len)

        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]
        self.attention_mask = data_dict["attention_mask"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return dict(
            input_ids=self.input_ids[i],
            labels=self.labels[i],
            attention_mask=self.attention_mask[i],
        )


class LazySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(
        self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
    ):
        super(LazySupervisedDataset, self).__init__()
        self.tokenizer = tokenizer
        self.max_len = max_len

        rank0_print("Formatting inputs...Skip in lazy mode")
        self.tokenizer = tokenizer
        self.raw_data = raw_data
        self.cached_data_dict = {}

    def __len__(self):
        return len(self.raw_data)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        if i in self.cached_data_dict:
            return self.cached_data_dict[i]

        ret = preprocess(
            [self.raw_data[i]["conversations"]], self.tokenizer, self.max_len
        )
        ret = dict(
            input_ids=ret["input_ids"][0],
            labels=ret["labels"][0],
            attention_mask=ret["attention_mask"][0],
        )
        self.cached_data_dict[i] = ret

        return ret


def make_supervised_data_module(
    tokenizer: transformers.PreTrainedTokenizer,
    data_args,
    max_len,
) -> Dict:
    """Make dataset and collator for supervised fine-tuning."""
    dataset_cls = (
        LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset
    )
    rank0_print("Loading data...")

    train_json = json.load(open(data_args.data_path, "r"))
    train_dataset = dataset_cls(train_json, tokenizer=tokenizer, max_len=max_len)

    if data_args.eval_data_path:
        eval_json = json.load(open(data_args.eval_data_path, "r"))
        eval_dataset = dataset_cls(eval_json, tokenizer=tokenizer, max_len=max_len)
    else:
        eval_dataset = None

    return dict(train_dataset=train_dataset, eval_dataset=eval_dataset)


def train():
    global local_rank

    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
    )
    (
        model_args,
        data_args,
        training_args,
        lora_args,
    ) = parser.parse_args_into_dataclasses()

    # This serves for single-gpu qlora.
    if (
        getattr(training_args, "deepspeed", None)
        and int(os.environ.get("WORLD_SIZE", 1)) == 1
    ):
        training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED

    local_rank = training_args.local_rank

    device_map = "auto"
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1
    if lora_args.q_lora:
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else "auto"
        if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled():
            logging.warning("FSDP or ZeRO3 are incompatible with QLoRA.")

    model_dir = snapshot_download(model_args.model_name_or_path)

    # Set RoPE scaling factor
    config = transformers.AutoConfig.from_pretrained(
        model_dir,
        cache_dir=training_args.cache_dir,
        trust_remote_code=True,
    )
    config.use_cache = False

    # Load model and tokenizer

    model = QWenLMHeadModel(config)
    model = model.from_pretrained(
        model_dir,
        config=config,
        cache_dir=training_args.cache_dir,
        device_map=device_map,
        trust_remote_code=True,
        quantization_config=GPTQConfig(bits=4, disable_exllama=True)
        if training_args.use_lora and lora_args.q_lora
        else None,
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_dir,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=False,
        trust_remote_code=True,
    )
    tokenizer.pad_token_id = tokenizer.eod_id

    if training_args.use_lora:
        if lora_args.q_lora or "chat" in model_dir.lower():
            modules_to_save = None
        else:
            modules_to_save = ["wte", "lm_head"]
        lora_config = LoraConfig(
            r=lora_args.lora_r,
            lora_alpha=lora_args.lora_alpha,
            target_modules=lora_args.lora_target_modules,
            lora_dropout=lora_args.lora_dropout,
            bias=lora_args.lora_bias,
            task_type="CAUSAL_LM",
            modules_to_save=modules_to_save,  # This argument serves for adding new tokens.
        )
        if lora_args.q_lora:
            model = prepare_model_for_kbit_training(
                model, use_gradient_checkpointing=training_args.gradient_checkpointing
            )

        model = get_peft_model(model, lora_config)

        # Print peft trainable params
        model.print_trainable_parameters()

        if training_args.gradient_checkpointing:
            model.enable_input_require_grads()

    # Load data
    data_module = make_supervised_data_module(
        tokenizer=tokenizer, data_args=data_args, max_len=training_args.model_max_length
    )

    # Start trainner
    trainer = Trainer(
        model=model, tokenizer=tokenizer, args=training_args, **data_module
    )

    trainer.train()
    trainer.save_state()

    safe_save_model_for_hf_trainer(
        trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias
    )


if __name__ == "__main__":
    train()
@@ -1,65 +0,0 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1

MODEL="qwen/Qwen-1_8B-Chat" # Set the path if you do not want to load from huggingface directly
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DATA="data.json"

function usage() {
    echo '
Usage: bash finetune/finetune_lora_single_gpu.sh [-m MODEL_PATH] [-d DATA_PATH]
'
}

while [[ "$1" != "" ]]; do
    case $1 in
        -m | --model )
            shift
            MODEL=$1
            ;;
        -d | --data )
            shift
            DATA=$1
            ;;
        -h | --help )
            usage
            exit 0
            ;;
        * )
            echo "Unknown argument ${1}"
            exit 1
            ;;
    esac
    shift
done

export CUDA_VISIBLE_DEVICES=0

python finetune.py \
  --model_name_or_path $MODEL \
  --data_path $DATA \
  --bf16 False \
  --output_dir output_qwen \
  --num_train_epochs 5 \
  --per_device_train_batch_size 2 \
  --per_device_eval_batch_size 1 \
  --gradient_accumulation_steps 8 \
  --evaluation_strategy "no" \
  --save_strategy "steps" \
  --save_steps 1000 \
  --save_total_limit 10 \
  --learning_rate 3e-4 \
  --weight_decay 0.1 \
  --adam_beta2 0.95 \
  --warmup_ratio 0.01 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --report_to "none" \
  --model_max_length 512 \
  --lazy_preprocess True \
  --gradient_checkpointing \
  --use_lora

# If you use fp16 instead of bf16, you should use deepspeed
# --fp16 True --deepspeed finetune/ds_config_zero2.json
@@ -16,10 +16,7 @@ from torch import nn
from safetensors.torch import load_file as safe_load_file
from safetensors.torch import save_file as safe_save_file

from transformers.generation.utils import GenerateOutput
from configuration_qwen import QWenConfig
from qwen_generation_utils import (
    HistoryType,
    make_context,
    decode_tokens,
)
@@ -137,7 +134,6 @@ class QWenLMHeadModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = QWenModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

@@ -186,65 +182,42 @@ class QWenLMHeadModel(nn.Module):
        print(f"All model checkpoint weights were used when initializing {cls.__class__.__name__}.\n")
        return cls


class QwenRunner:
    def __init__(self, qwen):
        self.qwen = qwen

    @torch.no_grad()
    def chat(
    def Chat(
        self,
        tokenizer,
        query: str,
        query_assistant: str,
        history: Optional[HistoryType],
        system: str = "You are a helpful assistant.",
        **kwargs,
    ) -> Tuple[str, HistoryType]:
        if history is None:
            history = []
        else:
        history=[],
    ):
        qwen = self.qwen
        history = copy.deepcopy(history)

        raw_text, context_tokens = make_context(tokenizer, query, query_assistant, history=history, system=system)
        input_ids = torch.tensor([context_tokens]).to(next(self.parameters()).device)
        outputs = self.generate(
            input_ids,
            tokenizer=tokenizer,
            **kwargs,
        )
        decoded, response, end_reason = decode_tokens(
            outputs[0],
            tokenizer,
            raw_text_len=len(raw_text),
            context_length=len(context_tokens),
            errors="replace",
        )
        history.append((query, response))
        return response, history, decoded
        input_ids = torch.tensor([context_tokens]).to(next(qwen.parameters()).device)
        eos_token_id_tensor = torch.tensor([qwen.config.eos_token_id]).to(input_ids.device)
        pad_token_id = qwen.config.pad_token_id

    def generate(
        self,
        input_ids: Optional[torch.Tensor] = None,
        tokenizer=None,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        pad_token_id = self.config.pad_token_id
        eos_token_id_tensor = torch.tensor([self.config.eos_token_id]).to(input_ids.device)

        # keep track of which sequences are already finished
        unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)

        this_peer_finished = False
        # auto-regressive generation
        while True:
            # forward pass to get next token
            outputs = forwardQWen(self, input_ids)
            outputs = self.forwardQWen(input_ids)
            next_token_scores = outputs[:, -1, :]

            # repetition_penalty
            penalty = self.config.repetition_penalty
            penalty = qwen.config.repetition_penalty
            score = torch.gather(next_token_scores, 1, input_ids)
            # if score < 0 then repetition penalty has to be multiplied to reduce the token probabilities
            score = torch.where(score < 0, score * penalty, score / penalty)
            next_token_scores = next_token_scores.scatter_(1, input_ids, score)

            # top_p
            top_p = self.config.top_p
            top_p = qwen.config.top_p
            filter_value = -float("Inf")
            min_tokens_to_keep = 1
            sorted_logits, sorted_indices = torch.sort(next_token_scores, descending=False)
@@ -262,37 +235,33 @@ class QWenLMHeadModel(nn.Module):
            next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)

            next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)

            # update generated ids, model inputs, and length for next step
            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)

            unfinished_sequences = unfinished_sequences.mul(
                next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
            )

            # decoded, response, end_reason = decode_tokens(
            #     next_tokens,
            #     tokenizer,
            #     raw_text_len=0,
            #     context_length=0,
            #     errors="replace",
            # )
            # print(decoded)

            # stop when each sentence is finished
            if unfinished_sequences.max() == 0:
                this_peer_finished = True

            if this_peer_finished:
                break
        return input_ids

        decoded, response, end_reason = decode_tokens(
            input_ids[0],
            tokenizer,
            raw_text_len=len(raw_text),
            context_length=len(context_tokens),
            errors="replace",
        )
        history.append((query, response))
        return response, history, decoded

def forwardAttention(
    def forwardAttention(
        self,
        attention,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        rotary_pos_emb_list: Optional[List[List[torch.Tensor]]] = None,
):
    ):
        def apply_rotary_pos_emb(t, freqs):
            def _rotate_half(x):
                x = rearrange(x, "... (j d) -> ... j d", j=2)
@@ -340,15 +309,15 @@ def forwardAttention

        return attn_output


def forwardQWenBlock(
    def forwardQWenBlock(
        self,
        block,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        rotary_pos_emb_list: Optional[List[List[torch.Tensor]]] = None,
):
    ):
        layernorm_output = block.ln_1(hidden_states)

    attn_outputs = forwardAttention(block.attn, layernorm_output, rotary_pos_emb_list)
        attn_outputs = self.forwardAttention(block.attn, layernorm_output, rotary_pos_emb_list)
        attn_output = attn_outputs[0]
        layernorm_input = attn_output + hidden_states

@@ -361,13 +330,12 @@ def forwardQWenBlock
        hidden_states = layernorm_input + mlp_output
        return hidden_states


def forwardQWen(
    qwen,
    def forwardQWen(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
):
    transfm = qwen.transformer
    ):
        transfm = self.qwen.transformer
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        hidden_states = transfm.wte(input_ids)
@@ -381,12 +349,12 @@ from forwardQWen
        output_shape = input_shape + (hidden_states.size(-1),)

        for block in transfm.h:
        hidden_states = forwardQWenBlock(block, hidden_states, rotary_pos_emb_list=rotary_pos_emb_list)
            hidden_states = self.forwardQWenBlock(block, hidden_states, rotary_pos_emb_list=rotary_pos_emb_list)

        hidden_states = transfm.ln_f(hidden_states)
        hidden_states = hidden_states.view(output_shape)

    lm_logits = qwen.lm_head(hidden_states)
        lm_logits = self.qwen.lm_head(hidden_states)

        loss = None
        if labels is not None:
@@ -0,0 +1,29 @@

from abc import ABC, abstractmethod

class People(ABC):
    # @abstractmethod
    def walk(self):
        pass

    @abstractmethod
    def eat(self):
        pass

    def auto(self):
        self.walk()
        self.eat()

class kid1(People):
    def __init__(self):
        pass

    def walk(self):
        print('走路')

    def eat(self):
        print('吃饭')

if __name__ == '__main__':
    k = kid1()
    k.auto()