Refine the Qwen model and add a runner.

2024-01-21 12:45:56 +08:00 · 2024-01-21 12:45:56 +08:00 · 9d28280cb1
parent 7c047f0b32
commit 9d28280cb1
8 changed files with 157 additions and 684 deletions
--- a/qwen/configuration.json
+++ b/qwen/configuration.json
@ -1,5 +0,0 @@
-{
-    "framework": "pytorch",
-    "task": "chat",
-    "allow_remote": true
-}
--- a/qwen/cpp_kernels.py
+++ b/qwen/cpp_kernels.py
@ -1,55 +0,0 @@
-from torch.utils import cpp_extension
-import pathlib
-import os
-import subprocess
-
-def _get_cuda_bare_metal_version(cuda_dir):
-    """Run ``nvcc -V`` under *cuda_dir* and extract the CUDA release version.
-
-    Args:
-        cuda_dir: CUDA installation root; ``<cuda_dir>/bin/nvcc`` is executed.
-
-    Returns:
-        A ``(raw_output, major, minor)`` tuple: the full text printed by nvcc
-        and the two release components as strings.
-
-    Raises:
-        OSError / subprocess.CalledProcessError: if nvcc cannot be run or
-            exits with a non-zero status.
-        ValueError: if the output contains no ``release`` token.
-    """
-    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
-                                         universal_newlines=True)
-    tokens = raw_output.split()
-    # The token following "release" looks like "<major>.<minor>," - drop the
-    # trailing comma before splitting on the dot.
-    release = tokens[tokens.index("release") + 1].rstrip(",").split(".")
-    bare_metal_major = release[0]
-    # Keep the whole minor component: taking only its first character would
-    # turn a two-digit minor such as "10" into "1".
-    bare_metal_minor = release[1]
-
-    return raw_output, bare_metal_major, bare_metal_minor
-
-def _create_build_dir(buildpath):
-    """Create *buildpath* as a directory, tolerating one that already exists.
-
-    A failure is reported with a printed message; no exception propagates
-    from the ``os.mkdir`` call.
-    """
-    try:
-        os.mkdir(buildpath)
-    except OSError:
-        if os.path.isdir(buildpath):
-            # Already present: nothing to report.
-            return
-        print(f"Creation of the build directory {buildpath} failed")
-
-# Pick extra nvcc -gencode targets from the CUDA release reported by nvcc.
-# NOTE: this runs at import time and executes <CUDA_HOME>/bin/nvcc.
-cc_flag = []
-_, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
-if int(bare_metal_major) >= 11:
-    # CUDA 11 or newer: also build for sm_80.
-    cc_flag.append('-gencode')
-    cc_flag.append('arch=compute_80,code=sm_80')
-    # NOTE(review): the minor check ignores the major version, so e.g. 12.0
-    # does not get sm_90 while 11.7 does - confirm this is intended.
-    if int(bare_metal_minor) >= 7:
-        cc_flag.append('-gencode')
-        cc_flag.append('arch=compute_90,code=sm_90')
-
-# Build path: a "build" directory next to this source file.
-srcpath = pathlib.Path(__file__).parent.absolute()
-buildpath = srcpath / 'build'
-_create_build_dir(buildpath)
-
-def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
-    """Build and load a C++/CUDA extension through ``cpp_extension.load``.
-
-    Args:
-        name: extension name passed to the loader.
-        sources: list of source files to compile.
-        extra_cuda_flags: nvcc flags inserted between the fixed flags and the
-            module-level ``cc_flag`` list.
-
-    Returns:
-        Whatever ``cpp_extension.load`` returns for the built extension.
-    """
-    host_flags = ['-O3', ]
-    fixed_cuda_flags = [
-        '-O3',
-        '-gencode', 'arch=compute_70,code=sm_70',
-        '--use_fast_math',
-    ]
-    cuda_flags = fixed_cuda_flags + extra_cuda_flags + cc_flag
-    loader_options = dict(
-        name=name,
-        sources=sources,
-        build_directory=buildpath,
-        extra_cflags=host_flags,
-        extra_cuda_cflags=cuda_flags,
-        verbose=1,
-    )
-    return cpp_extension.load(**loader_options)
-
-# No additional nvcc flags beyond those the load helper always adds.
-extra_flags = []
-
-# Compile and load the cache_autogptq_cuda_256 extension at import time.
-# NOTE(review): the source paths are relative ("./..."), so presumably they
-# resolve against the current working directory rather than srcpath - confirm.
-cache_autogptq_cuda_256_sources = ["./cache_autogptq_cuda_256.cpp",
-           "./cache_autogptq_cuda_kernel_256.cu"]
-cache_autogptq_cuda_256 = _cpp_extention_load_helper("cache_autogptq_cuda_256", cache_autogptq_cuda_256_sources, extra_flags)
--- a/qwen/demo.py
+++ b/qwen/demo.py
@ -5,6 +5,7 @@ from transformers.generation import GenerationConfig
 from transformers import AutoConfig

 from modeling_qwen import QWenLMHeadModel
+from modeling_qwen import QwenRunner

 seed = 4321
 torch.manual_seed(seed)
@ -35,8 +36,10 @@ model = model.eval()
 #     model_dir, trust_remote_code=True
 # )

+runner = QwenRunner(model)
+
 # 第一轮对话
-response, history, decode_tokens = model.chat(tokenizer, "东南亚国家日本的首都是什么市", "", history=None)
+response, history, decode_tokens = runner.Chat(tokenizer, "东南亚国家日本的首都是什么市", "")
 print(decode_tokens)
 # <|im_start|>system
 # You are a helpful assistant.<|im_end|>
@ -46,7 +49,8 @@ print(decode_tokens)
 # 日本的首都东京。<|im_end|><|endoftext|>

 # 第二轮对话
-response, history, decode_tokens = model.chat(tokenizer, "给我讲一个年轻人奋斗创业最终取得成功的故事。", "", history=None)
+
+response, history, decode_tokens = runner.Chat(tokenizer, "给我讲一个年轻人奋斗创业最终取得成功的故事。", "")
 print(decode_tokens)

 if decode_tokens.split("\n")[-2] != """这个故事告诉我们，只要我们有决心和毅力，就一定能够克服困难，实现我们的梦想。<|im_end|>""":
--- a/qwen/finetune.py
+++ b/qwen/finetune.py
@ -1,403 +0,0 @@
-# This code is based on the revised code from fastchat based on tatsu-lab/stanford_alpaca.
-
-
-from dataclasses import dataclass, field
-import json
-import math
-import logging
-import os
-from typing import Dict, Optional, List
-import torch
-from torch.utils.data import Dataset
-from deepspeed import zero
-from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
-import transformers
-from transformers import Trainer, GPTQConfig, deepspeed
-from transformers.trainer_pt_utils import LabelSmoother
-from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-from accelerate.utils import DistributedType
-from modelscope import snapshot_download
-
-from modeling_qwen import QWenLMHeadModel
-
-
-# Sentinel that preprocess() writes into label positions it masks out; the
-# value is taken from transformers' LabelSmoother.ignore_index.
-IGNORE_TOKEN_ID = LabelSmoother.ignore_index
-
-
-# Command-line arguments selecting the model; parsed by HfArgumentParser in
-# train().
-@dataclass
-class ModelArguments:
-    # Model identifier or path; train() passes it to snapshot_download().
-    model_name_or_path: Optional[str] = field(default="qwen/Qwen-1_8B-Chat")
-
-
-# Command-line arguments describing the data files; parsed by
-# HfArgumentParser in train().
-@dataclass
-class DataArguments:
-    # JSON file with the training conversations (read by
-    # make_supervised_data_module). Defaults to None despite the str
-    # annotation.
-    data_path: str = field(
-        default=None, metadata={"help": "Path to the training data."}
-    )
-    # Optional JSON file for evaluation; no eval dataset is built when unset.
-    eval_data_path: str = field(
-        default=None, metadata={"help": "Path to the evaluation data."}
-    )
-    # True selects LazySupervisedDataset, False selects SupervisedDataset.
-    lazy_preprocess: bool = False
-
-
-# transformers.TrainingArguments extended with the extra options this script
-# reads.
-@dataclass
-class TrainingArguments(transformers.TrainingArguments):
-    # Forwarded as cache_dir to the from_pretrained() calls in train().
-    cache_dir: Optional[str] = field(default=None)
-    # Overrides the inherited optimizer default.
-    optim: str = field(default="adamw_torch")
-    # Used both as the tokenizer's model_max_length and as the fixed
-    # pad/truncate length in preprocess().
-    model_max_length: int = field(
-        default=8192,
-        metadata={
-            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
-        },
-    )
-    # When True, train() wraps the model with a PEFT LoRA adapter.
-    use_lora: bool = False
-
-
-# LoRA options; most fields are forwarded to peft.LoraConfig in train().
-@dataclass
-class LoraArguments:
-    # Passed to LoraConfig as r.
-    lora_r: int = 64
-    # Passed to LoraConfig as lora_alpha.
-    lora_alpha: int = 16
-    # Passed to LoraConfig as lora_dropout.
-    lora_dropout: float = 0.05
-    # Names of the modules that receive LoRA adapters (LoraConfig
-    # target_modules).
-    lora_target_modules: List[str] = field(
-        default_factory=lambda: ["c_attn", "c_proj", "w1", "w2"]
-    )
-    # NOTE(review): not read anywhere in the visible part of this file.
-    lora_weight_path: str = ""
-    # Passed to LoraConfig as bias and to safe_save_model_for_hf_trainer().
-    lora_bias: str = "none"
-    # Enables the GPTQ 4-bit quantization config and
-    # prepare_model_for_kbit_training() in train().
-    q_lora: bool = False
-
-
-def maybe_zero_3(param):
-    """Return a detached CPU copy of *param*.
-
-    A parameter carrying a ``ds_id`` attribute is first gathered through
-    ``zero.GatheredParameters``; any other tensor is copied directly.
-    """
-    if not hasattr(param, "ds_id"):
-        return param.detach().cpu().clone()
-
-    assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE
-    with zero.GatheredParameters([param]):
-        gathered = param.data.detach().cpu().clone()
-    return gathered
-
-
-# Borrowed from peft.utils.get_peft_model_state_dict
-def get_peft_state_maybe_zero_3(named_params, bias):
-    """Select the LoRA-related parameters and copy them to CPU.
-
-    Args:
-        named_params: iterable of ``(name, parameter)`` pairs.
-        bias: ``"none"`` keeps only names containing ``"lora_"``; ``"all"``
-            additionally keeps every name containing ``"bias"``;
-            ``"lora_only"`` additionally keeps the bias that sits next to a
-            LoRA parameter (same name prefix before ``"lora_"``).
-
-    Returns:
-        Dict mapping each selected name to ``maybe_zero_3(parameter)``.
-
-    Raises:
-        NotImplementedError: for any other *bias* value.
-    """
-    if bias == "none":
-        to_return = {k: t for k, t in named_params if "lora_" in k}
-    elif bias == "all":
-        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
-    elif bias == "lora_only":
-        to_return = {}
-        maybe_lora_bias = {}
-        lora_bias_names = set()
-        for k, t in named_params:
-            if "lora_" in k:
-                to_return[k] = t
-                lora_bias_names.add(k.split("lora_")[0] + "bias")
-            elif "bias" in k:
-                maybe_lora_bias[k] = t
-        # Iterate items (not bare keys) and test each candidate's own name;
-        # the previous loop variable must not be reused here.
-        for k, t in maybe_lora_bias.items():
-            if k in lora_bias_names:
-                to_return[k] = t
-    else:
-        raise NotImplementedError
-    return {k: maybe_zero_3(v) for k, v in to_return.items()}
-
-
-# Overwritten by train() with training_args.local_rank.
-local_rank = None
-
-
-def rank0_print(*args):
-    """Print *args* only when the module-level ``local_rank`` equals 0."""
-    if local_rank != 0:
-        return
-    print(*args)
-
-
-def safe_save_model_for_hf_trainer(
-    trainer: transformers.Trainer, output_dir: str, bias="none"
-):
-    """Collect the model's state dict and write it to *output_dir*.
-
-    The state dict comes from the ZeRO-3 consolidation helper when ZeRO-3 is
-    enabled, from ``get_peft_state_maybe_zero_3`` when LoRA is in use, and
-    from ``trainer.model.state_dict()`` otherwise. The save itself happens
-    only when ``should_save`` is set and ``local_rank`` is 0.
-    """
-    if deepspeed.is_deepspeed_zero3_enabled():
-        state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict()
-    elif trainer.args.use_lora:
-        state_dict = get_peft_state_maybe_zero_3(
-            trainer.model.named_parameters(), bias
-        )
-    else:
-        state_dict = trainer.model.state_dict()
-
-    is_writer = trainer.args.should_save and trainer.args.local_rank == 0
-    if is_writer:
-        trainer._save(output_dir, state_dict=state_dict)
-
-
-def preprocess(
-    sources,
-    tokenizer: transformers.PreTrainedTokenizer,
-    max_len: int,
-    system_message: str = "You are a helpful assistant.",
-) -> Dict:
-    """Tokenize conversations into fixed-length input/label tensors.
-
-    Each conversation becomes: a system block, then one block per turn, each
-    block being ``<im_start> role \\n text <im_end> \\n``. Labels copy the
-    assistant text and the block delimiters and hold ``IGNORE_TOKEN_ID``
-    elsewhere.
-
-    Args:
-        sources: list of conversations; each is a list of dicts with a
-            ``"from"`` key (``"user"`` or ``"assistant"``) and a ``"value"``
-            key.
-        tokenizer: must expose ``im_start_id``, ``im_end_id`` and
-            ``pad_token_id``.
-        max_len: every row is padded and then cut to this length.
-        system_message: text placed in the leading system block.
-
-    Returns:
-        Dict with ``input_ids``, ``labels`` and ``attention_mask`` tensors.
-
-    Raises:
-        KeyError: if a ``"from"`` value is not a key of ``roles``.
-    """
-    roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}
-
-    im_start = tokenizer.im_start_id
-    im_end = tokenizer.im_end_id
-    nl_tokens = tokenizer("\n").input_ids
-    _system = tokenizer("system").input_ids + nl_tokens
-    # NOTE(review): _user and _assistant are computed but never used below.
-    _user = tokenizer("user").input_ids + nl_tokens
-    _assistant = tokenizer("assistant").input_ids + nl_tokens
-
-    # Apply prompt templates
-    input_ids, targets = [], []
-    for i, source in enumerate(sources):
-        # Drop the first message when the conversation does not open with a
-        # user turn.
-        if roles[source[0]["from"]] != roles["user"]:
-            source = source[1:]
-
-        input_id, target = [], []
-        system = (
-            [im_start]
-            + _system
-            + tokenizer(system_message).input_ids
-            + [im_end]
-            + nl_tokens
-        )
-        input_id += system
-        # Mask everything between the delimiters. The "- 3" stands for
-        # im_start, im_end and the newline, so the lengths match only when
-        # "\n" tokenizes to a single id (the assert below checks this).
-        target += (
-            [im_start] + [IGNORE_TOKEN_ID] * (len(system) - 3) + [im_end] + nl_tokens
-        )
-        assert len(input_id) == len(target)
-        for j, sentence in enumerate(source):
-            role = roles[sentence["from"]]
-            _input_id = (
-                tokenizer(role).input_ids
-                + nl_tokens
-                + tokenizer(sentence["value"]).input_ids
-                + [im_end]
-                + nl_tokens
-            )
-            input_id += _input_id
-            if role == "<|im_start|>user":
-                # User turns are fully masked apart from the delimiters.
-                _target = (
-                    [im_start]
-                    + [IGNORE_TOKEN_ID] * (len(_input_id) - 3)
-                    + [im_end]
-                    + nl_tokens
-                )
-            elif role == "<|im_start|>assistant":
-                # Mask the role header, keep the reply tokens as labels.
-                # Presumably tokenizer(role) starts with the single im_start
-                # id, which is what makes the offsets line up - TODO confirm.
-                _target = (
-                    [im_start]
-                    + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids)
-                    + _input_id[len(tokenizer(role).input_ids) + 1 : -2]
-                    + [im_end]
-                    + nl_tokens
-                )
-            else:
-                raise NotImplementedError
-            target += _target
-        assert len(input_id) == len(target)
-        # Right-pad to max_len (a no-op for longer rows), then truncate.
-        input_id += [tokenizer.pad_token_id] * (max_len - len(input_id))
-        target += [IGNORE_TOKEN_ID] * (max_len - len(target))
-        input_ids.append(input_id[:max_len])
-        targets.append(target[:max_len])
-    input_ids = torch.tensor(input_ids, dtype=torch.int)
-    targets = torch.tensor(targets, dtype=torch.int)
-
-    return dict(
-        input_ids=input_ids,
-        labels=targets,
-        # Every position holding pad_token_id is marked as not attended.
-        attention_mask=input_ids.ne(tokenizer.pad_token_id),
-    )
-
-
-class SupervisedDataset(Dataset):
-    """Supervised fine-tuning dataset that tokenizes everything up front."""
-
-    def __init__(
-        self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
-    ):
-        """Run ``preprocess`` once over the ``"conversations"`` of *raw_data*."""
-        super().__init__()
-
-        rank0_print("Formatting inputs...")
-        conversations = [record["conversations"] for record in raw_data]
-        encoded = preprocess(conversations, tokenizer, max_len)
-
-        self.input_ids = encoded["input_ids"]
-        self.labels = encoded["labels"]
-        self.attention_mask = encoded["attention_mask"]
-
-    def __len__(self):
-        """Number of tokenized conversations."""
-        return len(self.input_ids)
-
-    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
-        """Return the tensors of example *i* keyed by field name."""
-        return {
-            "input_ids": self.input_ids[i],
-            "labels": self.labels[i],
-            "attention_mask": self.attention_mask[i],
-        }
-
-
-class LazySupervisedDataset(Dataset):
-    """Supervised fine-tuning dataset that tokenizes examples on first access."""
-
-    def __init__(
-        self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
-    ):
-        """Store the raw records; nothing is tokenized here."""
-        super().__init__()
-        self.max_len = max_len
-
-        rank0_print("Formatting inputs...Skip in lazy mode")
-        self.tokenizer = tokenizer
-        self.raw_data = raw_data
-        # index -> already tokenized example
-        self.cached_data_dict = {}
-
-    def __len__(self):
-        """Number of raw records."""
-        return len(self.raw_data)
-
-    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
-        """Return example *i*, tokenizing and caching it on the first request."""
-        try:
-            return self.cached_data_dict[i]
-        except KeyError:
-            pass
-
-        conversation = self.raw_data[i]["conversations"]
-        batch = preprocess([conversation], self.tokenizer, self.max_len)
-        # preprocess returns batched tensors; take the single row.
-        item = {
-            "input_ids": batch["input_ids"][0],
-            "labels": batch["labels"][0],
-            "attention_mask": batch["attention_mask"][0],
-        }
-        self.cached_data_dict[i] = item
-
-        return item
-
-
-def make_supervised_data_module(
-    tokenizer: transformers.PreTrainedTokenizer,
-    data_args,
-    max_len,
-) -> Dict:
-    """Build the train (and optional eval) dataset for supervised fine-tuning.
-
-    Args:
-        tokenizer: tokenizer handed to the dataset class.
-        data_args: object with ``data_path``, ``eval_data_path`` and
-            ``lazy_preprocess`` attributes.
-        max_len: sequence length handed to the dataset class.
-
-    Returns:
-        Dict with ``train_dataset`` and ``eval_dataset``; the latter is
-        ``None`` when ``data_args.eval_data_path`` is falsy.
-    """
-    dataset_cls = (
-        LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset
-    )
-    rank0_print("Loading data...")
-
-    # Read through a context manager so the file handle is closed right away
-    # instead of being left to the garbage collector.
-    with open(data_args.data_path, "r") as train_file:
-        train_json = json.load(train_file)
-    train_dataset = dataset_cls(train_json, tokenizer=tokenizer, max_len=max_len)
-
-    eval_dataset = None
-    if data_args.eval_data_path:
-        with open(data_args.eval_data_path, "r") as eval_file:
-            eval_json = json.load(eval_file)
-        eval_dataset = dataset_cls(eval_json, tokenizer=tokenizer, max_len=max_len)
-
-    return dict(train_dataset=train_dataset, eval_dataset=eval_dataset)
-
-
-def train():
-    """Parse CLI arguments, load model and tokenizer, fine-tune and save.
-
-    Steps: parse the four argument dataclasses, download the model snapshot,
-    load config/model/tokenizer, optionally wrap the model with LoRA, build
-    the datasets, run ``Trainer.train()`` and save state and weights.
-    Sets the module-level ``local_rank`` used by ``rank0_print``.
-    """
-    global local_rank
-
-    parser = transformers.HfArgumentParser(
-        (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
-    )
-    (
-        model_args,
-        data_args,
-        training_args,
-        lora_args,
-    ) = parser.parse_args_into_dataclasses()
-
-    # This serves for single-gpu qlora.
-    if (
-        getattr(training_args, "deepspeed", None)
-        and int(os.environ.get("WORLD_SIZE", 1)) == 1
-    ):
-        training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED
-
-    local_rank = training_args.local_rank
-
-    device_map = "auto"
-    world_size = int(os.environ.get("WORLD_SIZE", 1))
-    ddp = world_size != 1
-    if lora_args.q_lora:
-        # With more than one process, pin the whole model to this process's
-        # LOCAL_RANK device; otherwise keep "auto".
-        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else "auto"
-        if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled():
-            logging.warning("FSDP or ZeRO3 are incompatible with QLoRA.")
-
-    model_dir = snapshot_download(model_args.model_name_or_path)
-
-    # Load the model config and turn off use_cache for training.
-    config = transformers.AutoConfig.from_pretrained(
-        model_dir,
-        cache_dir=training_args.cache_dir,
-        trust_remote_code=True,
-    )
-    config.use_cache = False
-
-    # Load model and tokenizer
-
-    # NOTE(review): from_pretrained is called on a freshly built instance and
-    # its return value replaces that instance - confirm against
-    # QWenLMHeadModel.from_pretrained.
-    model = QWenLMHeadModel(config)
-    model = model.from_pretrained(
-        model_dir,
-        config=config,
-        cache_dir=training_args.cache_dir,
-        device_map=device_map,
-        trust_remote_code=True,
-        quantization_config=GPTQConfig(bits=4, disable_exllama=True)
-        if training_args.use_lora and lora_args.q_lora
-        else None,
-    )
-
-    tokenizer = transformers.AutoTokenizer.from_pretrained(
-        model_dir,
-        cache_dir=training_args.cache_dir,
-        model_max_length=training_args.model_max_length,
-        padding_side="right",
-        use_fast=False,
-        trust_remote_code=True,
-    )
-    # Pad with the tokenizer's eod id; preprocess() derives the attention
-    # mask from this value.
-    tokenizer.pad_token_id = tokenizer.eod_id
-
-    if training_args.use_lora:
-        # Keep wte/lm_head trainable only for non-chat, non-QLoRA models.
-        if lora_args.q_lora or "chat" in model_dir.lower():
-            modules_to_save = None
-        else:
-            modules_to_save = ["wte", "lm_head"]
-        lora_config = LoraConfig(
-            r=lora_args.lora_r,
-            lora_alpha=lora_args.lora_alpha,
-            target_modules=lora_args.lora_target_modules,
-            lora_dropout=lora_args.lora_dropout,
-            bias=lora_args.lora_bias,
-            task_type="CAUSAL_LM",
-            modules_to_save=modules_to_save,  # This argument serves for adding new tokens.
-        )
-        if lora_args.q_lora:
-            model = prepare_model_for_kbit_training(
-                model, use_gradient_checkpointing=training_args.gradient_checkpointing
-            )
-
-        model = get_peft_model(model, lora_config)
-
-        # Print peft trainable params
-        model.print_trainable_parameters()
-
-        if training_args.gradient_checkpointing:
-            model.enable_input_require_grads()
-
-    # Load data
-    data_module = make_supervised_data_module(
-        tokenizer=tokenizer, data_args=data_args, max_len=training_args.model_max_length
-    )
-
-    # Start trainer
-    trainer = Trainer(
-        model=model, tokenizer=tokenizer, args=training_args, **data_module
-    )
-
-    trainer.train()
-    trainer.save_state()
-
-    safe_save_model_for_hf_trainer(
-        trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias
-    )
-
-
-# Run the fine-tuning entry point only when executed as a script.
-if __name__ == "__main__":
-    train()
--- a/qwen/finetune_lora_single_gpu.sh
+++ b/qwen/finetune_lora_single_gpu.sh
@ -1,65 +0,0 @@
-#!/bin/bash
-# Single-GPU LoRA fine-tuning launcher: parses -m/-d options and then runs
-# finetune.py with a fixed set of training hyperparameters.
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-MODEL="qwen/Qwen-1_8B-Chat" # Set the path if you do not want to load from huggingface directly
-# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
-# See the section for finetuning in README for more information.
-DATA="data.json"
-
-# Print the usage text.
-# NOTE(review): the text names finetune/finetune_lora_single_gpu.sh while the
-# command below runs finetune.py relative to the current directory - confirm
-# which directory the script is meant to be started from.
-function usage() {
-    echo '
-Usage: bash finetune/finetune_lora_single_gpu.sh [-m MODEL_PATH] [-d DATA_PATH]
-'
-}
-
-# Consume options one at a time; any unknown option aborts with status 1.
-while [[ "$1" != "" ]]; do
-    case $1 in
-        -m | --model )
-            shift
-            MODEL=$1
-            ;;
-        -d | --data )
-            shift
-            DATA=$1
-            ;;
-        -h | --help )
-            usage
-            exit 0
-            ;;
-        * )
-            echo "Unknown argument ${1}"
-            exit 1
-            ;;
-    esac
-    shift
-done
-
-# Expose only device 0 to the training process.
-export CUDA_VISIBLE_DEVICES=0
-
-# NOTE(review): $MODEL and $DATA are expanded unquoted, so a value containing
-# whitespace would be split into several arguments.
-python finetune.py \
-  --model_name_or_path $MODEL \
-  --data_path $DATA \
-  --bf16 False \
-  --output_dir output_qwen \
-  --num_train_epochs 5 \
-  --per_device_train_batch_size 2 \
-  --per_device_eval_batch_size 1 \
-  --gradient_accumulation_steps 8 \
-  --evaluation_strategy "no" \
-  --save_strategy "steps" \
-  --save_steps 1000 \
-  --save_total_limit 10 \
-  --learning_rate 3e-4 \
-  --weight_decay 0.1 \
-  --adam_beta2 0.95 \
-  --warmup_ratio 0.01 \
-  --lr_scheduler_type "cosine" \
-  --logging_steps 1 \
-  --report_to "none" \
-  --model_max_length 512 \
-  --lazy_preprocess True \
-  --gradient_checkpointing \
-  --use_lora
-
-# If you use fp16 instead of bf16, you should use deepspeed
-# --fp16 True --deepspeed finetune/ds_config_zero2.json
--- a/qwen/modeling_qwen.py
+++ b/qwen/modeling_qwen.py
@ -16,10 +16,7 @@ from torch import nn
 from safetensors.torch import load_file as safe_load_file
 from safetensors.torch import save_file as safe_save_file

-from transformers.generation.utils import GenerateOutput
-from configuration_qwen import QWenConfig
 from qwen_generation_utils import (
-    HistoryType,
    make_context,
    decode_tokens,
 )
@ -137,7 +134,6 @@ class QWenLMHeadModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
-
        self.transformer = QWenModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

@ -186,65 +182,42 @@ class QWenLMHeadModel(nn.Module):
        print(f"All model checkpoint weights were used when initializing {cls.__class__.__name__}.\n")
        return cls

+
+class QwenRunner:
+    def __init__(self, qwen):
+        self.qwen = qwen
+
    @torch.no_grad()
-    def chat(
+    def Chat(
        self,
        tokenizer,
        query: str,
        query_assistant: str,
-        history: Optional[HistoryType],
        system: str = "You are a helpful assistant.",
-        **kwargs,
-    ) -> Tuple[str, HistoryType]:
-        if history is None:
-            history = []
-        else:
+        history=[],
+    ):
+        qwen = self.qwen
        history = copy.deepcopy(history)
-
        raw_text, context_tokens = make_context(tokenizer, query, query_assistant, history=history, system=system)
-        input_ids = torch.tensor([context_tokens]).to(next(self.parameters()).device)
-        outputs = self.generate(
-            input_ids,
-            tokenizer=tokenizer,
-            **kwargs,
-        )
-        decoded, response, end_reason = decode_tokens(
-            outputs[0],
-            tokenizer,
-            raw_text_len=len(raw_text),
-            context_length=len(context_tokens),
-            errors="replace",
-        )
-        history.append((query, response))
-        return response, history, decoded
+        input_ids = torch.tensor([context_tokens]).to(next(qwen.parameters()).device)
+        eos_token_id_tensor = torch.tensor([qwen.config.eos_token_id]).to(input_ids.device)
+        pad_token_id = qwen.config.pad_token_id

-    def generate(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        tokenizer=None,
-    ) -> Union[GenerateOutput, torch.LongTensor]:
-        pad_token_id = self.config.pad_token_id
-        eos_token_id_tensor = torch.tensor([self.config.eos_token_id]).to(input_ids.device)
-
-        # keep track of which sequences are already finished
        unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
-
        this_peer_finished = False
-        # auto-regressive generation
        while True:
-            # forward pass to get next token
-            outputs = forwardQWen(self, input_ids)
+            outputs = self.forwardQWen(input_ids)
            next_token_scores = outputs[:, -1, :]

            # repetition_penalty
-            penalty = self.config.repetition_penalty
+            penalty = qwen.config.repetition_penalty
            score = torch.gather(next_token_scores, 1, input_ids)
            # if score < 0 then repetition penalty has to be multiplied to reduce the token probabilities
            score = torch.where(score < 0, score * penalty, score / penalty)
            next_token_scores = next_token_scores.scatter_(1, input_ids, score)

            # top_p
-            top_p = self.config.top_p
+            top_p = qwen.config.top_p
            filter_value = -float("Inf")
            min_tokens_to_keep = 1
            sorted_logits, sorted_indices = torch.sort(next_token_scores, descending=False)
@ -262,33 +235,29 @@ class QWenLMHeadModel(nn.Module):
            next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)

            next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
-
-            # update generated ids, model inputs, and length for next step
            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
-
            unfinished_sequences = unfinished_sequences.mul(
                next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
            )

-            # decoded, response, end_reason = decode_tokens(
-            #     next_tokens,
-            #     tokenizer,
-            #     raw_text_len=0,
-            #     context_length=0,
-            #     errors="replace",
-            # )
-            # print(decoded)
-
-            # stop when each sentence is finished
            if unfinished_sequences.max() == 0:
                this_peer_finished = True

            if this_peer_finished:
                break
-        return input_ids

+        decoded, response, end_reason = decode_tokens(
+            input_ids[0],
+            tokenizer,
+            raw_text_len=len(raw_text),
+            context_length=len(context_tokens),
+            errors="replace",
+        )
+        history.append((query, response))
+        return response, history, decoded

    def forwardAttention(
+        self,
        attention,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        rotary_pos_emb_list: Optional[List[List[torch.Tensor]]] = None,
@ -340,15 +309,15 @@ def forwardAttention(

        return attn_output

-
    def forwardQWenBlock(
+        self,
        block,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        rotary_pos_emb_list: Optional[List[List[torch.Tensor]]] = None,
    ):
        layernorm_output = block.ln_1(hidden_states)

-    attn_outputs = forwardAttention(block.attn, layernorm_output, rotary_pos_emb_list)
+        attn_outputs = self.forwardAttention(block.attn, layernorm_output, rotary_pos_emb_list)
        attn_output = attn_outputs[0]
        layernorm_input = attn_output + hidden_states

@ -361,13 +330,12 @@ def forwardQWenBlock(
        hidden_states = layernorm_input + mlp_output
        return hidden_states

-
    def forwardQWen(
-    qwen,
+        self,
        input_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
    ):
-    transfm = qwen.transformer
+        transfm = self.qwen.transformer
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        hidden_states = transfm.wte(input_ids)
@ -381,12 +349,12 @@ def forwardQWen(
        output_shape = input_shape + (hidden_states.size(-1),)

        for block in transfm.h:
-        hidden_states = forwardQWenBlock(block, hidden_states, rotary_pos_emb_list=rotary_pos_emb_list)
+            hidden_states = self.forwardQWenBlock(block, hidden_states, rotary_pos_emb_list=rotary_pos_emb_list)

        hidden_states = transfm.ln_f(hidden_states)
        hidden_states = hidden_states.view(output_shape)

-    lm_logits = qwen.lm_head(hidden_states)
+        lm_logits = self.qwen.lm_head(hidden_states)

        loss = None
        if labels is not None:
--- a/qwen/train.py
+++ b/qwen/train.py
--- a/test/abc.py
+++ b/test/abc.py
@ -0,0 +1,29 @@
+
+from abc import ABC, abstractmethod
+
+class People(ABC):
+    """Abstract base whose ``auto`` runs ``walk`` and then ``eat``.
+
+    ``eat`` is abstract and has to be overridden before a subclass can be
+    instantiated; ``walk`` is a regular method that does nothing by default.
+    """
+
+    def walk(self):
+        """Do nothing; not marked abstract, so overriding is optional."""
+        return None
+
+    @abstractmethod
+    def eat(self):
+        """Abstract hook; concrete subclasses must implement it."""
+        return None
+
+    def auto(self):
+        """Call ``walk`` followed by ``eat``."""
+        self.walk()
+        self.eat()
+
+class kid1(People):
+    """Concrete ``People`` whose ``walk`` and ``eat`` each print one line."""
+
+    def __init__(self):
+        """No state to set up."""
+        return None
+
+    def walk(self):
+        """Print the walking message."""
+        print("走路")
+
+    def eat(self):
+        """Print the eating message."""
+        print("吃饭")
+
+# Demo: build a kid1 and run walk() then eat() through People.auto().
+if __name__ == '__main__':
+    k = kid1()
+    k.auto()