From f96bcc799c36219c76a3475e71f186845a53f61a Mon Sep 17 00:00:00 2001
From: Colin
Date: Fri, 19 Jan 2024 14:54:48 +0800
Subject: [PATCH] Refine model of qwen for long sequence in eval.

---
 qwen/demo.py          | 17 +++++++----------
 qwen/modeling_qwen.py | 29 +++++++++--------------------
 2 files changed, 16 insertions(+), 30 deletions(-)

diff --git a/qwen/demo.py b/qwen/demo.py
index f5da548..959bae0 100644
--- a/qwen/demo.py
+++ b/qwen/demo.py
@@ -52,11 +52,10 @@ print(model)
 
 tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
 
-model = model.from_pretrained(
-    model_dir, config=config, device_map="auto", trust_remote_code=True
-).train()
-# model.train()
-# model.zero_grad()
+model = model.from_pretrained(model_dir, config=config, device_map="auto", trust_remote_code=True)
+
+# model = model.eval()
+model = model.train() # control by @torch.no_grad()
 
 # 可指定不同的生成长度、top_p等相关超参
 # model.generation_config = GenerationConfig.from_pretrained(
@@ -74,16 +73,14 @@ print(decode_tokens)
 # 日本的首都东京。<|im_end|><|endoftext|>
-
-
 
 
 # # 第一轮对话
 # response, history, decode_tokens = model.chat(tokenizer, "你好", "", history=None)
 # print(decode_tokens)
 # # 你好！很高兴为你提供帮助。
 
 # 第二轮对话
-# response, history = model.chat(tokenizer, "给我讲一个年轻人奋斗创业最终取得成功的故事。", history=None)
-# print(response)
+response, history, decode_tokens = model.chat(tokenizer, "给我讲一个年轻人奋斗创业最终取得成功的故事。", "", history=None)
+print(response)
 
 
 # <|im_start|>system
@@ -93,4 +90,4 @@ print(decode_tokens)
 # <|im_start|>assistant
 # 莎士比亚是头一个使用“你好”这个词的文学家，他在《哈姆雷特》中写道：“你是谁？你在哪儿？
 # ”他的这一段话，通常被认为是最早的使用“你好”这个词的文学记载。这句话在英国语中非常常见，
-# 特别是在正式或礼貌的情况下。<|im_end|><|endoftext|>
\ No newline at end of file
+# 特别是在正式或礼貌的情况下。<|im_end|><|endoftext|>
diff --git a/qwen/modeling_qwen.py b/qwen/modeling_qwen.py
index 44f8806..6c75d38 100644
--- a/qwen/modeling_qwen.py
+++ b/qwen/modeling_qwen.py
@@ -41,8 +41,10 @@ import sys
 
 sys.path.append("..")
 from tools import show
+from tools import mem_tracker
 
-logger = logging.get_logger(__name__)
+# tracker = mem_tracker.MemTracker()
+# tracker.track()
 
 
 class QWenAttention(nn.Module):
@@ -110,8 +112,6 @@ class QWenAttention(nn.Module):
         query = apply_rotary_pos_emb(query, q_pos_emb)
         key = apply_rotary_pos_emb(key, k_pos_emb)
 
-        present = (key, value)
-
         key_size = key.size(1)
         if key_size > self.seq_length and not self.training:
             seq_start = key.size(1) - query.size(1)
@@ -148,8 +148,8 @@ class QWenAttention(nn.Module):
         attn_output = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask).transpose(1, 2)
         context_layer = self._merge_heads(attn_output, self.num_heads, self.head_dim)
         attn_output = self.c_proj(context_layer)
-        outputs = (attn_output, present)
-        return outputs
+
+        return attn_output
 
 
 class QWenMLP(nn.Module):
@@ -199,7 +199,6 @@ class QWenBlock(nn.Module):
             attention_mask=attention_mask,
         )
         attn_output = attn_outputs[0]
-        outputs = attn_outputs[1:]
 
         residual = hidden_states
         layernorm_input = attn_output + residual
@@ -207,8 +206,7 @@ class QWenBlock(nn.Module):
         residual = layernorm_input
         mlp_output = self.mlp(layernorm_output)
         hidden_states = residual + mlp_output
-        outputs = (hidden_states,) + outputs
-        return outputs
+        return hidden_states
 
 
 class QWenPreTrainedModel(PreTrainedModel):
@@ -312,16 +310,13 @@ class QWenModel(QWenPreTrainedModel):
         hidden_states = self.drop(hidden_states)
         output_shape = input_shape + (hidden_states.size(-1),)
 
-        presents = ()
         all_hidden_states = None
-        for i, block in enumerate(self.h):
-            outputs = block(
+        for block in self.h:
+            hidden_states = block(
                 hidden_states,
                 rotary_pos_emb_list=rotary_pos_emb_list,
                 attention_mask=attention_mask,
            )
-            hidden_states = outputs[0]
-            presents = presents + (outputs[1],)
 
         hidden_states = self.ln_f(hidden_states)
         hidden_states = hidden_states.view(output_shape)
@@ -392,6 +387,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             attentions=transformer_outputs.attentions,
         )
 
+    @torch.no_grad()
    def chat(
        self,
        tokenizer: PreTrainedTokenizer,
@@ -454,15 +450,9 @@ class QWenLMHeadModel(QWenPreTrainedModel):
 
         # 2. Set generation parameters if not already defined
         if generation_config.pad_token_id is None and generation_config.eos_token_id is not None:
-            if model_kwargs.get("attention_mask", None) is None:
-                logger.warning(
-                    "The attention mask and the pad token id were not set. As a consequence, you may observe "
-                    "unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results."
-                )
             eos_token_id = generation_config.eos_token_id
             if isinstance(eos_token_id, list):
                 eos_token_id = eos_token_id[0]
-            logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.")
             generation_config.pad_token_id = eos_token_id
 
         # 3. Define model inputs
@@ -571,7 +561,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
 
             if this_peer_finished:
                 break
-
         return input_ids
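A minimal usage sketch of the eval path this patch refines, not taken verbatim from the repo: it follows qwen/demo.py above but loads the model through AutoModelForCausalLM so the snippet is self-contained. The checkpoint directory is a placeholder, and chat() is assumed to be this repo's variant that takes a system string and returns (response, history, decode_tokens). The module can stay in train() mode because the @torch.no_grad() added to chat() keeps generation gradient-free.

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Placeholder path: a local checkpoint directory that ships the patched modeling_qwen.py.
model_dir = "./qwen-7b-chat"

config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_dir, config=config, device_map="auto", trust_remote_code=True
)

# As in demo.py: leave the module in train() mode; gradients are suppressed by
# the @torch.no_grad() decorator that this patch adds to chat().
model = model.train()

# chat() in this repo returns (response, history, decode_tokens).
response, history, decode_tokens = model.chat(
    tokenizer, "给我讲一个年轻人奋斗创业最终取得成功的故事。", "", history=None
)
print(response)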