From fa7078b72de27d7f2f416ef69f53a7c8660b6814 Mon Sep 17 00:00:00 2001
From: Colin
Date: Mon, 25 Dec 2023 16:22:45 +0800
Subject: [PATCH] Update code.

---
 Readme.md                   | 63 ++++++++++++++++++++++++++++++++++++-
 chatglm/modeling_chatglm.py | 33 ++++++++-----------
 demo.py                     | 12 ++++++-
 tensor.py                   | 13 ++++++++
 4 files changed, 99 insertions(+), 22 deletions(-)

diff --git a/Readme.md b/Readme.md
index ecc1f8b..7223c67 100644
--- a/Readme.md
+++ b/Readme.md
@@ -32,4 +32,65 @@ hidden_states -> [6, 1, 4096] 4096:hidden_size
 variance = hidden_states.pow(2).mean(-1, keepdim=True) -> [6, 1, 1]
 hidden_states = hidden_states * torch.rsqrt(variance + self.eps) reciprocal square root
 self.weight -> [4096]
-return (self.weight * hidden_states) -> [6, 1, 4096]
\ No newline at end of file
+return (self.weight * hidden_states) -> [6, 1, 4096]
+
+## MLP
+
+Linear(hidden_states) no bias -> [6, 1, 27392]
+swiglu: chunk into 2 x [6, 1, 13696]; silu(x0) * x1 -> [6, 1, 13696], where silu(x) = x * sigmoid(x)
+Linear(intermediate_parallel) no bias -> [6, 1, 4096]
+
+## core_attention
+
+query_layer=query_layer.permute(1, 2, 0, 3) -> [1, 32, 6, 128]
+key_layer=key_layer.permute(1, 2, 0, 3) -> [1, 32, 6, 128]
+value_layer=value_layer.permute(1, 2, 0, 3) -> [1, 32, 6, 128]
+context_layer = scaled_dot_product_attention(query_layer, key_layer, value_layer) -> [1, 32, 6, 128]
+    softmax(QK^T / sqrt(d_k))V
+    att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+    att = F.softmax(att, dim=-1)
+    y = att @ v -> (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+context_layer = context_layer.permute(2, 0, 1, 3)
+context_layer = context_layer.reshape() -> [6, 1, 4096]
+
+## self_attention
+
+hidden_states: [s, b, h]
+mixed_x_layer = Linear(hidden_states) -> [6, 1, 4608] 4608:4096+256+256
+
+(query_layer, key_layer, value_layer) = mixed_x_layer.split -> [6, 1, 4096], [6, 1, 256], [6, 1, 256]
+query_layer = query_layer.view -> [6, 1, 32, 128]
+key_layer = key_layer.view -> [6, 1, 2, 128]
+value_layer = value_layer.view -> [6, 1, 2, 128]
+
+query_layer = self.apply_rotary_pos_emb(query_layer, rotary_pos_emb)
+key_layer = self.apply_rotary_pos_emb(key_layer, rotary_pos_emb)
+
+key_layer = key_layer.unsqueeze(-2) -> [6, 1, 2, 1, 128]
+key_layer = key_layer.expand -> [6, 1, 2, 16, 128]
+key_layer = key_layer.contiguous().view -> [6, 1, 32, 128]
+
+value_layer = value_layer.unsqueeze(-2) -> [6, 1, 2, 1, 128]
+value_layer = value_layer.expand -> [6, 1, 2, 16, 128]
+value_layer = value_layer.contiguous().view -> [6, 1, 32, 128]
+
+context_layer = self.core_attention(query_layer, key_layer, value_layer) -> [6, 1, 4096]
+return Linear(context_layer) -> [6, 1, 4096]
+
+## GLMBlock
+
+     input
+       |  \
+       |   RMSNorm
+       |   self_attention
+       |   dropout
+       |  /
+      Add
+       |  \
+       |   RMSNorm
+       |   mlp
+       |   dropout
+       |  /
+      Add
+
+All output shapes are [6, 1, 4096]; 6: sequence_length, 1: batch_num, 4096: hidden_size
\ No newline at end of file
diff --git a/chatglm/modeling_chatglm.py b/chatglm/modeling_chatglm.py
index 8e60b6c..c28edd1 100644
--- a/chatglm/modeling_chatglm.py
+++ b/chatglm/modeling_chatglm.py
@@ -4,6 +4,7 @@ import copy
 import os
 import gc
 import json
+import hashlib
 
 import torch
 import torch.utils.checkpoint
@@ -148,28 +149,20 @@ class SelfAttention(torch.nn.Module):
             dtype=config.torch_dtype,
         )
 
-    def apply_rotary_pos_emb(
-        self, x: torch.Tensor, rope_cache: torch.Tensor
-    ) -> torch.Tensor:
+    def apply_rotary_pos_emb(self, x: torch.Tensor, rope: torch.Tensor) -> torch.Tensor:
         # x: [sq, b, np, hn]
         sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3)
-        rot_dim = rope_cache.shape[-2] * 2
-        x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
-        # truncate to support variable sizes
-        rope_cache = rope_cache[:sq]
-        xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2)
-        rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
-        x_out2 = torch.stack(
-            [
-                xshaped[..., 0] * rope_cache[..., 0]
-                - xshaped[..., 1] * rope_cache[..., 1],
-                xshaped[..., 1] * rope_cache[..., 0]
-                + xshaped[..., 0] * rope_cache[..., 1],
-            ],
-            -1,
-        )
-        x_out2 = x_out2.flatten(3)
-        return torch.cat((x_out2, x_pass), dim=-1)
+        if rope.size(0) != sq:
+            raise ValueError("Error rotary_pos_emb size")
+        x_rope = x[..., : hn // 2]
+        x_pass = x[..., hn // 2 :]
+        x_rope = x_rope.reshape(sq, -1, np, hn // 4, 1, 2)
+        rope = rope.view(sq, -1, 1, hn // 4, 1, 2)
+        roped1 = x_rope[..., 0] * rope[..., 0] - x_rope[..., 1] * rope[..., 1]
+        roped2 = x_rope[..., 1] * rope[..., 0] + x_rope[..., 0] * rope[..., 1]
+        x_out = torch.cat((roped1, roped2), -1)
+        x_out = x_out.flatten(3)
+        return torch.cat((x_out, x_pass), dim=-1)
 
     def forward(self, hidden_states, rotary_pos_emb):
         # hidden_states: [sq, b, h]
diff --git a/demo.py b/demo.py
index 5703b70..ad1215c 100644
--- a/demo.py
+++ b/demo.py
@@ -1,11 +1,15 @@
 import json
-
+import torch
 from chatglm import ChatGLMForConditionalGeneration
 from chatglm import ChatGLMTokenizer
 from transformers import AutoConfig
 
+seed = 1234
+torch.manual_seed(seed)
+torch.cuda.manual_seed_all(seed)
+
 pretrained_model_name_or_path = "../ZhipuAI/chatglm3-6b"
 config, kwargs = AutoConfig.from_pretrained(
     pretrained_model_name_or_path,
@@ -38,9 +42,15 @@ glm = glm.eval()
 query = "colin"
 response, history = glm.chat(tokenizer, query, history=[])
 print(response)
+if response[1:] != " Hello! How can I assist you today":
+    raise ValueError("unexpected response: " + response)
+
 query = "你好"
 response, history = glm.chat(tokenizer, query, history=history)
 print(response)
+if response[1:] != " 你好!有什么我可以帮助你的吗":
+    raise ValueError("unexpected response: " + response)
+
 # response, history = glm.chat(tokenizer, "你是一个心理学专家,请问晚上睡不着应该怎么办", history=history)
 # print(response)
 
diff --git a/tensor.py b/tensor.py
index 4ef7566..24633d4 100644
--- a/tensor.py
+++ b/tensor.py
@@ -30,7 +30,20 @@ print()
 print(x.unsqueeze(1).shape)
 print(x.unsqueeze(1).squeeze(1).shape)
 
+
 x = torch.tensor([[1, 2], [3, 4]]).to(float)
 print(x.mean(1))
 print(x.mean(0))
 print(x.mean(0, keepdim=True))
+
+print()
+print()
+x = torch.tensor([[1, 2], [3, 4]])
+print(x.flatten(0))
+
+x = torch.tensor([[1, 2], [3, 4]])
+print(torch.stack((x, x), 1))
+print(torch.cat((x, x), 1))
+# So if A and B are of shape (3, 4):
+# torch.cat([A, B], dim=0) will be of shape (6, 4)
+# torch.stack([A, B], dim=0) will be of shape (2, 3, 4)
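
The RMSNorm and MLP shape notes in the Readme hunk can be reproduced with a small standalone PyTorch sketch. The 4096/13696 sizes and the bias-free linears come from the notes; the chunk-into-two-halves form of swiglu and the names dense_h_to_4h / dense_4h_to_h are assumptions made here for illustration, not something the patch states.

import torch
import torch.nn.functional as F

hidden_size, ffn_hidden_size = 4096, 13696
hidden_states = torch.randn(6, 1, hidden_size)                     # [s, b, h] = [6, 1, 4096]

# RMSNorm: scale each hidden vector by the reciprocal square root of its mean square
eps = 1e-5
weight = torch.ones(hidden_size)                                   # learned scale, [4096]
variance = hidden_states.pow(2).mean(-1, keepdim=True)             # [6, 1, 1]
normed = weight * (hidden_states * torch.rsqrt(variance + eps))    # [6, 1, 4096]

# MLP with swiglu: up-project, split into value/gate halves, gate with silu, project back
dense_h_to_4h = torch.nn.Linear(hidden_size, 2 * ffn_hidden_size, bias=False)
dense_4h_to_h = torch.nn.Linear(ffn_hidden_size, hidden_size, bias=False)
up = dense_h_to_4h(normed)                                         # [6, 1, 27392]
x0, x1 = torch.chunk(up, 2, dim=-1)                                # 2 x [6, 1, 13696]
intermediate_parallel = F.silu(x0) * x1                            # silu(x) = x * sigmoid(x)
print(dense_4h_to_h(intermediate_parallel).shape)                  # [6, 1, 4096]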
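
The rewritten apply_rotary_pos_emb in chatglm/modeling_chatglm.py can be exercised on its own. Below is the same logic as a free function with descriptive comments; the random rotary_pos_emb tensor is only a stand-in with a plausible shape ([sq, b, hn // 4, 2] of (cos, sin) pairs is assumed here), the real cache comes from the model's rotary embedding.

import torch

def apply_rotary_pos_emb(x: torch.Tensor, rope: torch.Tensor) -> torch.Tensor:
    # x: [sq, b, np, hn]
    sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3)
    if rope.size(0) != sq:
        raise ValueError("Error rotary_pos_emb size")
    # only the first half of each head dimension is rotated, the rest passes through
    x_rope = x[..., : hn // 2]
    x_pass = x[..., hn // 2 :]
    # group the rotated half into consecutive pairs: [sq, b, np, hn // 4, 1, 2]
    x_rope = x_rope.reshape(sq, -1, np, hn // 4, 1, 2)
    rope = rope.view(sq, -1, 1, hn // 4, 1, 2)
    # rotate each pair by (cos, sin): (a*cos - b*sin, b*cos + a*sin)
    roped1 = x_rope[..., 0] * rope[..., 0] - x_rope[..., 1] * rope[..., 1]
    roped2 = x_rope[..., 1] * rope[..., 0] + x_rope[..., 0] * rope[..., 1]
    x_out = torch.cat((roped1, roped2), -1).flatten(3)
    return torch.cat((x_out, x_pass), dim=-1)

query_layer = torch.randn(6, 1, 32, 128)     # [sq, b, np, hn]
rotary_pos_emb = torch.randn(6, 1, 32, 2)    # stand-in rope cache
print(apply_rotary_pos_emb(query_layer, rotary_pos_emb).shape)   # [6, 1, 32, 128]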
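
The self_attention and core_attention shape walk-through (2 key/value heads expanded to match 32 query heads, then scaled dot-product attention) can be checked the same way. This is a sketch with random tensors; it assumes PyTorch 2.x for F.scaled_dot_product_attention and leaves out the QKV projection, rotary embedding, KV cache and causal mask.

import torch
import torch.nn.functional as F

sq, b, n_head, n_kv_head, head_dim = 6, 1, 32, 2, 128

query_layer = torch.randn(sq, b, n_head, head_dim)       # [6, 1, 32, 128]
key_layer = torch.randn(sq, b, n_kv_head, head_dim)      # [6, 1, 2, 128]
value_layer = torch.randn(sq, b, n_kv_head, head_dim)    # [6, 1, 2, 128]

# grouped-query attention: repeat each of the 2 KV heads 16 times so K/V match the 32 Q heads
key_layer = key_layer.unsqueeze(-2)                                    # [6, 1, 2, 1, 128]
key_layer = key_layer.expand(-1, -1, -1, n_head // n_kv_head, -1)      # [6, 1, 2, 16, 128]
key_layer = key_layer.contiguous().view(sq, b, n_head, head_dim)       # [6, 1, 32, 128]
value_layer = value_layer.unsqueeze(-2).expand(-1, -1, -1, n_head // n_kv_head, -1)
value_layer = value_layer.contiguous().view(sq, b, n_head, head_dim)   # [6, 1, 32, 128]

# core_attention: [sq, b, nh, hd] -> [b, nh, sq, hd], then softmax(QK^T / sqrt(d_k)) V
q = query_layer.permute(1, 2, 0, 3)                                    # [1, 32, 6, 128]
k = key_layer.permute(1, 2, 0, 3)
v = value_layer.permute(1, 2, 0, 3)
context_layer = F.scaled_dot_product_attention(q, k, v)                # [1, 32, 6, 128]

# back to [sq, b, h] for the output projection
context_layer = context_layer.permute(2, 0, 1, 3).reshape(sq, b, n_head * head_dim)
print(context_layer.shape)                                             # [6, 1, 4096]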
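
Finally, the GLMBlock diagram reduces to two pre-norm residual branches. This is a structural sketch only, with the sub-layers stubbed out by identity functions; taking the residual from the un-normed input and the dropout placement follow the diagram, while the real block configures these through the model config.

import torch
import torch.nn.functional as F

def glm_block(x, input_norm, self_attention, post_norm, mlp, dropout_p=0.0):
    # Add(input, dropout(self_attention(RMSNorm(input))))
    x = x + F.dropout(self_attention(input_norm(x)), p=dropout_p)
    # Add(x, dropout(mlp(RMSNorm(x))))
    return x + F.dropout(mlp(post_norm(x)), p=dropout_p)

identity = lambda t: t
hidden_states = torch.randn(6, 1, 4096)
print(glm_block(hidden_states, identity, identity, identity, identity).shape)  # [6, 1, 4096]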