diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000..0760be7
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,23 @@
+
+
+
+## data flow
+
+input_ids = tokenizer.build_chat_input(query, history=history, role=role)
+
+input_ids -> [1, 6]
+inputs_embeds -> [6, 1, 4096]  4096: hidden_size
+rotary_pos_emb -> [6, 1, 32, 2]  32: rotary encoding dims per position, 2: cos+sin
+
+hidden_states = inputs_embeds
+for layer in layers: hidden_states = GLMBlock(hidden_states, rotary_pos_emb)
+hidden_states = self.final_layernorm(hidden_states)
+hidden_states = hidden_states[-1:]
+lm_logits = self.output_layer(hidden_states)
+lm_logits = lm_logits.transpose(0, 1).contiguous() -> [1, 1, 65024]
+
+probs = softmax(lm_logits) -> [1, 65024]
+next_tokens = torch.multinomial(probs, num_samples=1)  (sampling) -> [1]
+input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) -> [1, 7]
+
+response = tokenizer.decode(outputs)
\ No newline at end of file
diff --git a/chatglm/modeling_chatglm.py b/chatglm/modeling_chatglm.py
index 1fdd859..4264e92 100644
--- a/chatglm/modeling_chatglm.py
+++ b/chatglm/modeling_chatglm.py
@@ -170,7 +170,7 @@ class SelfAttention(torch.nn.Module):
         x_out2 = x_out2.flatten(3)
         return torch.cat((x_out2, x_pass), dim=-1)
 
-    def forward(self, hidden_states, rotary_pos_emb, kv_cache=None):
+    def forward(self, hidden_states, rotary_pos_emb):
         # hidden_states: [sq, b, h]
         # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
         mixed_x_layer = self.query_key_value(hidden_states)
@@ -213,8 +213,6 @@ class SelfAttention(torch.nn.Module):
         query_layer = self.apply_rotary_pos_emb(query_layer, rotary_pos_emb)
         key_layer = self.apply_rotary_pos_emb(key_layer, rotary_pos_emb)
 
-        kv_cache = (key_layer, value_layer)
-
         key_layer = key_layer.unsqueeze(-2)
         key_layer = key_layer.expand(
             -1,
@@ -255,7 +253,7 @@ class SelfAttention(torch.nn.Module):
         # Output. [sq, b, h]
         # =================
         output = self.dense(context_layer)
-        return output, kv_cache
+        return output
 
 
 class MLP(torch.nn.Module):
@@ -342,14 +340,12 @@ class GLMBlock(torch.nn.Module):
         # MLP
         self.mlp = MLP(config, device=device)
 
-    def forward(self, hidden_states, rotary_pos_emb, kv_cache=None):
+    def forward(self, hidden_states, rotary_pos_emb):
         # hidden_states: [s, b, h]
         # Layer norm at the beginning of the transformer layer.
         layernorm_output = self.input_layernorm(hidden_states)
         # Self attention.
-        attention_output, kv_cache = self.self_attention(
-            layernorm_output, rotary_pos_emb, kv_cache=kv_cache
-        )
+        attention_output = self.self_attention(layernorm_output, rotary_pos_emb)
         residual = hidden_states
 
         layernorm_input = torch.nn.functional.dropout(
@@ -369,7 +365,7 @@ class GLMBlock(torch.nn.Module):
             mlp_output, p=self.hidden_dropout, training=self.training
         )
         output = residual + output
-        return output, kv_cache
+        return output
 
 
 class GLMTransformer(torch.nn.Module):
@@ -389,18 +385,10 @@ class GLMTransformer(torch.nn.Module):
             dtype=config.torch_dtype,
         )
 
-    def forward(
-        self,
-        hidden_states,
-        rotary_pos_emb
-    ):
-        kv_caches = [None for _ in range(self.num_layers)]
-
+    def forward(self, hidden_states, rotary_pos_emb):
         for index in range(self.num_layers):
             layer = self.layers[index]
-            hidden_states, kv_cache = layer(
-                hidden_states, rotary_pos_emb, kv_cache=kv_caches[index]
-            )
+            hidden_states = layer(hidden_states, rotary_pos_emb)
 
         hidden_states = self.final_layernorm(hidden_states)
         return hidden_states
@@ -469,28 +457,21 @@ class ChatGLMModel(nn.Module):
         input_ids,
         position_ids: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
-        return_last_logit: Optional[bool] = False,
     ):
         output_hidden_states = (
             output_hidden_states
             if output_hidden_states is not None
            else self.config.output_hidden_states
         )
 
-        batch_size, seq_length = input_ids.shape
         inputs_embeds = self.embedding(input_ids)
-
         # Rotary positional embeddings
         rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
         # show.DumpTensorToImage(rotary_pos_emb[:, :, 0], "rotary_pos_emb.png", scale=0.1)
         rotary_pos_emb = rotary_pos_emb[position_ids]
         rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
 
-        hidden_states = self.encoder(
-            inputs_embeds,
-            rotary_pos_emb=rotary_pos_emb
-        )
-        if return_last_logit:
-            hidden_states = hidden_states[-1:]
+        hidden_states = self.encoder(inputs_embeds, rotary_pos_emb)
+        hidden_states = hidden_states[-1:]
         lm_logits = self.output_layer(hidden_states)
         lm_logits = lm_logits.transpose(0, 1).contiguous()
@@ -676,7 +657,7 @@ class ChatGLMForConditionalGeneration(nn.Module):
             input_ids,
             pad_token_id=generation_config.pad_token_id,
             eos_token_id=generation_config.eos_token_id,
-            output_hidden_states=generation_config.output_hidden_states
+            output_hidden_states=generation_config.output_hidden_states,
         )
         outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) : -1]
 
@@ -689,7 +670,7 @@ class ChatGLMForConditionalGeneration(nn.Module):
         input_ids: torch.LongTensor,
         pad_token_id: Optional[int] = None,
         eos_token_id: Optional[Union[int, List[int]]] = None,
-        output_hidden_states: Optional[bool] = None
+        output_hidden_states: Optional[bool] = None,
     ):
         if isinstance(eos_token_id, int):
             eos_token_id = [eos_token_id]
@@ -708,11 +689,7 @@ class ChatGLMForConditionalGeneration(nn.Module):
                 .unsqueeze(0)
                 .repeat(batch_size, 1)
             )
-            model_inputs = {
-                "input_ids": input_ids_in,
-                "position_ids": position_ids_in,
-                "return_last_logit": True
-            }
+            model_inputs = {"input_ids": input_ids_in, "position_ids": position_ids_in}
 
             logits = self.transformer(
                 **model_inputs,
@@ -723,24 +700,20 @@ class ChatGLMForConditionalGeneration(nn.Module):
             next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
 
             # finished sentences should have their next token be a padding token
-            if eos_token_id is not None:
-                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (
-                    1 - unfinished_sequences
-                )
+            next_tokens = next_tokens * unfinished_sequences + pad_token_id * (
+                1 - unfinished_sequences
+            )
 
             # update generated ids, model inputs, and length for next step
             input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
 
             # if eos_token was found in one sentence, set sentence to finished
-            if eos_token_id_tensor is not None:
-                unfinished_sequences = unfinished_sequences.mul(
-                    next_tokens.tile(eos_token_id_tensor.shape[0], 1)
-                    .ne(eos_token_id_tensor.unsqueeze(1))
-                    .prod(dim=0)
-                )
-                if unfinished_sequences.max() == 0:
-                    this_peer_finished = True
-                if this_peer_finished:
+            unfinished_sequences = unfinished_sequences.mul(
+                next_tokens.tile(eos_token_id_tensor.shape[0], 1)
+                .ne(eos_token_id_tensor.unsqueeze(1))
+                .prod(dim=0)
+            )
+            if unfinished_sequences.max() == 0:
                 break
 
         return input_ids
diff --git a/demo.py b/demo.py
index f1e9985..5703b70 100644
--- a/demo.py
+++ b/demo.py
@@ -25,7 +25,7 @@ if tokenizer_config_file is not None:
     init_kwargs.pop("tokenizer_file", None)
     saved_init_inputs = init_kwargs.pop("init_inputs", ())
     init_inputs = saved_init_inputs
-init_kwargs["vocab_file"] = './chatglm/tokenizer.model'
+init_kwargs["vocab_file"] = "./chatglm/tokenizer.model"
 init_kwargs["added_tokens_file"] = None
 init_kwargs["special_tokens_map_file"] = None
 init_kwargs["tokenizer_file"] = None
@@ -35,9 +35,11 @@ tokenizer = ChatGLMTokenizer(*init_inputs, **init_kwargs)
 glm = glm.from_pretrained(pretrained_model_name_or_path, config=config).half().cuda()
 glm = glm.eval()
 
-response, history = glm.chat(tokenizer, "colin", history=[])
+query = "colin"
+response, history = glm.chat(tokenizer, query, history=[])
 print(response)
-response, history = glm.chat(tokenizer, "你好", history=history)
+query = "你好"
+response, history = glm.chat(tokenizer, query, history=history)
 print(response)
 # response, history = glm.chat(tokenizer, "你是一个心理学专家,请问晚上睡不着应该怎么办", history=history)
 # print(response)
@@ -50,7 +52,6 @@ print(response)
 # px.scatter(gapminder2007, x='gdpPercap', y='lifeExp')
 
 
-
 # from modelscope import AutoTokenizer, AutoModel, snapshot_download
 # model_dir = snapshot_download("ZhipuAI/chatglm3-6b", cache_dir="./chatglm", revision="v1.0.0")
 # model = AutoModel.from_pretrained(model_dir, trust_remote_code=True).half().cuda()
diff --git a/embedding.py b/embedding.py
new file mode 100644
index 0000000..91f761b
--- /dev/null
+++ b/embedding.py
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+# Vocabulary size and embedding dimension
+vocab_size = 10000
+embedding_dim = 16
+
+# Define an Embedding layer
+embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
+
+# Define an input tensor of shape (batch_size, sequence_length)
+input_tensor = torch.LongTensor([[1, 2], [4, 3]])
+
+# Pass the input tensor through the Embedding layer
+embedded_tensor = embedding(input_tensor)
+
+
+print("embedded weight shape:")
+print(embedding.weight.shape)
+print("embedded weight:")
+print(embedding.weight)
+
+
+# The output shape is (batch_size, sequence_length, embedding_dim)
+print("embedded out shape:")
+print(embedded_tensor.shape)
+print("embedded out:")
+print(embedded_tensor)
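
Note on embedding.py: `nn.Embedding` is just a learned lookup table, so indexing its weight matrix directly returns the same vectors as calling the layer. A quick check with the same shapes as the script above:

```python
import torch
import torch.nn as nn

embedding = nn.Embedding(num_embeddings=10000, embedding_dim=16)
input_tensor = torch.LongTensor([[1, 2], [4, 3]])

# The forward pass is a row lookup into the [vocab_size, embedding_dim] weight matrix
lookup = embedding.weight[input_tensor]  # [2, 2, 16]
forward = embedding(input_tensor)        # [2, 2, 16]
print(torch.equal(lookup, forward))      # True
```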
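
Note on the Readme data flow: the final stage (softmax over the last position's logits, multinomial sampling, appending the sampled token) can be exercised in isolation. The sketch below is a minimal stand-in that assumes random `lm_logits` rather than real model output; the shapes mirror the [1, 6] prompt and 65024-token vocabulary from the Readme example.

```python
import torch

vocab_size = 65024                                # vocabulary size from the Readme example
input_ids = torch.randint(0, vocab_size, (1, 6))  # [1, 6], stand-in prompt tokens
lm_logits = torch.randn(1, 1, vocab_size)         # [1, 1, 65024], stand-in last-position logits

# Softmax over the last position's logits -> sampling distribution of shape [1, 65024]
probs = torch.softmax(lm_logits[:, -1, :], dim=-1)

# Draw one token id per sequence -> shape [1]
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)

# Append the sampled token to the running sequence -> [1, 7]
input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
print(input_ids.shape)  # torch.Size([1, 7])
```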
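
The `rotary_pos_emb -> [6, 1, 32, 2]` tensor in the Readme holds a (cos, sin) pair for each of 32 rotary dimensions at every position. Below is a generic rotary-embedding sketch, not the exact ChatGLM layout (ChatGLM rotates only part of each head's channels and passes the rest through as `x_pass`); it only illustrates how a (cos, sin) pair rotates adjacent channel pairs.

```python
import torch

def apply_rope(x, freqs):
    # x: [seq, dim], freqs: [seq, dim // 2]; rotate each adjacent channel pair by a per-position angle
    x_pairs = x.reshape(*x.shape[:-1], -1, 2)      # [seq, dim//2, 2]
    cos, sin = torch.cos(freqs), torch.sin(freqs)  # [seq, dim//2] each (the "cos+sin" pair)
    x0, x1 = x_pairs[..., 0], x_pairs[..., 1]
    rotated = torch.stack((x0 * cos - x1 * sin, x0 * sin + x1 * cos), dim=-1)
    return rotated.flatten(-2)                     # back to [seq, dim]

seq, dim = 6, 64
positions = torch.arange(seq, dtype=torch.float32)
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))  # [dim//2]
freqs = positions[:, None] * inv_freq[None, :]     # [6, 32], one angle per (position, channel pair)
x = torch.randn(seq, dim)
print(apply_rope(x, freqs).shape)                  # torch.Size([6, 64])
```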