diff --git a/chatglm/configuration_chatglm.py b/chatglm/configuration_chatglm.py
index 3560018..4b518ae 100644
--- a/chatglm/configuration_chatglm.py
+++ b/chatglm/configuration_chatglm.py
@@ -3,6 +3,7 @@ from transformers import PretrainedConfig
 
 class ChatGLMConfig(PretrainedConfig):
     model_type = "chatglm"
+
     def __init__(
         self,
         num_layers=28,
@@ -58,4 +59,4 @@ class ChatGLMConfig(PretrainedConfig):
         self.quantization_bit = quantization_bit
         self.pre_seq_len = pre_seq_len
         self.prefix_projection = prefix_projection
-        super().__init__(**kwargs)
\ No newline at end of file
+        super().__init__(**kwargs)
diff --git a/chatglm/modeling_chatglm.py b/chatglm/modeling_chatglm.py
index e83eea2..dc846b0 100644
--- a/chatglm/modeling_chatglm.py
+++ b/chatglm/modeling_chatglm.py
@@ -26,9 +26,7 @@ from tools import show
 class RotaryEmbedding(nn.Module):
     def __init__(self, dim: int, original_impl=False, device=None, dtype=None):
         super().__init__()
-        inv_freq = 1.0 / (
-            10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)
-        )
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim))
         self.register_buffer("inv_freq", inv_freq)
         self.dim = dim
         self.original_impl = original_impl
@@ -37,13 +35,7 @@ class RotaryEmbedding(nn.Module):
         dtype = self.inv_freq.dtype
         device = self.inv_freq.device
         # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
-        theta = 1.0 / (
-            base
-            ** (
-                torch.arange(0, self.dim, 2, dtype=torch.float, device=device)
-                / self.dim
-            )
-        )
+        theta = 1.0 / (base ** (torch.arange(0, self.dim, 2, dtype=torch.float, device=device) / self.dim))
         # Create position indexes `[0, 1, ..., max_seq_len - 1]`
         seq_idx = torch.arange(max_seq_len, dtype=torch.float, device=device)
         # Calculate the product of position index and $\theta_i$
@@ -58,9 +50,7 @@
 class RMSNorm(torch.nn.Module):
     def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
         super().__init__()
-        self.weight = torch.nn.Parameter(
-            torch.empty(normalized_shape, device=device, dtype=dtype)
-        )
+        self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype))
         self.eps = eps
 
     def forward(self, hidden_states: torch.Tensor):
@@ -80,9 +70,7 @@ class CoreAttention(torch.nn.Module):
         projection_size = config.kv_channels * config.num_attention_heads
         # Per attention head and per partition values.
         self.hidden_size_per_partition = projection_size
-        self.hidden_size_per_attention_head = (
-            projection_size // config.num_attention_heads
-        )
+        self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
         self.num_attention_heads_per_partition = config.num_attention_heads
 
         coeff = None
@@ -94,17 +82,13 @@ class CoreAttention(torch.nn.Module):
         self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
 
     def forward(self, query_layer, key_layer, value_layer):
-        query_layer, key_layer, value_layer = [
-            k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]
-        ]
+        query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]]
         if query_layer.shape[2] == key_layer.shape[2]:
             context_layer = torch.nn.functional.scaled_dot_product_attention(
                 query_layer, key_layer, value_layer, is_causal=True
             )
         context_layer = context_layer.permute(2, 0, 1, 3)
-        new_context_layer_shape = context_layer.size()[:-2] + (
-            self.hidden_size_per_partition,
-        )
+        new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
         context_layer = context_layer.reshape(*new_context_layer_shape)
         return context_layer
 
@@ -114,16 +98,13 @@ class SelfAttention(torch.nn.Module):
         super(SelfAttention, self).__init__()
         self.layer_number = max(1, layer_number)
         self.projection_size = config.kv_channels * config.num_attention_heads
-        self.hidden_size_per_attention_head = (
-            self.projection_size // config.num_attention_heads
-        )
+        self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads
         self.num_attention_heads_per_partition = config.num_attention_heads
         self.multi_query_attention = config.multi_query_attention
         self.qkv_hidden_size = 3 * self.projection_size
         self.num_multi_query_groups_per_partition = config.multi_query_group_num
         self.qkv_hidden_size = (
-            self.projection_size
-            + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num
+            self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num
         )
         self.query_key_value = nn.Linear(
             config.hidden_size,
@@ -163,12 +144,9 @@ class SelfAttention(torch.nn.Module):
 
             (query_layer, key_layer, value_layer) = mixed_x_layer.split(
                 [
-                    self.num_attention_heads_per_partition
-                    * self.hidden_size_per_attention_head,
-                    self.num_multi_query_groups_per_partition
-                    * self.hidden_size_per_attention_head,
-                    self.num_multi_query_groups_per_partition
-                    * self.hidden_size_per_attention_head,
+                    self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
+                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
+                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
                 ],
                 dim=-1,
             )
@@ -204,8 +182,7 @@ class SelfAttention(torch.nn.Module):
                 -1,
                 -1,
                 -1,
-                self.num_attention_heads_per_partition
-                // self.num_multi_query_groups_per_partition,
+                self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition,
                 -1,
             )
             key_layer = key_layer.contiguous().view(
@@ -220,8 +197,7 @@ class SelfAttention(torch.nn.Module):
                 -1,
                 -1,
                 -1,
-                self.num_attention_heads_per_partition
-                // self.num_multi_query_groups_per_partition,
+                self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition,
                 -1,
             )
             value_layer = value_layer.contiguous().view(
@@ -292,9 +268,7 @@ class GLMBlock(torch.nn.Module):
         super(GLMBlock, self).__init__()
         self.layer_number = layer_number
 
-        self.apply_residual_connection_post_layernorm = (
-            config.apply_residual_connection_post_layernorm
-        )
+        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
 
         self.fp32_residual_connection = config.fp32_residual_connection
 
@@ -326,9 +300,7 @@ class GLMBlock(torch.nn.Module):
         attention_output = self.self_attention(layernorm_output, rotary_pos_emb)
 
         residual = hidden_states
-        layernorm_input = torch.nn.functional.dropout(
-            attention_output, p=self.hidden_dropout, training=self.training
-        )
+        layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training)
         layernorm_input = residual + layernorm_input
 
         # Layer norm post the self attention.
@@ -339,9 +311,7 @@ class GLMBlock(torch.nn.Module):
 
         residual = layernorm_input
 
-        output = torch.nn.functional.dropout(
-            mlp_output, p=self.hidden_dropout, training=self.training
-        )
+        output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training)
         output = residual + output
 
         return output
@@ -409,9 +379,7 @@ class ChatGLMModel(nn.Module):
         # Rotary positional embeddings
         self.seq_length = config.seq_length
         rotary_dim = (
-            config.hidden_size // config.num_attention_heads
-            if config.kv_channels is None
-            else config.kv_channels
+            config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
         )
 
         self.rotary_pos_emb = RotaryEmbedding(
@@ -438,9 +406,7 @@ class ChatGLMModel(nn.Module):
         tokenizer=None,
     ):
         output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.config.output_hidden_states
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
 
         inputs_embeds = self.embedding(input_ids)
@@ -475,23 +441,17 @@ class ChatGLMForConditionalGeneration(nn.Module):
         self.warnings_issued = {}
         self.generation_config = GenerationConfig.from_model_config(config)
 
-    def from_pretrained(
-        cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]]
-    ):
+    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]]):
         load_in_8bit = False
         load_in_4bit = False
         pretrained_model_name_or_path = str(pretrained_model_name_or_path)
-        resolved_archive_file = os.path.join(
-            pretrained_model_name_or_path, "pytorch_model.bin.index.json"
-        )
+        resolved_archive_file = os.path.join(pretrained_model_name_or_path, "pytorch_model.bin.index.json")
         print(f"loading weights file {resolved_archive_file}")
         with open(resolved_archive_file, "r") as f:
             index = json.loads(f.read())
 
         shard_filenames = sorted(set(index["weight_map"].values()))
-        resolved_archive_file = [
-            os.path.join(pretrained_model_name_or_path, f) for f in shard_filenames
-        ]
+        resolved_archive_file = [os.path.join(pretrained_model_name_or_path, f) for f in shard_filenames]
         model = cls._load_pretrained_model(resolved_archive_file)
         model.is_loaded_in_4bit = load_in_4bit
         model.is_loaded_in_8bit = load_in_8bit
@@ -524,21 +484,15 @@ class ChatGLMForConditionalGeneration(nn.Module):
         model_to_load = cls
         error_msgs = []
         if len(resolved_archive_file) > 1:
-            resolved_archive_file = tqdm_lib.tqdm(
-                resolved_archive_file, desc="Loading checkpoint shards"
-            )
+            resolved_archive_file = tqdm_lib.tqdm(resolved_archive_file, desc="Loading checkpoint shards")
 
         for shard_file in resolved_archive_file:
             state_dict = torch.load(shard_file, map_location="cpu")
-            error_msgs += cls._load_state_dict_into_model(
-                model_to_load, state_dict, start_prefix
-            )
+            error_msgs += cls._load_state_dict_into_model(model_to_load, state_dict, start_prefix)
             del state_dict # force memory release
             gc.collect()
 
-        print(
-            f"All model checkpoint weights were used when initializing {cls.__class__.__name__}.\n"
-        )
+        print(f"All model checkpoint weights were used when initializing {cls.__class__.__name__}.\n")
         return cls
 
     @torch.inference_mode()
@@ -556,9 +510,7 @@ class ChatGLMForConditionalGeneration(nn.Module):
         generation_config = copy.deepcopy(self.generation_config)
 
         inputs_tensor = inputs["input_ids"]
-        input_ids = inputs_tensor.repeat_interleave(
-            generation_config.num_return_sequences, dim=0
-        )
+        input_ids = inputs_tensor.repeat_interleave(generation_config.num_return_sequences, dim=0)
 
         outputs = self.sample(
             input_ids,
@@ -585,17 +537,13 @@ class ChatGLMForConditionalGeneration(nn.Module):
             eos_token_id = [eos_token_id]
         eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device)
 
-        isFinished = torch.zeros(
-            input_ids.shape[0], dtype=torch.long, device=input_ids.device
-        )
+        isFinished = torch.zeros(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
         # token_count = 0
         while True:
             input_ids_in = input_ids
             batch_size, seq_length = input_ids_in.shape
             position_ids_in = (
-                torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
-                .unsqueeze(0)
-                .repeat(batch_size, 1)
+                torch.arange(seq_length, dtype=torch.long, device=input_ids.device).unsqueeze(0).repeat(batch_size, 1)
             )
 
             model_inputs = {"input_ids": input_ids_in, "position_ids": position_ids_in}
diff --git a/chatglm/tokenization_chatglm.py b/chatglm/tokenization_chatglm.py
index f62a1f9..d973dce 100644
--- a/chatglm/tokenization_chatglm.py
+++ b/chatglm/tokenization_chatglm.py
@@ -19,8 +19,17 @@ class SPTokenizer:
         self.pad_id: int = self.sp_model.unk_id()
         assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
 
-        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>",
-                          "<|observation|>"]
+        special_tokens = [
+            "[MASK]",
+            "[gMASK]",
+            "[sMASK]",
+            "sop",
+            "eop",
+            "<|system|>",
+            "<|user|>",
+            "<|assistant|>",
+            "<|observation|>",
+        ]
         self.special_tokens = {}
         self.index_special_tokens = {}
         for token in special_tokens:
@@ -59,7 +68,7 @@ class SPTokenizer:
         return text
 
     def convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
        if token in self.special_tokens:
            return self.special_tokens[token]
        return self.sp_model.PieceToId(token)
@@ -86,7 +95,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         self.special_tokens = {
             "<bos>": self.tokenizer.bos_id,
             "<eos>": self.tokenizer.eos_id,
-            "<pad>": self.tokenizer.pad_id
+            "<pad>": self.tokenizer.pad_id,
         }
 
         super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
@@ -121,7 +130,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         return self.tokenizer.n_words
 
     def get_vocab(self):
-        """ Returns vocab as a dict """
+        """Returns vocab as a dict"""
         vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
         vocab.update(self.added_tokens_encoder)
         return vocab
@@ -130,7 +139,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         return self.tokenizer.tokenize(text)
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
+        """Converts a token (str) in an id using the vocab."""
         return self.tokenizer.convert_token_to_id(token)
 
     def _convert_id_to_token(self, index):
diff --git a/test_tokenizer.py b/test_tokenizer.py
index 63f59a9..a88c168 100644
--- a/test_tokenizer.py
+++ b/test_tokenizer.py
@@ -24,8 +24,7 @@ tokenizer = ChatGLMTokenizer(*init_inputs, **init_kwargs)
 
 a = tokenizer.encode("骉")
 
-b = tokenizer.decode([236,173,140])
-
+b = tokenizer.decode([236, 173, 140])
 
 token = []
 
@@ -44,9 +43,8 @@ show.DumpListToFile(token, "generated/token.log")
 
 # next_t = torch.multinomial(probss, num_samples=1).squeeze(1)
 # response = tokenizer.decode(next_t)
-
+
 # print(response)
 # # name = "generated/next_tokens" + str(token_count) + "_" + response + "_.png"
 # # show.DumpTensorToImage(next_token_logits[0], name)
 # # token_count = token_count + 1
-