update

Browse files

Files changed (5) hide show

README.md +3 -1
README_en.md +3 -1
config.json +1 -1
generation_config.json +1 -1
modeling_chatglm.py +5 -7

README.md CHANGED Viewed

@@ -39,7 +39,9 @@ GLM-4-9B 是智谱 AI 推出的最新一代预训练模型 GLM-4 系列中的开
 ## 运行模型
-更多推理代码和依赖信息，请访问我们的 [github](https://github.com/THUDM/GLM-4) 。
 使用 transformers 后端进行推理:

 ## 运行模型
+**更多推理代码和依赖信息，请访问我们的 [github](https://github.com/THUDM/GLM-4)。**
+**请严格按照[依赖](https://github.com/THUDM/GLM-4/blob/main/basic_demo/requirements.txt)安装，否则无法正常运行。**
 使用 transformers 后端进行推理:

README_en.md CHANGED Viewed

@@ -30,7 +30,9 @@ The long text capability was further evaluated on LongBench, and the results are
 ## Quick Start
-For more inference code and requirements, please visit our [github page](https://github.com/THUDM/GLM-4).
 ### Use the following method to quickly call the GLM-4-9B-Chat-1M language model

 ## Quick Start
+**For more inference code and requirements, please visit our [github page](https://github.com/THUDM/GLM-4).**
+**Please install the [dependencies](https://github.com/THUDM/GLM-4/blob/main/basic_demo/requirements.txt) exactly as specified; otherwise the model will not run properly.**
 ### Use the following method to quickly call the GLM-4-9B-Chat-1M language model

config.json CHANGED Viewed

@@ -38,7 +38,7 @@
   "seq_length": 1048576,
   "use_cache": true,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.40.2",
   "tie_word_embeddings": false,
   "eos_token_id": [151329, 151336, 151338],
   "pad_token_id": 151329

   "seq_length": 1048576,
   "use_cache": true,
   "torch_dtype": "bfloat16",
+  "transformers_version": "4.42.4",
   "tie_word_embeddings": false,
   "eos_token_id": [151329, 151336, 151338],
   "pad_token_id": 151329

generation_config.json CHANGED Viewed

@@ -9,5 +9,5 @@
   "temperature": 0.8,
   "max_length": 1024000,
   "top_p": 0.8,
-  "transformers_version": "4.40.2"
 }

   "temperature": 0.8,
   "max_length": 1024000,
   "top_p": 0.8,
+  "transformers_version": "4.42.4"
 }

modeling_chatglm.py CHANGED Viewed

@@ -29,13 +29,13 @@ from .configuration_chatglm import ChatGLMConfig
 try:
     from transformers.utils import is_flash_attn_greater_or_equal_2_10, is_flash_attn_2_available
     if is_flash_attn_2_available():
         from flash_attn import flash_attn_func, flash_attn_varlen_func
         from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
 except:
     pass
 # flags required to enable jit fusion kernels
 if sys.platform != 'darwin' and not is_torch_npu_available():
@@ -354,7 +354,8 @@ class FlashAttention2(CoreAttention):
         )
         if query_length == kv_seq_len:
             query_layer = index_first_axis(
-                query_layer.reshape(batch_size * kv_seq_len, self.num_attention_heads_per_partition, head_dim), indices_k
             )
             cu_seqlens_q = cu_seqlens_k
             max_seqlen_in_batch_q = max_seqlen_in_batch_k
@@ -797,10 +798,6 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
         position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
         return position_ids
-    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
-        if not self.supports_gradient_checkpointing:
-            raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
 class Embedding(torch.nn.Module):
     """Language model embeddings."""
@@ -936,9 +933,10 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
             standardize_cache_format: bool = False,
     ) -> Dict[str, Any]:
         # update past_key_values
-        model_kwargs["past_key_values"] = self._extract_past_from_model_output(
             outputs, standardize_cache_format=standardize_cache_format
         )
         # update attention mask
         if "attention_mask" in model_kwargs:

 try:
     from transformers.utils import is_flash_attn_greater_or_equal_2_10, is_flash_attn_2_available
     if is_flash_attn_2_available():
         from flash_attn import flash_attn_func, flash_attn_varlen_func
         from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
 except:
     pass
 # flags required to enable jit fusion kernels
 if sys.platform != 'darwin' and not is_torch_npu_available():
         )
         if query_length == kv_seq_len:
             query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_attention_heads_per_partition, head_dim),
+                indices_k
             )
             cu_seqlens_q = cu_seqlens_k
             max_seqlen_in_batch_q = max_seqlen_in_batch_k
         position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
         return position_ids
 class Embedding(torch.nn.Module):
     """Language model embeddings."""
             standardize_cache_format: bool = False,
     ) -> Dict[str, Any]:
         # update past_key_values
+        cache_name, cache = self._extract_past_from_model_output(
             outputs, standardize_cache_format=standardize_cache_format
         )
+        model_kwargs[cache_name] = cache
         # update attention mask
         if "attention_mask" in model_kwargs: