stan-hua committed
Commit 61899f9
1 Parent(s): af891fe

Push folder to HuggingFace Hub

Files changed (4)
  1. config.json +43 -1
  2. recipe.yaml +7 -0
  3. special_tokens_map.json +2 -1
  4. tokenizer_config.json +1 -0
config.json CHANGED
@@ -23,6 +23,48 @@
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
   "pretraining_tp": 1,
+  "quantization_config": {
+    "config_groups": {
+      "group_0": {
+        "input_activations": {
+          "actorder": null,
+          "block_structure": null,
+          "dynamic": true,
+          "group_size": null,
+          "num_bits": 8,
+          "observer": null,
+          "observer_kwargs": {},
+          "strategy": "token",
+          "symmetric": true,
+          "type": "int"
+        },
+        "output_activations": null,
+        "targets": [
+          "Linear"
+        ],
+        "weights": {
+          "actorder": null,
+          "block_structure": null,
+          "dynamic": false,
+          "group_size": null,
+          "num_bits": 8,
+          "observer": "minmax",
+          "observer_kwargs": {},
+          "strategy": "channel",
+          "symmetric": true,
+          "type": "int"
+        }
+      }
+    },
+    "format": "int-quantized",
+    "global_compression_ratio": 1.458959021545191,
+    "ignore": [
+      "lm_head"
+    ],
+    "kv_cache_scheme": null,
+    "quant_method": "compressed-tensors",
+    "quantization_status": "compressed"
+  },
   "rms_norm_eps": 1e-05,
   "rope_scaling": {
     "factor": 8.0,
@@ -37,4 +79,4 @@
   "transformers_version": "4.45.2",
   "use_cache": true,
   "vocab_size": 128256
-}
+}
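The added quantization_config block describes a compressed-tensors W8A8 checkpoint: 8-bit symmetric per-channel weights and 8-bit dynamic per-token activations on every Linear layer, with lm_head left unquantized. A minimal loading sketch, not part of this commit: it assumes a vLLM build with compressed-tensors support, and the repo id is only a placeholder for wherever this folder was pushed.

from vllm import LLM, SamplingParams

# Placeholder repo id; substitute the actual Hub repository for this folder.
llm = LLM(model="stan-hua/<this-repo>")
params = SamplingParams(temperature=0.0, max_tokens=64)
out = llm.generate(["Summarize SmoothQuant in one sentence."], params)
print(out[0].outputs[0].text)

vLLM reads the quantization_config above to pick the compressed-tensors kernels, so no extra flags are needed beyond the model id.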
recipe.yaml ADDED
@@ -0,0 +1,7 @@
+DEFAULT_stage:
+  DEFAULT_modifiers:
+    SmoothQuantModifier: {smoothing_strength: 0.8}
+    QuantizationModifier:
+      ignore: [lm_head]
+      targets: Linear
+      scheme: W8A8
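recipe.yaml is an llm-compressor recipe: SmoothQuant with smoothing_strength 0.8, followed by one-shot W8A8 quantization of Linear layers, skipping lm_head. A rough sketch of how such a recipe could be applied with llm-compressor's oneshot entrypoint; the base model id, calibration dataset, output directory, and sample counts below are assumptions for illustration, not taken from this commit, and the exact oneshot signature varies across llm-compressor versions.

from llmcompressor.transformers import oneshot

oneshot(
    model="meta-llama/Llama-3.1-8B-Instruct",  # assumed base model, illustration only
    dataset="open_platypus",                   # assumed registered calibration dataset
    recipe="recipe.yaml",                      # the recipe added in this commit
    output_dir="llama-w8a8-compressed",        # hypothetical output folder
    max_seq_length=2048,
    num_calibration_samples=512,
)

The calibration pass is what produces the minmax per-channel weight scales recorded in config.json; activation scales are computed dynamically per token at inference time.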
special_tokens_map.json CHANGED
@@ -12,5 +12,6 @@
     "normalized": false,
     "rstrip": false,
     "single_word": false
-  }
+  },
+  "pad_token": "<|eot_id|>"
 }
tokenizer_config.json CHANGED
@@ -2058,5 +2058,6 @@
     "attention_mask"
   ],
   "model_max_length": 131072,
+  "pad_token": "<|eot_id|>",
   "tokenizer_class": "PreTrainedTokenizerFast"
 }
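Both tokenizer files now set the pad token to <|eot_id|>, so padded batches work without defining a separate pad token. A quick check, again with a placeholder repo id:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("stan-hua/<this-repo>")  # placeholder repo id
print(tok.pad_token)  # expected: <|eot_id|>
batch = tok(["short", "a longer prompt that forces padding"], padding=True, return_tensors="pt")
print(batch["attention_mask"])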