stan-hua committed
Commit 1300f68
1 Parent(s): 51ad104

Push folder to HuggingFace Hub

Files changed (2)
  1. config.json +32 -1
  2. recipe.yaml +7 -0
config.json CHANGED
@@ -23,6 +23,37 @@
   "num_hidden_layers": 80,
   "num_key_value_heads": 8,
   "pretraining_tp": 1,
+  "quantization_config": {
+    "config_groups": {
+      "group_0": {
+        "input_activations": null,
+        "output_activations": null,
+        "targets": [
+          "Linear"
+        ],
+        "weights": {
+          "actorder": null,
+          "block_structure": null,
+          "dynamic": false,
+          "group_size": null,
+          "num_bits": 8,
+          "observer": "minmax",
+          "observer_kwargs": {},
+          "strategy": "channel",
+          "symmetric": true,
+          "type": "int"
+        }
+      }
+    },
+    "format": "pack-quantized",
+    "global_compression_ratio": 1.463543865167781,
+    "ignore": [
+      "lm_head"
+    ],
+    "kv_cache_scheme": null,
+    "quant_method": "compressed-tensors",
+    "quantization_status": "compressed"
+  },
   "rms_norm_eps": 1e-05,
   "rope_scaling": {
     "factor": 8.0,
@@ -37,4 +68,4 @@
   "transformers_version": "4.45.2",
   "use_cache": true,
   "vocab_size": 128256
-}
+}
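
The new quantization_config marks this checkpoint as an int8, per-channel, weight-only model stored in compressed-tensors' pack-quantized format, with lm_head left unquantized. A minimal loading sketch, assuming the checkpoint is consumed with transformers plus the compressed-tensors package; the repo id below is a hypothetical placeholder, not taken from this commit:

from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "stan-hua/your-model"  # hypothetical placeholder; substitute the actual repo

# quant_method "compressed-tensors" with quantization_status "compressed"
# tells transformers to unpack the pack-quantized int8 channel-wise weights
# at load time (requires `pip install compressed-tensors`).
model = AutoModelForCausalLM.from_pretrained(repo_id, device_map="auto", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(repo_id)
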
recipe.yaml ADDED
@@ -0,0 +1,7 @@
+DEFAULT_stage:
+  DEFAULT_modifiers:
+    SmoothQuantModifier: {smoothing_strength: 0.8}
+    QuantizationModifier:
+      ignore: [lm_head]
+      targets: Linear
+      scheme: W8A16
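
The recipe pairs SmoothQuant (smoothing_strength 0.8) with one-shot W8A16 quantization of every Linear layer except lm_head, which matches the quantization_config written into config.json above. A hedged sketch of how such a recipe is typically applied with llm-compressor's oneshot flow; the base model and calibration settings below are assumptions for illustration, not recorded in this commit:

from llmcompressor.transformers import oneshot

oneshot(
    model="meta-llama/Llama-3.1-70B-Instruct",  # assumed base model; the config (80 layers, 128256 vocab) is consistent with Llama 3.1 70B
    dataset="open_platypus",                    # example calibration dataset; SmoothQuant needs calibration data
    recipe="recipe.yaml",                       # the recipe added in this commit
    max_seq_length=2048,                        # illustrative calibration settings
    num_calibration_samples=512,
)

After the run, saving the model with save_pretrained produces a compressed checkpoint whose config.json carries a quantization_config block like the one in this commit.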