Update README.md
Browse files
README.md
CHANGED
@@ -55,90 +55,30 @@ This quant was created using llmcompressor.
|
|
55 |
Code below.
|
56 |
|
57 |
```python
|
58 |
-
import
|
59 |
-
from datasets import load_dataset
|
60 |
from transformers import AutoTokenizer
|
|
|
|
|
61 |
|
62 |
-
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
|
63 |
-
from llmcompressor.transformers.compression.helpers import (
|
64 |
-
calculate_offload_device_map,
|
65 |
-
custom_offload_device_map,
|
66 |
-
)
|
67 |
|
68 |
-
|
69 |
-
quant_stage:
|
70 |
-
quant_modifiers:
|
71 |
-
QuantizationModifier:
|
72 |
-
ignore: ["lm_head"]
|
73 |
-
config_groups:
|
74 |
-
group_0:
|
75 |
-
weights:
|
76 |
-
num_bits: 8
|
77 |
-
type: float
|
78 |
-
strategy: tensor
|
79 |
-
dynamic: false
|
80 |
-
symmetric: true
|
81 |
-
input_activations:
|
82 |
-
num_bits: 8
|
83 |
-
type: float
|
84 |
-
strategy: tensor
|
85 |
-
dynamic: false
|
86 |
-
symmetric: true
|
87 |
-
targets: ["Linear"]
|
88 |
-
"""
|
89 |
-
|
90 |
-
model_stub = "NousResearch/Hermes-3-Llama-3.1-8B"
|
91 |
-
model_name = model_stub.split("/")[-1]
|
92 |
-
|
93 |
-
device_map = calculate_offload_device_map(
|
94 |
-
model_stub, reserve_for_hessians=False, num_gpus=1, torch_dtype="auto"
|
95 |
-
)
|
96 |
|
97 |
model = SparseAutoModelForCausalLM.from_pretrained(
|
98 |
-
|
99 |
-
)
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
return {
|
114 |
-
"text": tokenizer.apply_chat_template(
|
115 |
-
example["messages"],
|
116 |
-
tokenize=False,
|
117 |
-
)
|
118 |
-
}
|
119 |
-
|
120 |
-
ds = ds.map(preprocess)
|
121 |
-
|
122 |
-
def tokenize(sample):
|
123 |
-
return tokenizer(
|
124 |
-
sample["text"],
|
125 |
-
padding=False,
|
126 |
-
max_length=MAX_SEQUENCE_LENGTH,
|
127 |
-
truncation=True,
|
128 |
-
add_special_tokens=False,
|
129 |
-
)
|
130 |
-
|
131 |
-
ds = ds.map(tokenize, remove_columns=ds.column_names)
|
132 |
-
|
133 |
-
oneshot(
|
134 |
-
model=model,
|
135 |
-
output_dir=output_dir,
|
136 |
-
dataset=ds,
|
137 |
-
recipe=recipe,
|
138 |
-
max_seq_length=MAX_SEQUENCE_LENGTH,
|
139 |
-
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
140 |
-
save_compressed=True,
|
141 |
-
)
|
142 |
|
143 |
```
|
144 |
|
|
|
55 |
Code below.
|
56 |
|
57 |
```python
|
58 |
+
from llmcompressor.transformers import SparseAutoModelForCausalLM
|
|
|
59 |
from transformers import AutoTokenizer
|
60 |
+
from llmcompressor.transformers import oneshot
|
61 |
+
from llmcompressor.modifiers.quantization import QuantizationModifier
|
62 |
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
+
MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
model = SparseAutoModelForCausalLM.from_pretrained(
|
67 |
+
MODEL_ID, device_map="auto", torch_dtype="auto")
|
68 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
69 |
+
|
70 |
+
# Configure simple post-training quantization (PTQ) with the FP8_DYNAMIC scheme.
|
71 |
+
recipe = QuantizationModifier(
|
72 |
+
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
|
73 |
+
|
74 |
+
# Apply the quantization algorithm.
|
75 |
+
oneshot(model=model, recipe=recipe)
|
76 |
+
|
77 |
+
# Save the model and tokenizer to SAVE_DIR.
|
78 |
+
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
|
79 |
+
model.save_pretrained(SAVE_DIR)
|
80 |
+
tokenizer.save_pretrained(SAVE_DIR)
|
81 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
```
|
84 |
|