from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, AwqConfig
from huggingface_hub import notebook_login, HfApi
from peft import PeftModel, PeftConfig
from optimum.gptq import GPTQQuantizer, load_quantized_model
from accelerate import Accelerator
import torch
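
# Note: GPTQConfig, AwqConfig, notebook_login, and the peft imports are not
# exercised in this snapshot; they appear staged for steps (AWQ quantization,
# Hub login, adapter handling) the script does not reach yet.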
model_id = "mistralai/Mistral-Nemo-Instruct-2407"
quant_dataset = "c4"  # calibration dataset name, resolved by optimum's GPTQ quantizer
gptq_repo = "Granther/Mistral-Nemo-Instruct-GPTQ"
awq_repo = ""  # left unset in the original

# Output directories
gptq_dir = "gptq/"  # GPTQ artifacts
awq_dir = "awq/"    # AWQ artifacts
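
# The AWQ leg stops at the placeholders above. A minimal sketch of how it
# could be completed with the external `autoawq` package (an assumption;
# this script never actually runs AWQ). It would go after the tokenizer is
# created below, and the quant_config values here simply mirror the GPTQ
# settings for illustration:
#
#     from awq import AutoAWQForCausalLM
#     awq_model = AutoAWQForCausalLM.from_pretrained(model_id)
#     awq_model.quantize(tokenizer, quant_config={
#         "w_bit": 4, "q_group_size": 64, "zero_point": True, "version": "GEMM",
#     })
#     awq_model.save_quantized(awq_dir)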
accelerator = Accelerator()

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # fp16 keeps the ~12B model within memory reach (an addition, not in the original)
    ignore_mismatched_sizes=True,  # kept from the original; silences head-size mismatches on load
)
# Accelerator.prepare only acts on models/optimizers/dataloaders; a tokenizer
# passes through unchanged, so prepare the model alone.
model = accelerator.prepare(model)
quantizer = GPTQQuantizer(
    bits=4,
    dataset=quant_dataset,  # calibration dataset ("c4")
    group_size=64,          # size of the weight groups sharing one quant scale/zero-point
    desc_act=True,          # activation-order quantization: better perplexity, slower compute
    sym=True,               # symmetric quantization
    true_sequential=True,   # quantize sub-modules within each block in order, feeding already-quantized outputs forward
    # block_name_to_quantize="layers.0",  # uncomment to restrict quantization to a single block
)
print("Starting GPTQ quantization")
quantized_model = quantizer.quantize_model(model, tokenizer=tokenizer)

# Save the tokenizer alongside the quantized weights so gptq_dir is self-contained
tokenizer.save_pretrained(gptq_dir)
quantized_model.save_pretrained(gptq_dir)
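
# --- Optional follow-ups (sketches, not part of the original script) ---

# Push the GPTQ artifacts to the Hub with the HfApi import from the top of
# the file. The original stops after the local save, so this is one plausible
# way to finish, not the author's confirmed flow.
api = HfApi()
api.create_repo(gptq_repo, exist_ok=True)
api.upload_folder(folder_path=gptq_dir, repo_id=gptq_repo)

# Reloading the artifacts with the (otherwise unused) load_quantized_model
# import, following optimum's documented pattern. AutoConfig and
# init_empty_weights are extra imports not present in the original:
#
#     from transformers import AutoConfig
#     from accelerate import init_empty_weights
#     with init_empty_weights():
#         empty = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(model_id))
#     empty.tie_weights()
#     reloaded = load_quantized_model(empty, save_folder=gptq_dir, device_map="auto")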