from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, AwqConfig
from huggingface_hub import notebook_login, HfApi
from peft import PeftModel, PeftConfig
from optimum.gptq import GPTQQuantizer, load_quantized_model
from accelerate import Accelerator
import torch

model_id = "mistralai/Mistral-Nemo-Instruct-2407"
quant_dataset = "c4"
gptq_repo = "Granther/Mistral-Nemo-Instruct-GPTQ"
awq_repo = ""

# GPTQ
gptq_dir = "gptq/"
# AWQ
awq_dir = "awq/"

accelerator = Accelerator()

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # load in half precision before quantizing
    ignore_mismatched_sizes=True,
)

model, tokenizer = accelerator.prepare(model, tokenizer)

quantizer = GPTQQuantizer(
    bits=4,
    dataset=quant_dataset,      # calibration dataset ("c4")
    group_size=64,              # the size of the groups to perform quant calcs on
    desc_act=True,              # perplexity is better, compute speed is worse
    sym=True,                   # symmetrical quant
    true_sequential=True,       # quantize layers one after another within each block
    # block_name_to_quantize="layers.0",
)

print("Made it to quant_model")
quantized_model = quantizer.quantize_model(model, tokenizer=tokenizer)

tokenizer.save_pretrained(gptq_dir)
# gptq_config.save_pretrained(gptq_dir)
quantized_model.save_pretrained(gptq_dir)
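
# --- Optional follow-up steps (not in the original script; minimal sketches) ---
# Both blocks below are guarded and disabled by default. They assume:
#   * you are already authenticated with the Hub (huggingface-cli login or notebook_login()),
#   * the `gptq_repo` id defined above is writable with your token,
#   * the reload pattern follows the optimum GPTQ docs (empty fp16 model + load_quantized_model).

push_to_hub = False  # flip to True to upload the saved GPTQ folder to `gptq_repo`
if push_to_hub:
    api = HfApi()
    api.create_repo(repo_id=gptq_repo, exist_ok=True)
    api.upload_folder(folder_path=gptq_dir, repo_id=gptq_repo, repo_type="model")

reload_check = False  # flip to True to sanity-check that the quantized checkpoint reloads
if reload_check:
    from accelerate import init_empty_weights

    # Build a weightless skeleton of the model, then attach the quantized weights
    # saved in `gptq_dir` via optimum's load_quantized_model helper.
    with init_empty_weights():
        empty_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
    empty_model.tie_weights()
    reloaded = load_quantized_model(empty_model, save_folder=gptq_dir, device_map="auto")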