import os

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GenerationConfig,
)

# Model repo and access token are read from the environment.
model_path = os.environ.get("HF_REPO_ID")
access_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(model_path, token=access_token)

# 4-bit NF4 quantization with double quantization; compute is done in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # load_in_8bit=use_8_bit,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    token=access_token,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

device = "cuda" if torch.cuda.is_available() else "cpu"


def generate(
    question,
    context=None,
    temperature=0.7,
    top_p=0.7,
    top_k=40,
    num_beams=4,
    max_new_tokens=256,
):
    # Build the prompt in the CONTEXT / QUESTION / ANSWER format the model expects.
    prompt = f"### CONTEXT:\n{context}\n\n### QUESTION:\n{question}\n\n### ANSWER:"
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)

    # Note: temperature/top_p/top_k only take effect when do_sample=True;
    # with num_beams=4 and sampling left off, this runs plain beam search.
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
    )

    # with torch.autocast("cuda"):
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    # Decode the first (and only) returned sequence, which includes the prompt.
    seq = generation_output.sequences[0]
    output = tokenizer.decode(seq)
    return output
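For reference, a minimal calling sketch: it assumes HF_REPO_ID and HF_TOKEN are exported before the script runs, the context and question strings are placeholders (not part of the original script), and the split on the "### ANSWER:" marker simply drops the echoed prompt from the decoded output.

# Minimal usage sketch; the context/question values below are hypothetical examples.
context = "Paris is the capital of France and home to the Louvre museum."
question = "Which museum is located in Paris?"

answer = generate(question, context=context, max_new_tokens=64)

# tokenizer.decode() returns the full sequence (prompt + completion),
# so keep only the text after the "### ANSWER:" marker.
print(answer.split("### ANSWER:")[-1].strip())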