def get_llm_response(repo, filename, model_type, gpu_layers, prompt):
    # NOTE: repo, filename, model_type and gpu_layers are accepted for interface
    # compatibility with the caller but are not used by this GPTQ code path.
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

    model_name_or_path = "TheBloke/meditron-7B-GPTQ"
    # To use a different branch, change revision
    # For example: revision="gptq-4bit-128g-actorder_True"
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                 device_map="auto",
                                                 trust_remote_code=False,
                                                 revision="main")
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    print("\n\n*** Generate:")
    # Direct generation with model.generate() is also possible:
    # input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
    # output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True,
    #                         top_p=0.95, top_k=40, max_new_tokens=512)
    # print(tokenizer.decode(output[0]))

    # Inference can also be done using transformers' pipeline
    print("*** Pipeline:")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1,
    )

    # Build the ChatML-style prompt expected by meditron-7B-GPTQ.
    # (The original mixed an f-string with .format(); define system_message first
    # and let the f-string do the interpolation.)
    system_message = "You are an assistant"
    prompt_template = f'''<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
'''

    response = pipe(prompt_template)[0]['generated_text']
    print(response)

    return response
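

# Usage sketch (assumption, not part of the original): the first four arguments
# are placeholders here, since this GPTQ path ignores them; the example prompt
# is hypothetical and only illustrates how the function is called.
if __name__ == "__main__":
    answer = get_llm_response(
        repo=None,
        filename=None,
        model_type="gptq",
        gpu_layers=0,
        prompt="What are the common symptoms of iron-deficiency anemia?",
    )
    print(answer)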