##########################
# Must include these lines #
##########################
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)
from peft import PeftModel

########################################################
# This is how you can set up the Llama model on your end #
########################################################
model = AutoModelForCausalLM.from_pretrained(
    "codellama/CodeLlama-7b-Instruct-hf",
    device_map="balanced_low_0",
)
# The tokenizer runs on CPU, so it does not take a device_map.
tokenizer = AutoTokenizer.from_pretrained(
    "llama_prompt_model/tokenizer",
    padding=True,
    truncation=True,
)
# Resize the embeddings to match the fine-tuned tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))
# Load the PEFT adapter weights and merge them into the base model.
model = PeftModel.from_pretrained(model, "llama_prompt_model/model")
model = model.merge_and_unload()

#################################################
# This is how you can communicate with the model #
#################################################
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=2000,
    pad_token_id=2,
)

###################################################
# Feel free to change this line and see the result #
###################################################
prompt = "What's your name?"
result = pipe("[INST]%s[/INST]" % prompt)[0]["generated_text"]
print(result)