---
library_name: peft
base_model: meta-llama/Llama-2-7b-chat-hf
license: mit
language:
- en
---

# Chadgpt Llama2 7b conversation

## Colab Example
https://colab.research.google.com/drive/1YPF7oAM0s3W93iWIqJ-kZ2NY5gQK3tZ2?usp=sharing

## Install Prerequisites
```bash
!pip install peft
!pip install transformers
!pip install bitsandbytes
```

## Login Using Hugging Face Token
```python
# You need a Hugging Face token that has access to Llama 2
from huggingface_hub import notebook_login
notebook_login()
```

## Download Model
```python
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

peft_model_id = "danjie/Chadgpt-Llama2-7b-conversation"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    load_in_8bit=True,
    device_map='auto'
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(model, peft_model_id)
```

## Inference
```python
# Run this cell to start a new conversation
conversation_history = []

def format_conversation(conversation: list[str]) -> str:
    formatted_conversation = ""

    # Check if the conversation has more than two turns
    if len(conversation) > 2:
        # Process all but the last two turns
        for i in range(len(conversation) - 2):
            if i % 2 == 0:
                # Even indices are user turns
                formatted_conversation += "" + conversation[i] + "\n"
            else:
                # Odd indices are assistant turns
                formatted_conversation += "" + conversation[i] + "\n"

    # Process the last two turns
    if len(conversation) >= 2:
        formatted_conversation += "" + conversation[-2] + "\n"
        formatted_conversation += "" + conversation[-1]

    return formatted_conversation

def talk_with_llm(chat: str) -> str:
    # Append the user turn and an empty assistant turn, then build the prompt
    conversation_history.append(chat)
    conversation_history.append("")
    conversation = format_conversation(conversation_history)

    # Encode and move tensors onto the GPU if available
    encoded_input = tokenizer(conversation, return_tensors='pt')
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

    output = model.generate(**encoded_input, max_new_tokens=256)
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Keep only the newly generated text after the prompt
    response = response[len(conversation):]

    # Replace the placeholder assistant turn with the generated response
    conversation_history.pop()
    conversation_history.append(response)

    return response
```
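
## Example Usage

A minimal sketch of how `talk_with_llm` might be called once the cells above have run; the prompts here are illustrative placeholders, not examples from the model card.

```python
# First user turn; the helper appends both the question and the reply
# to conversation_history so later calls see the full context.
print(talk_with_llm("Hi, can you introduce yourself?"))

# A follow-up turn automatically reuses the accumulated history.
print(talk_with_llm("What kinds of questions can you answer?"))

# To start a fresh conversation, clear the shared history list.
conversation_history.clear()
```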