How to use:
!pip install --no-deps packaging ninja einops peft accelerate bitsandbytes
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from peft import PeftModel, PeftConfig
# Load the adapter configuration, the 4-bit base model, the PEFT adapter, and the tokenizer.
# NOTE: `config` is not referenced again in this snippet; the base checkpoint
# is named explicitly on the next line rather than read from the config.
config = PeftConfig.from_pretrained("Vijayendra/llama3.0-8B-merged-4bit")
base_model = AutoModelForCausalLM.from_pretrained("unsloth/llama-3-8b-bnb-4bit")
# Wrap the base model with the adapter published under the model repo id.
model = PeftModel.from_pretrained(base_model, "Vijayendra/llama3.0-8B-merged-4bit")
tokenizer = AutoTokenizer.from_pretrained("Vijayendra/llama3.0-8B-merged-4bit")
# Ensure padding token is set for the tokenizer: fall back to the EOS token so
# that `padding=True` and `pad_token_id` have a defined value at generation time.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Define the inference function with TextStreamer
def generate_answer_with_stream(
    model,
    tokenizer,
    text,
    max_new_tokens=1024,
    temperature=0.5,
    top_k=40,
    top_p=0.9,
):
    """Generate an answer for ``text`` and stream it through a ``TextStreamer``.

    The question is inserted into a fixed prompt template, tokenized, moved to
    ``model.device`` and passed to ``model.generate`` with sampling enabled.
    Output is emitted by the streamer while generation runs; the generated
    token ids are not returned.

    Args:
        model: Object providing ``generate(...)`` and a ``device`` attribute.
        tokenizer: Callable tokenizer providing ``eos_token_id`` and
            ``pad_token_id``; it is also handed to ``TextStreamer``.
        text: Question text placed into the prompt template.
        max_new_tokens: Forwarded to ``generate`` as ``max_new_tokens``.
        temperature: Forwarded to ``generate`` as ``temperature``.
        top_k: Forwarded to ``generate`` as ``top_k``.
        top_p: Forwarded to ``generate`` as ``top_p``.

    Returns:
        None.
    """
    # NOTE(review): the template ends with "Question:" after the question text;
    # confirm this matches the prompt format the adapter was trained on.
    prompt = f"Answer the following question\n\n{text}\n\nQuestion:"

    # Tokenize the input text and move the tensors to the model's device
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)

    # Initialize the TextStreamer
    streamer = TextStreamer(tokenizer)

    # Generate answer using the model with streaming; gradients are not needed
    # for inference, so they are disabled for the duration of the call.
    with torch.no_grad():
        model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            streamer=streamer,  # Stream output as it's generated
        )
# Example question to send to the model
question = "What is quantum mechanics?"
# Run generation with the default sampling settings; the answer is streamed
# as it is produced rather than returned.
generate_answer_with_stream(model=model, tokenizer=tokenizer, text=question)
- Downloads last month: 20
Model tree for Vijayendra/llama3.0-8B-merged-4bit
Base model
meta-llama/Meta-Llama-3-8B
Quantized
unsloth/llama-3-8b-bnb-4bit