# Streamlit app that answers questions with a fine-tuned TinyLlama model.
from time import perf_counter

import streamlit as st
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, GenerationConfig


def formatted_prompt(user_input) -> str:
    # Wrap the question in the ChatML template: a user turn followed by an
    # open assistant turn for the model to complete.
    return f"<|im_start|>user\n{user_input}<|im_end|>\n<|im_start|>assistant\n"


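# Generate a completion for the user's question and render it, together with
# the measured inference latency, in the Streamlit page.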
def generate_response(user_input):
    prompt = formatted_prompt(user_input)

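    # Decoding settings: a small top_k plus low temperature keeps answers
    # focused, and repetition_penalty discourages looping. penalty_alpha
    # (contrastive search) is dropped because it only takes effect when
    # do_sample=False.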
    generation_config = GenerationConfig(
        do_sample=True,
        top_k=5,
        temperature=0.5,
        repetition_penalty=1.2,
        max_new_tokens=500,
        pad_token_id=tokenizer.eos_token_id,
    )

    start_time = perf_counter()
    # Tokenize the prompt once and move the input tensors to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, generation_config=generation_config)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    output_time = perf_counter() - start_time

    st.write(response)
    st.write(f"Time taken for inference: {round(output_time, 2)} seconds")


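# Cache the model and tokenizer so they are not reloaded from the Hub on every
# Streamlit script rerun.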
@st.cache_resource  # st.cache(allow_output_mutation=True) is deprecated
def load_model_and_tokenizer(model_name, token):
    # TinyLlama is a causal (decoder-only) LM: load it with AutoModelForCausalLM,
    # not AutoModelForSequenceClassification, so that model.generate() works.
    model = AutoModelForCausalLM.from_pretrained(model_name, token=token)
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
    if torch.cuda.is_available():
        model = model.to("cuda")  # use the GPU when one is available
    return model, tokenizer


model_name = "orYx-models/finetuned-tiny-llama-medical-papers"
token = "Tinyllama_secret"  # Hugging Face access token
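# Hard-coding the token is fine for local testing; for a deployed app, store it
# in .streamlit/secrets.toml and read it with Streamlit's secrets API, e.g.:
#   token = st.secrets["Tinyllama_secret"]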
model, tokenizer = load_model_and_tokenizer(model_name, token)

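# Optional pipeline wrapper; generate_response() calls model.generate directly,
# so this handle is unused below. "text-generation" is the task for a causal LM
# ("text2text-generation" targets encoder-decoder models).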
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

user_input = st.text_area("Enter some text:")

if user_input:
    generate_response(user_input)
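
# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py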