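"""Gradio chat demo that serves a Llama-2 chat model and streams its answers
character by character, with sampling controls exposed as sliders."""
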
import json
import time

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
# model_name = "facebook/opt-350m"  # lightweight stand-in for local testing
model_name = "NousResearch/Llama-2-7b-chat-hf"

# Load the tokenizer and model from the same checkpoint, and keep the model on the
# same device as the encoded inputs below to avoid a device-mismatch error.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
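
# predict is a generator: gr.ChatInterface streams each yielded string to the UI
# as a progressively longer partial reply.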
def predict(message, chatbot, temperature=0.9, max_new_tokens=256, top_p=0.6, repetition_penalty=1.0):
    system_message = "\nYou are a helpful, respectful and honest Assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
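    # The prompt below follows the standard Llama-2 chat template; with one
    # finished turn plus a new message it looks like:
    #   [INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user_1} [/INST] {reply_1} </s><s> [INST] {user_2} [/INST]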
    input_system = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n "
    input_history = input_system
    for interaction in chatbot:
        # Accumulate every past (user, assistant) turn; assigning instead of
        # appending here would drop all but the last turn from the prompt.
        input_history += str(interaction[0]) + " [/INST] " + str(interaction[1]) + " </s><s> [INST] "
    input_prompt = input_history + str(message) + " [/INST] "
    inputs = tokenizer.encode(input_prompt, return_tensors="pt").to(device)
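    # Note: the Llama tokenizer prepends the BOS token itself, so the prompt
    # string does not need an explicit leading <s>.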
    # Clamp temperature away from zero: transformers rejects temperature == 0.
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        input_ids=inputs,
        do_sample=True,  # without this, temperature/top_p are silently ignored
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,
    )
    outputs = model.generate(**generate_kwargs)
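    # model.generate() blocks until the full answer exists, so the character-level
    # streaming below is cosmetic. For true incremental decoding, transformers'
    # TextIteratorStreamer could be passed to generate() from a background thread
    # (not done here).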
    generated_included_full_text = tokenizer.decode(outputs[0])
    print("generated_included_full_text:", generated_included_full_text)

    # Keep only the newly generated answer: the text after the final [/INST] tag,
    # truncated at the end-of-sequence marker if one was emitted.
    generated_text = generated_included_full_text.split('[/INST] ')[-1]
    if '</s>' in generated_text:
        generated_text = generated_text.split('</s>')[0]
    # Wrap each line of the answer in an {id, text} dict, mimicking the shape of a
    # JSON token-streaming API.
    tokens = generated_text.split('\n')
    token_list = [{"id": idx + 1, "text": token} for idx, token in enumerate(tokens)]
    response = {"data": {"token": token_list}}

    # Round-trip through JSON so any non-serializable value fails loudly here
    # rather than downstream.
    response = json.loads(json.dumps(response, indent=4))
    token_list = response.get('data', {}).get('token', [])
    # Stream the answer back one character at a time for a typing effect.
    partial_message = ""
    for token_entry in token_list:
        if token_entry:
            try:
                token_text = token_entry.get('text', None)
                if token_text is not None:
                    if partial_message:
                        # Restore the newline that split('\n') consumed between lines.
                        partial_message += '\n'
                        yield partial_message
                    for char in token_text:
                        partial_message += char
                        yield partial_message
                        time.sleep(0.01)
                else:
                    gr.Warning(f"The key 'text' does not exist or is None in this token entry: {token_entry}")
            except KeyError as e:
                gr.Warning(f"KeyError: {e} occurred for token entry: {token_entry}")
                continue
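
# A minimal smoke test of predict outside Gradio (hypothetical, no prior history):
#   last = ""
#   for last in predict("Hello!", []):
#       pass
#   print(last)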
title = "TheBloke/Llama-2-7b-Chat-GPTQ model chatbot"
description = """
This is the TheBloke/Llama-2-7b-Chat-GPTQ model.
"""
css = """.toast-wrap { display: none !important } """
examples = [
    ['Hello there! How are you doing?'],
    ['Can you explain to me briefly what is Python programming language?'],
    ['Explain the plot of Cinderella in a sentence.'],
    ['How many hours does it take a man to eat a Helicopter?'],
    ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
]
def vote(data: gr.LikeData):
    # Log like/dislike feedback from the chatbot's thumb buttons.
    if data.liked:
        print("You upvoted this response: " + data.value)
    else:
        print("You downvoted this response: " + data.value)
additional_inputs = [
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=256,
        minimum=0,
        maximum=4096,
        step=64,
        interactive=True,
        info="The maximum number of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.6,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    ),
]
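
# ChatInterface appends these slider values to predict's arguments after
# (message, history), in list order: temperature, max_new_tokens, top_p,
# repetition_penalty.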
chatbot_stream = gr.Chatbot(avatar_images=('user.png', 'bot2.png'), bubble_full_width=False)
chat_interface_stream = gr.ChatInterface(
    predict,
    title=title,
    description=description,
    chatbot=chatbot_stream,
    css=css,
    examples=examples,
    cache_examples=False,
    additional_inputs=additional_inputs,
)
with gr.Blocks() as demo:
    with gr.Tab("Streaming"):
        chatbot_stream.like(vote, None, None)
        chat_interface_stream.render()
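
# queue() enables the request queue that generator (streaming) handlers require;
# concurrency_count and max_size bound concurrent workers and queued requests.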
demo.queue(concurrency_count=75, max_size=100).launch(debug=True)