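# NOTE: the three lines that stood here ("Spaces:", "Build error", "Build error")
# appear to be status text captured from the hosting page, not program code.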
import html
import os
import random
import subprocess
import tempfile
import wave
from itertools import chain
from typing import Optional

import gradio as gr
import librosa
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from stt import Model
from transformers import AutoConfig, AutoTokenizer, AutoModelWithLMHead
from transformers import GPT2Tokenizer, GPT2LMHeadModel
#### STT ###########
########### STT English ##############
# Module-level state component. It is not referenced by name anywhere else in
# the visible code (the interfaces use the "state" shortcut string instead).
# NOTE(review): gr.Variable looks like the older spelling of gr.State --
# confirm against the installed Gradio version before renaming.
state = gr.Variable()

# Hugging Face Hub repository the STT acoustic model and scorer are fetched from.
REPO_ID = "mbarnig/lb-de-fr-en-pt-coqui-stt-models"

my_title = "STT-ChatGPT-TTS with Coqui"
# The Markdown link previously read "https://https://coqui.ai/", which is not a
# valid URL; the duplicated scheme has been removed.
my_description = "TODO add description and reference: STT base from mbarnig/lb-de-fr-en-pt-coqui-stt-models - 🐸 [Coqui.ai](https://coqui.ai/)."

STT_LANGUAGES = [
    "English",
]

# One row per example: audio path, language, a boolean flag, a name and the
# expected transcript (column meaning inferred from the values -- TODO confirm).
EXAMPLES = [
    ["examples/english.wav", "English", True, "Linda", "every window and roof which could command a view of the horrible performance was occupied"],
]
def stt_record(audio_record_buffer, use_scorer=True):
    """Transcribe English speech with the Coqui STT model from ``REPO_ID``.

    Args:
        audio_record_buffer: Either a path to an audio file, or a
            ``(sample_rate, samples)`` tuple holding the raw recording.
        use_scorer: Decode with the external "huge-vocabulary" scorer when
            true (the default, which matches the previously hard-coded
            behaviour); disable the external scorer otherwise.

    Returns:
        The transcript returned by ``Model.stt``.
    """
    target_sr = 16000

    # The English model is loaded inside the call (not at import time) to
    # reduce memory usage on shared hardware; the first run triggers the
    # download from the Hub.
    acoustic_model = Model(
        hf_hub_download(repo_id=REPO_ID, filename="english/model.tflite")
    )

    if isinstance(audio_record_buffer, tuple):
        sr, y = audio_record_buffer
        y = np.asarray(y)
        # librosa.resample works on floating-point audio, so integer PCM is
        # normalised to [-1, 1] first.
        if np.issubdtype(y.dtype, np.integer):
            y = y.astype(np.float32) / float(np.iinfo(y.dtype).max)
        else:
            y = y.astype(np.float32)
        if y.ndim > 1:
            # assumes a (samples, channels) layout -- TODO confirm with caller
            y = y.mean(axis=1)
        if sr != target_sr:
            y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
    else:
        # Let librosa resample while loading; it returns a float waveform.
        y, _ = librosa.load(audio_record_buffer, sr=target_sr)

    # Scale the normalised float waveform to the 16-bit range *before*
    # casting. Casting the floats directly (as the code did before) truncates
    # almost every sample to zero, i.e. the recogniser was fed silence.
    pcm = (np.clip(y, -1.0, 1.0) * 32767).astype(np.int16)

    if use_scorer:
        scorer_path = hf_hub_download(
            repo_id=REPO_ID, filename="english/huge-vocabulary.scorer"
        )
        acoustic_model.enableExternalScorer(scorer_path)
    else:
        acoustic_model.disableExternalScorer()

    result = acoustic_model.stt(pcm)
    print("STT:", result)
    return result
#emotion_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion")
#emotion_model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-emotion")
def get_emotion(text):
    """Return the second whitespace-separated token of a 2-token generation.

    The text (with '</s>' appended) is encoded with the module-level
    ``tokenizer``, passed to the module-level ``model`` with ``max_length=2``,
    and the first decoded output is split on whitespace.

    NOTE(review): the dedicated emotion tokenizer/model above are commented
    out, so this uses whatever ``tokenizer``/``model`` are bound at module
    level -- confirm that is intended before re-enabling the callers.
    """
    encoded = tokenizer.encode(text + '</s>', return_tensors='pt')
    generated = model.generate(input_ids=encoded, max_length=2)
    decoded = list(map(tokenizer.decode, generated))
    return decoded[0].split()[1]
# Chatbot checkpoint: configuration, weights and tokenizer all come from the
# same Hub repository.
config = AutoConfig.from_pretrained('gorkemgoknar/gpt2chatbotenglish')
model = GPT2LMHeadModel.from_pretrained(
    'gorkemgoknar/gpt2chatbotenglish',
    config=config,
)
tokenizer = GPT2Tokenizer.from_pretrained('gorkemgoknar/gpt2chatbotenglish')
tokenizer.model_max_length = 1024

# Dynamic temperature: a base value plus a random offset drawn once at import.
# See experiment https://www.linkedin.com/pulse/ai-goes-job-interview-g%C3%B6rkem-g%C3%B6knar/
base_temperature = 1.2
dynamic_temperature_range = 0.15
rand_range = random.uniform(
    -dynamic_temperature_range,
    dynamic_temperature_range,
)
temperature = base_temperature + rand_range

# Order matters: consumers unpack the leading entries positionally.
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
def get_chat_response(name, history=None, input_txt="Hello , what is your name?"):
    """Generate the character's reply to ``input_txt``.

    The prompt is laid out as ``[bos, persona] [speaker, utterance]... [speaker, input]``
    where the newest utterance (the input) is tagged ``<speaker1>`` and the
    tags alternate going backwards through the history.

    Args:
        name: Character name; the persona line is "My name is <name>".
        history: Previous conversation, oldest first. Accepts either a flat
            list of utterance strings or a list of ``(message, response)``
            pairs (the form the greet* callers store). ``None`` or empty
            means no history. The argument is not modified.
        input_txt: The new user message.

    Returns:
        The decoded reply text with special tokens stripped.
    """
    max_response_tokens = 50

    # Flatten (message, response) pairs into one utterance per entry; the
    # speaker alternation below assumes exactly one utterance per element.
    utterances = []
    for entry in history or []:
        if isinstance(entry, (tuple, list)):
            utterances.extend(entry)
        else:
            utterances.append(entry)
    ai_history_e = [tokenizer.encode(u) for u in utterances]

    personality = "My name is " + name
    bos, _eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])

    persona_ids = [bos] + tokenizer.encode(personality)
    input_ids = tokenizer.encode(input_txt)

    # Drop the oldest exchanges until prompt + reply fit the context window.
    # Each turn costs its tokens plus one speaker tag.
    budget = tokenizer.model_max_length - max_response_tokens
    fixed_len = len(persona_ids) + len(input_ids) + 1
    while ai_history_e and fixed_len + sum(len(t) + 1 for t in ai_history_e) > budget:
        del ai_history_e[:2]

    # persona first, history next, input text must be at the end
    turns = ai_history_e + [input_ids]
    last = len(turns) - 1
    tagged = [
        # An odd distance from the final (user) turn means a bot turn.
        [speaker2 if (last - i) % 2 else speaker1] + turn
        for i, turn in enumerate(turns)
    ]
    sequence = list(chain(persona_ids, *tagged))
    sequence_len = len(sequence)

    # max_length counts the prompt as well, so budget the reply relative to
    # the prompt length; a fixed max_length=50 left no room to answer once
    # the conversation grew.
    chat_history_ids = model.generate(
        torch.tensor(sequence).unsqueeze(0),
        max_length=sequence_len + max_response_tokens,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=60,
        top_p=0.8,
        temperature=1.3,
    )
    # Decode only the newly generated tokens.
    return tokenizer.decode(chat_history_ids[0][sequence_len:], skip_special_tokens=True)
## Any of the names listed below can be used.
# (The table is kept as a bare string literal; it is never assigned or read.)
'''
| Macleod | Moran | Brenda | Ramirez | Peter Parker | Quentin Beck | Andy
| Red | Norton | Willard | Chief | Chef | Kilgore | Kurtz | Westley | Buttercup
| Vizzini | Fezzik | Inigo | Man In Black | Taylor | Zira | Zaius | Cornelius
| Bud | Lindsey | Hippy | Erin | Ed | George | Donna | Trinity | Agent Smith
| Morpheus | Neo | Tank | Meryl | Truman | Marlon | Christof | Stromboli | Bumstead
| Schreber | Walker | Korben | Cornelius | Loc Rhod | Anakin | Obi-Wan | Palpatine
| Padme | Superman | Luthor | Dude | Walter | Donny | Maude | General | Starkiller
| Indiana | Willie | Short Round | John | Sarah | Terminator | Miller | Sarge | Reiben
| Jackson | Upham | Chuckie | Will | Lambeau | Sean | Skylar | Saavik | Spock
| Kirk | Bones | Khan | Kirk | Spock | Sybok | Scotty | Bourne | Pamela | Abbott
| Nicky | Marshall | Korshunov | Troy | Vig | Archie Gates | Doc | Interrogator
| Ellie | Ted | Peter | Drumlin | Joss | Macready | Childs | Nicholas | Conrad
| Feingold | Christine | Adam | Barbara | Delia | Lydia | Cathy | Charles | Otho
| Schaefer | Han | Luke | Leia | Threepio | Vader | Yoda | Lando | Elaine | Striker
| Dr. Rumack | Kramer | David | Saavik | Kirk | Kruge | Holden | Deckard | Rachael
| Batty | Sebastian | Sam | Frodo | Pippin | Gandalf | Kay | Edwards | Laurel
| Edgar | Zed | Jay | Malloy | Plissken | Steve Rogers | Tony Stark | Scott Lang
| Bruce Banner | Bruce | Edward | Two-Face | Batman | Chase | Alfred | Dick
| Riddler | Din Djarin | Greef Karga | Kuiil | Ig-11 | Cara Dune | Peli Motto
| Toro Calican | Ripley | Meredith | Dickie | Marge | Peter | Lambert | Kane
| Dallas | Ripley | Ash | Parker | Threepio | Luke | Leia | Ben | Han | Common Bob
| Common Alice | Jack | Tyler | Marla | Dana | Stantz | Venkman | Spengler | Louis
| Fry | Johns | Riddick | Kirk | Decker | Spock | "Ilia | Indy | Belloq | Marion
| Brother | Allnut | Rose | Qui-Gon | Jar Jar
'''

# TTS model identifier (same value the tts command lines in this file use).
MODEL_NAME = "tts_models/multilingual/multi-dataset/your_tts"
def greet(character, your_voice, message, history):
    """Answer a typed message as ``character`` and synthesise the reply.

    Args:
        character: Selected character name.
        your_voice: Path of the reference audio handed to ``tts --speaker_wav``.
        message: The user's text message.
        history: Session state dict ``{"character", "message_history"}`` or a
            falsy value on the first call.

    Returns:
        ``(chat_html, history, "tts_output.wav")``.
    """
    # gradio's set_state/get_state had problems on embedded html, so the
    # state dict is passed in and returned explicitly.
    history = history or {"character": character, "message_history": []}
    if history["character"] != character:
        # switching character starts a fresh conversation
        history = {"character": character, "message_history": []}

    response = get_chat_response(
        character, history=history["message_history"], input_txt=message
    )

    # Pass arguments as a list (no shell): the reply is model-generated text
    # and may contain quotes or shell metacharacters.
    completed = subprocess.run(
        [
            "tts",
            "--text", str(response),
            "--model_name", "tts_models/multilingual/multi-dataset/your_tts",
            "--speaker_wav", your_voice,
            "--language_idx", "en",
        ],
        check=False,
    )
    if completed.returncode != 0:
        # Best effort, as before: keep serving the chat even if TTS failed.
        print("TTS command failed with exit code", completed.returncode)

    history["message_history"].append((message, response))
    #emotion = get_emotion(response)

    # Escape every piece of user- or model-supplied text before embedding it.
    speaker = html.escape(str(character))
    parts = ["<div class='chatbot'>"]
    for user_msg, resp_msg in history["message_history"]:
        parts.append(f"<div class='user_msg'>You: {html.escape(str(user_msg))}</div>")
        parts.append(f"<div class='resp_msg'>{speaker}: {html.escape(str(resp_msg))}</div>")
    parts.append("</div>")
    return "".join(parts), history, "tts_output.wav"
def greet_stt_to_tts(character, your_voice, history):
    """Transcribe the recording, answer as ``character`` and synthesise the reply.

    Args:
        character: Selected character name.
        your_voice: Recorded audio; it is transcribed by ``stt_record`` and
            also handed to ``tts --speaker_wav`` as the voice reference.
        history: Session state dict ``{"character", "message_history"}`` or a
            falsy value on the first call.

    Returns:
        ``(chat_html, history, "tts_output.wav")``.
    """
    # gradio's set_state/get_state had problems on embedded html, so the
    # state dict is passed in and returned explicitly.
    history = history or {"character": character, "message_history": []}
    if history["character"] != character:
        # switching character starts a fresh conversation
        history = {"character": character, "message_history": []}

    # speech -> text (via stt_record)
    message = stt_record(your_voice)
    response = get_chat_response(
        character, history=history["message_history"], input_txt=message
    )
    print("Response:", response)
    if isinstance(response, tuple):
        # only get first
        response = response[0]
        print("Response only first:", response)

    # Pass arguments as a list (no shell): the reply is model-generated text
    # and may contain quotes or shell metacharacters.
    completed = subprocess.run(
        [
            "tts",
            "--text", str(response),
            "--model_name", "tts_models/multilingual/multi-dataset/your_tts",
            "--speaker_wav", your_voice,
            "--language_idx", "en",
        ],
        check=False,
    )
    if completed.returncode != 0:
        # Best effort, as before: keep serving the chat even if TTS failed.
        print("TTS command failed with exit code", completed.returncode)

    history["message_history"].append((message, response))
    #emotion = get_emotion(response)

    # Escape every piece of user- or model-supplied text before embedding it.
    speaker = html.escape(str(character))
    parts = ["<div class='chatbot'>"]
    for user_msg, resp_msg in history["message_history"]:
        parts.append(f"<div class='user_msg'>You: {html.escape(str(user_msg))}</div>")
        parts.append(f"<div class='resp_msg'>{speaker}: {html.escape(str(resp_msg))}</div>")
    parts.append("</div>")
    return "".join(parts), history, "tts_output.wav"
def greet_textonly(character, message, history):
    """Answer a typed message as ``character`` without any audio.

    Args:
        character: Selected character name.
        message: The user's text message.
        history: Session state dict ``{"character", "message_history"}`` or a
            falsy value on the first call.

    Returns:
        ``(chat_html, history)``.
    """
    # gradio's set_state/get_state had problems on embedded html, so the
    # state dict is passed in and returned explicitly.
    history = history or {"character": character, "message_history": []}
    if history["character"] != character:
        # switching character starts a fresh conversation
        history = {"character": character, "message_history": []}

    response = get_chat_response(
        character, history=history["message_history"], input_txt=message
    )
    history["message_history"].append((message, response))
    #emotion = get_emotion(response)

    # Escape every piece of user- or model-supplied text before embedding it.
    speaker = html.escape(str(character))
    parts = ["<div class='chatbot'>"]
    for user_msg, resp_msg in history["message_history"]:
        parts.append(f"<div class='user_msg'>You: {html.escape(str(user_msg))}</div>")
        parts.append(f"<div class='resp_msg'>{speaker}: {html.escape(str(resp_msg))}</div>")
    parts.append("</div>")
    return "".join(parts), history
# Stylesheet for the chat transcript. The container selector must match the
# class on the wrapper <div class='chatbot'> emitted by the greet* functions;
# it previously targeted ".chatbox", so the flex layout never applied.
css="""
.chatbot {display:flex;flex-direction:column}
.user_msg, .resp_msg {padding:4px;margin-bottom:4px;border-radius:4px;width:80%}
.user_msg {background-color:cornflowerblue;color:white;align-self:start}
.resp_msg {background-color:lightgray;align-self:self-end}
"""

#some selected ones are in for demo use
# (An earlier, shorter list and an earlier `examples` value were assigned and
# then immediately overwritten; only the effective values are kept.)
personality_choices = ["Gandalf", "Riddick", "Macleod", "Morpheus", "Neo","Spock","Vader","Indy", "Ig-11","Threepio","Tony Stark","Batman","Vizzini"]

title = "Movie Chatbot with Coqui YourTTS"
description = "Chat with your favorite movie characters, making characters voice like you. See Coqui Space for more TTS models https://huggingface.co/spaces/coqui/CoquiTTS"
# The Markdown link previously read "https://https://coqui.ai/"; the duplicated
# scheme has been removed so the link resolves.
article = "STT base model from mbarnig/lb-de-fr-en-pt-coqui-stt-models - 🐸 [Coqui.ai](https://coqui.ai/)"

#History not implemented in this demo, use metayazar.com/chatbot for a movie and character dropdown chat interface
##interface = gr.Interface(fn=greet, inputs=[gr.inputs.Dropdown(personality_choices) ,"text"], title=title, description=description, outputs="text")
examples=[['Gandalf','dragon.wav','Who are you sir?',{}]]
history = {"character": "None", "message_history" : [] }
# Tab "Chat Speech -> Speech": the microphone recording is both the message
# (after transcription) and the voice reference.
interface_full = gr.Interface(
    fn=greet_stt_to_tts,
    inputs=[
        gr.Dropdown(personality_choices),
        gr.Audio(source="microphone", type="filepath", label="Record Audio"),
        "state",
    ],
    outputs=["html", "state", gr.Audio(type="filepath")],
    css=css,
    title="Chat with Your Voice",
    description=description,
    article=article,
    live=False,
)

# Tab "Chat with Mic Record": typed message, microphone recording as voice.
interface_mic = gr.Interface(
    fn=greet,
    inputs=[
        gr.Dropdown(personality_choices),
        gr.Audio(source="microphone", type="filepath"),
        "text",
        "state",
    ],
    outputs=["html", "state", gr.Audio(type="filepath")],
    css=css,
    title="Chat with Your Voice",
    description=description,
    article=article,
)

# Tab "Chat Text only": no audio in or out.
interface_text = gr.Interface(
    fn=greet_textonly,
    inputs=[
        gr.Dropdown(personality_choices),
        "text",
        "state",
    ],
    outputs=["html", "state"],
    css=css,
    title="Chat Text Only",
    description=description,
    article=article,
)

# Tab "Chat with Audio Upload": typed message, uploaded file as voice.
interface_file = gr.Interface(
    fn=greet,
    inputs=[
        gr.Dropdown(personality_choices),
        gr.Audio(type="filepath"),
        "text",
        "state",
    ],
    outputs=["html", "state", gr.Audio(type="filepath")],
    css=css,
    title="Chat with Uploaded file",
    description=description,
    article=article,
)

# Tab order and labels are paired positionally.
appinterface = gr.TabbedInterface(
    interface_list=[interface_mic, interface_full, interface_file, interface_text],
    tab_names=[
        "Chat with Mic Record",
        "Chat Speech -> Speech",
        "Chat with Audio Upload",
        "Chat Text only",
    ],
)
appinterface.launch()