"""Tokens per Second Simulator.

A Gradio chat demo that echoes a prompt back token by token, so you can
feel how tokens-per-second (TPS) and time-to-first-token (TTFT) settings
change a chat bot's perceived responsiveness.
"""

import time

import gradio as gr
from transformers import AutoTokenizer

# The tokenizer is used only to split the echoed text into realistic LLM tokens.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

starter_text = """# Abstract
Within thirty years, we will have the technological means to create superhuman intelligence. Shortly after, the human era will be ended. Is such progress avoidable? If not to be avoided, can events be guided so that we may survive? These questions are investigated. Some possible answers (and some further dangers) are presented.
"""


def calculate_wait_seconds(tokens_per_second):
    """Delay between tokens needed to hit the requested throughput."""
    return 1 / tokens_per_second


def get_tokens(prompt):
    """Tokenize the prompt and map SentencePiece markers back to plain text."""
    tokens = tokenizer.tokenize(prompt)
    # '▁' marks a leading space and '<0x0A>' is the newline byte token.
    return [t.replace('▁', ' ').replace('<0x0A>', '\n') for t in tokens]


def echo(message, history, prompt, tokens_per_second, time_to_first_token, stream):
    """Replay the prompt token by token at the selected speed.

    `message` and `history` are required by gr.ChatInterface but unused:
    whatever the user submits, the app echoes the prompt textbox instead.
    """
    wait_seconds = calculate_wait_seconds(tokens_per_second)
    tokens = get_tokens(prompt)
    if time_to_first_token:
        time.sleep(time_to_first_token / 1000)  # slider value is in milliseconds
    partial_message = ""
    for new_token in tokens:
        time.sleep(wait_seconds)
        if '<' in new_token:
            # Gradio chat chokes on HTML-like elements
            continue
        partial_message += new_token
        if stream:
            yield partial_message
    if not stream:
        # Non-streaming mode: emit the whole response as one chunk at the end.
        yield partial_message


with gr.Blocks(title='Tokens per Second Simulator') as demo:
    gr.Markdown('# ⏱️ Tokens per Second Simulator')
    gr.Markdown('Compare the feel of different response speeds for a chat bot.')
    gr.Markdown('Reading speeds vary, but in English 5-10 tokens per second is considered a normal reading speed.')
    gr.Markdown(
        'References for further research:\n'
        '- https://www.perplexity.ai/search/How-many-tokens-1d7VyXCDQuWf3pJnK4.0iw?s=c\n'
        '- https://www.databricks.com/blog/llm-inference-performance-engineering-best-practices\n'
        '- https://news.ycombinator.com/item?id=35978864\n'
        '- https://www.reddit.com/r/LocalLLaMA/comments/162pgx9/what_do_yall_consider_acceptable_tokens_per/')
    prompt = gr.Textbox(starter_text, label="Prompt to Echo")
    tps_slider = gr.Slider(1, 50, value=8, label='Tokens per second (TPS)')
    ttft_slider = gr.Slider(0, 5000, value=0, label='Time to first token (TTFT) in milliseconds')
    stream_checkbox = gr.Checkbox(label='Stream Response', value=True)
    gr.ChatInterface(
        echo,
        additional_inputs=[prompt, tps_slider, ttft_slider, stream_checkbox],
        description='Submit any text to echo the prompt above at the selected speed.')

demo.queue().launch()
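
# Usage note, as a minimal sketch (the filename and install line below are
# assumptions, not from the original): with this file saved as app.py, the
# demo can be run locally with
#
#   pip install gradio transformers
#   python app.py
#
# Sanity check of the pacing math: at the default 8 TPS, the per-token delay
# is calculate_wait_seconds(8) == 1 / 8 == 0.125 seconds, plus any TTFT delay
# before the first token appears.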