# main.py
import spaces
import torch
import torch.nn.functional as F
from torch.nn import DataParallel
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
import threading
import queue
import os
import json
import numpy as np
import gradio as gr
from huggingface_hub import InferenceClient
import openai
from openai import OpenAI
from globalvars import API_BASE, intention_prompt, tasks
from dotenv import load_dotenv
import re

from utils import load_env_variables

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:30'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['CUDA_CACHE_DISABLE'] = '1'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Utils
hf_token, yi_token = load_env_variables()

## use instruct embeddings
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('nvidia/NV-Embed-v1', token=hf_token, trust_remote_code=True)
model = AutoModel.from_pretrained('nvidia/NV-Embed-v1', token=hf_token, trust_remote_code=True).to(device)

## add chroma vector store

## Make intention Mapper
intention_client = OpenAI(
    api_key=yi_token,
    base_url=API_BASE
)


def get_intention(user_input: str) -> str:
    """Map a user message to an intention using the yi-large chat model.

    The original module-level call referenced an undefined `inputext`;
    wrapping it in a function defers the request until input is available.
    """
    completion = intention_client.chat.completions.create(
        model="yi-large",
        messages=[
            {"role": "system", "content": intention_prompt},
            {"role": "user", "content": user_input},
        ],
    )
    # print(completion)
    return completion.choices[0].message.content


# Streaming chat client. The original code called `client.chat_completion`
# without ever defining `client`; the model id below is the Gradio
# ChatInterface template default and should be swapped for your deployment.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # `chunk` avoids shadowing the `message` parameter, and the final streamed
    # delta can carry no content, so guard before concatenating.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:
            response += token
        yield response


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()
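
# ---------------------------------------------------------------------------
# Minimal sketch (not wired into the app): the NV-Embed-v1 model loaded above
# is never actually used in this file. The helper below shows one way it
# could produce embeddings for the planned Chroma vector store. `embed_texts`
# is a hypothetical name; `model.encode(..., instruction=..., max_length=...)`
# follows the NV-Embed-v1 model card's remote-code API, and max_length=4096
# is an assumed value.
# ---------------------------------------------------------------------------
def embed_texts(texts: list[str], instruction: str = "") -> Tensor:
    """Return L2-normalised NV-Embed-v1 embeddings for `texts`."""
    with torch.no_grad():
        embeddings = model.encode(texts, instruction=instruction, max_length=4096)
    # Normalise so that dot products equal cosine similarities.
    return F.normalize(embeddings, p=2, dim=1)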