# main.py
import spaces
import torch
import torch.nn.functional as F
from torch.nn import DataParallel
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
import threading
import queue
import os
import json
import numpy as np
import gradio as gr
from huggingface_hub import InferenceClient
import openai
from openai import OpenAI
from globalvars import API_BASE, API_KEY, intention_prompt
from dotenv import load_dotenv

# Conservative CUDA settings: small allocation splits plus synchronous, uncached
# kernel launches make GPU memory errors easier to trace (at some performance cost).
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:30'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['CUDA_CACHE_DISABLE'] = '1'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_env_variables():
    # Load the .env file
    load_dotenv()

    # Retrieve the environment variables
    hf_token = os.getenv('HF_TOKEN')
    yi_token = os.getenv('YI_TOKEN')

    return hf_token, yi_token

# Load API tokens from the environment
hf_token, yi_token = load_env_variables()

## TODO: add chroma vector store (see the sketch after the model load below)


## use instruct embeddings
# Load the tokenizer and embedding model
tokenizer = AutoTokenizer.from_pretrained('nvidia/NV-Embed-v1', token=hf_token, trust_remote_code=True)
model = AutoModel.from_pretrained('nvidia/NV-Embed-v1', token=hf_token, trust_remote_code=True).to(device)
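# ---------------------------------------------------------------------------
# Hedged sketch for the "add chroma vector store" TODO above. Assumptions:
# the `chromadb` package is installed, and NV-Embed-v1's remote code exposes
# the `encode(texts, instruction=..., max_length=...)` helper described on its
# model card. Collection and function names here are illustrative placeholders.
# ---------------------------------------------------------------------------
import chromadb

chroma_client = chromadb.Client()  # in-memory store; chromadb.PersistentClient(path=...) persists to disk
collection = chroma_client.get_or_create_collection(name="documents")

def embed_texts(texts: list[str], instruction: str = "") -> list[list[float]]:
    # Instruct-style embedding: queries get an instruction prefix, passages use
    # an empty instruction, per the NV-Embed usage pattern.
    with torch.no_grad():
        embeddings = model.encode(texts, instruction=instruction, max_length=4096)
    return F.normalize(embeddings, p=2, dim=1).cpu().tolist()

def index_documents(docs: list[str]) -> None:
    # Store the raw documents alongside their embeddings so retrieval returns text.
    collection.add(
        ids=[str(i) for i in range(len(docs))],
        documents=docs,
        embeddings=embed_texts(docs),
    )

def retrieve(query: str, k: int = 3) -> list[str]:
    query_embedding = embed_texts(
        [query],
        instruction="Instruct: Given a question, retrieve passages that answer the question.\nQuery: ",
    )
    results = collection.query(query_embeddings=query_embedding, n_results=k)
    return results["documents"][0]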


## Intention mapper: route each message through the Yi API to classify its intent

intention_client = OpenAI(
    api_key=yi_token,
    base_url=API_BASE
)

def get_intention(user_input: str) -> str:
    # Run the user's message through the intention prompt and return the
    # model's classification so it can steer the main chat response.
    intention_completion = intention_client.chat.completions.create(
        model="yi-large",
        messages=[
            {"role": "system", "content": intention_prompt},
            {"role": "user", "content": user_input},
        ],
    )
    return intention_completion.choices[0].message.content
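# Hypothetical usage of the intention mapper (sample text is illustrative only):
# intent = get_intention("How do I index my PDFs for search?")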

# Hugging Face inference client used for the streaming chat completion below.
# NOTE: the model id is a placeholder (the Gradio ChatInterface template default);
# replace it with the intended chat endpoint.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta", token=hf_token)

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content

        # Some stream chunks carry an empty/None delta; skip them so we never
        # concatenate None into the response.
        if token:
            response += token
        yield response

"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)


if __name__ == "__main__":
    demo.launch()