vilarin committed on
Commit
85585d6
·
verified ·
1 Parent(s): 8716f81

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -30
app.py CHANGED
@@ -1,27 +1,86 @@
1
- import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import copy
3
  import gradio as gr
4
  import spaces
5
- from llama_cpp import Llama
6
- import llama_cpp.llama_tokenizer
7
- import os
8
- from huggingface_hub import hf_hub_download
9
 
10
 
11
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
12
- MODEL_ID = "google/gemma-2-27b-it"
13
- REPO_ID = "bartowski/gemma-2-27b-it-GGUF"
14
  MODEL_NAME = MODEL_ID.split("/")[-1]
15
- MODEL_FILE = "gemma-2-27b-it-Q4_K_M.gguf"
16
 
17
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
18
 
19
- llm = llama_cpp.Llama.from_pretrained(
20
- repo_id=REPO_ID,
21
- filename=MODEL_FILE,
22
- tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(MODEL_ID),
23
- verbose=False,
24
- )
25
 
26
  TITLE = "<h1><center>Chatbox</center></h1>"
27
 
@@ -49,31 +108,33 @@ h3 {
49
 
50
 
51
  @spaces.GPU(duration=90)
52
- def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float):
53
  print(f'message is - {message}')
54
  print(f'history is - {history}')
55
  conversation = []
56
  for prompt, answer in history:
57
- conversation.extend([{"role": "user", "content": prompt}, {"role": "assistant", "content": answer}])
58
- conversation.append({"role": "user", "content": message})
 
 
 
 
 
59
 
60
  print(f"Conversation is -\n{conversation}")
61
-
62
- output = llm(
63
- messages=conversation,
64
- top_k=top_k,
65
  top_p=top_p,
 
66
  repeat_penalty=penalty,
67
- max_tokens=max_new_tokens,
68
- stream =True,
69
- temperature=temperature,
70
  )
71
-
72
- for out in output:
73
- stream = copy.deepcopy(out)
74
- temp += stream["choices"][0]["text"]
75
- yield temp
76
 
 
 
77
 
78
 
79
  chatbot = gr.Chatbot(height=600)
@@ -101,7 +162,7 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
101
  maximum=2048,
102
  step=1,
103
  value=1024,
104
- label="Max Tokens",
105
  render=False,
106
  ),
107
  gr.Slider(
 
1
# Ollama model tag that is pulled and served further down in this script.
model_name = "gemma2:27b"

import os

# Install lshw and Ollama at start-up.
# NOTE(review): lshw is presumably wanted by the Ollama installer for GPU
# detection - confirm.
# -y keeps apt from stopping at a confirmation prompt when no terminal is
# attached.
os.system("sudo apt install -y lshw")
# -f: fail on HTTP errors instead of piping an error page into sh;
# -sS: quiet progress output but still report errors; -L: follow redirects.
os.system("curl -fsSL https://ollama.ai/install.sh | sh")

import nest_asyncio
nest_asyncio.apply()

import asyncio

# Run Async Ollama
# Taken from: https://stackoverflow.com/questions/77697302/how-to-run-ollama-in-google-colab
# NB: Depending on which backend you are running, you may need to adjust the
# variables below to get CUDA working.
# Make the CUDA binaries reachable through PATH.
os.environ['PATH'] += ':/usr/local/cuda/bin'
# Set LD_LIBRARY_PATH to include both /usr/lib64-nvidia and CUDA lib directories
os.environ['LD_LIBRARY_PATH'] = '/usr/lib64-nvidia:/usr/local/cuda/lib64'
22
+
23
async def run_process(cmd):
    """Run *cmd* as a subprocess and echo its stdout and stderr line by line.

    Args:
        cmd: Sequence holding the program followed by its arguments,
            e.g. ``['ollama', 'serve']``.

    Returns:
        The exit code of the subprocess, available once both output streams
        have reached EOF and the process has terminated.
    """
    print('>>> starting', *cmd)
    process = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )

    async def pipe(lines):
        # Forward each line as it arrives; undecodable bytes are replaced so
        # a stray non-UTF-8 byte cannot abort the forwarding loop.
        async for line in lines:
            print(line.decode(errors='replace').strip())

    # Drain both streams concurrently so neither pipe fills up and blocks
    # the child. One pass is enough: the streams are at EOF afterwards.
    await asyncio.gather(
        pipe(process.stdout),
        pipe(process.stderr),
    )

    # Reap the child so its exit status is collected and can be reported.
    return await process.wait()
43
+
44
+ import asyncio
45
+ import threading
46
+
47
async def start_ollama_serve():
    """Launch ``ollama serve`` through run_process and await its completion."""
    serve_cmd = ['ollama', 'serve']
    await run_process(serve_cmd)
49
+
50
def run_async_in_thread(loop, coro):
    """Run *coro* to completion on *loop*; meant to be used as a thread target.

    Args:
        loop: Event loop to install as the calling thread's current loop and
            to run the coroutine on. It is closed before this function
            returns.
        coro: Coroutine object to execute.

    Raises:
        Whatever exception *coro* raises; the loop is still closed.
    """
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(coro)
    finally:
        # Close the loop even when the coroutine fails, so its resources are
        # not leaked.
        loop.close()
54
+
55
# Create a new event loop that will run in a new thread
new_loop = asyncio.new_event_loop()

# Start ollama serve in a separate thread so the rest of the script is not
# blocked while the server runs.
thread = threading.Thread(target=run_async_in_thread, args=(new_loop, start_ollama_serve()))
thread.start()

# Give the server time to come up before talking to it: `ollama pull` needs a
# running server, and the thread above has only just been started.
# NOTE(review): 127.0.0.1:11434 is Ollama's default listen address; adjust if
# OLLAMA_HOST is overridden in this environment.
import socket
import time

ollama_ready_deadline = time.monotonic() + 60
while time.monotonic() < ollama_ready_deadline:
    try:
        with socket.create_connection(("127.0.0.1", 11434), timeout=1):
            break
    except OSError:
        time.sleep(0.5)

# Load up model
os.system(f"ollama pull {model_name}")
65
+
66
+
67
  import copy
68
  import gradio as gr
69
  import spaces
70
+ from llama_index.llms.ollama import Ollama
71
+ import llama_index
72
+ from llama_index.core.llms import ChatMessage
 
73
 
74
 
75
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
76
MODEL_ID_LIST = ["google/gemma-2-27b-it"]
# MODEL_ID is still read below; derive it from the list so the module does
# not fail with NameError at import time.
MODEL_ID = MODEL_ID_LIST[0]
MODEL_NAME = MODEL_ID.split("/")[-1]

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Client for the Ollama server started earlier in this script.
gemma2 = Ollama(model=model_name, request_timeout=30.0)
83
+
 
 
 
84
 
85
  TITLE = "<h1><center>Chatbox</center></h1>"
86
 
 
108
 
109
 
110
  @spaces.GPU(duration=90)
111
def stream_chat(message: str, history: list, temperature: float, context_window: int, top_p: float, top_k: int, penalty: float):
    """Stream the model's reply to *message*, given the earlier chat turns.

    Args:
        message: The new user message.
        history: Earlier turns as ``(user_prompt, assistant_answer)`` pairs.
        temperature: Sampling temperature for this request.
        context_window: Context size for this request.
        top_p: Nucleus-sampling cutoff.
        top_k: Top-k sampling cutoff.
        penalty: Repeat penalty.

    Yields:
        The reply text accumulated so far, growing with every chunk.
    """
    print(f'message is - {message}')
    print(f'history is - {history}')
    conversation = []
    for prompt, answer in history:
        conversation.extend([
            ChatMessage(role="user", content=prompt),
            ChatMessage(role="assistant", content=answer),
        ])
    # The new user turn goes last in the same message list; the LLM-level
    # stream_chat takes one `messages` sequence, not message + chat_history.
    conversation.append(ChatMessage(role="user", content=message))

    print(f"Conversation is -\n{conversation}")

    # Sampling options are configured on the client rather than passed as
    # call kwargs, so build a per-request client that reuses the shared
    # model name and timeout.
    llm = Ollama(
        model=gemma2.model,
        request_timeout=gemma2.request_timeout,
        temperature=temperature,
        context_window=context_window,
        additional_kwargs={
            "top_p": top_p,
            "top_k": top_k,
            "repeat_penalty": penalty,
        },
    )

    # Yield the running text, not just the latest delta, matching what the
    # previous implementation of this function yielded.
    reply = ""
    for chunk in llm.stream_chat(messages=conversation):
        reply += chunk.delta or ""
        yield reply
138
 
139
 
140
  chatbot = gr.Chatbot(height=600)
 
162
  maximum=2048,
163
  step=1,
164
  value=1024,
165
+ label="Context window",
166
  render=False,
167
  ),
168
  gr.Slider(