Spaces:
Sleeping
Sleeping
Yurii Paniv
committed on
Commit
•
d36cdc2
1
Parent(s):
cb99192
Add logging
Browse files- app.py +49 -2
- data_logger.py +41 -0
app.py
CHANGED
@@ -13,6 +13,40 @@ from threading import Thread
|
|
13 |
from torch import float16
|
14 |
import spaces
|
15 |
import huggingface_hub
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
|
18 |
config = PeftConfig.from_pretrained("lang-uk/dragoman")
|
@@ -37,9 +71,12 @@ tokenizer = AutoTokenizer.from_pretrained(
|
|
37 |
|
38 |
@spaces.GPU(duration=30)
|
39 |
def translate(input_text):
|
40 |
-
generated_text = ""
|
41 |
input_text = input_text.strip()
|
42 |
|
|
|
|
|
|
|
43 |
input_text = f"[INST] {input_text} [/INST]"
|
44 |
inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
|
45 |
|
@@ -78,6 +115,10 @@ desc_file = huggingface_hub.hf_hub_download("lang-uk/dragoman", "README.md")
|
|
78 |
with open(desc_file, "r") as f:
|
79 |
model_description = f.read()
|
80 |
model_description = model_description[model_description.find("---", 1) + 5 :]
|
|
|
|
|
|
|
|
|
81 |
|
82 |
|
83 |
iface = gr.Interface(
|
@@ -91,10 +132,16 @@ iface = gr.Interface(
|
|
91 |
label="Translated sentence",
|
92 |
),
|
93 |
examples=[
|
|
|
|
|
|
|
94 |
[
|
95 |
"ChatGPT (Chat Generative Pre-trained Transformer) is a chatbot developed by OpenAI and launched on November 30, 2022. Based on a large language model, it enables users to refine and steer a conversation towards a desired length, format, style, level of detail, and language. Successive prompts and replies, known as prompt engineering, are considered at each conversation stage as a context.[2] ",
|
|
|
|
|
96 |
"who holds this neighborhood?",
|
97 |
-
]
|
|
|
98 |
],
|
99 |
title="Dragoman: SOTA English-Ukrainian translation model",
|
100 |
description='This demo contains a model from paper "Setting up the Data Printer with Improved English to Ukrainian Machine Translation", accepted to UNLP 2024 workshop at the LREC-COLING 2024 conference.',
|
|
|
13 |
from torch import float16
|
14 |
import spaces
|
15 |
import huggingface_hub
|
16 |
+
from threading import Thread
|
17 |
+
from queue import Queue
|
18 |
+
from time import sleep
|
19 |
+
from os import getenv
|
20 |
+
from data_logger import log_data
|
21 |
+
|
22 |
+
|
23 |
+
def check_thread(logging_queue: Queue):
    """Periodically flush queued log rows through the ``log_data`` callback.

    The upload callback is built once via ``log_data``, configured from the
    ``HF_API_TOKEN`` and ``OUTPUT_DATASET`` environment variables. The
    function then loops forever: sleep 60 seconds, drain everything currently
    in the queue, and hand the batch to the callback. If the callback raises,
    the whole batch is put back on the queue so it is retried next cycle.

    Args:
        logging_queue: Queue holding the rows to log. Each item is passed to
            the callback as one row of the batch.

    This function never returns; it is intended to be a thread target.
    """
    # Local import keeps the block self-contained: the file only imports
    # ``Queue`` from the ``queue`` module.
    from queue import Empty

    logging_callback = log_data(
        hf_token=getenv("HF_API_TOKEN"),
        dataset_name=getenv("OUTPUT_DATASET"),
        private=True,
    )
    while True:
        sleep(60)

        # Drain with get_nowait(): unlike empty() followed by get(), this can
        # never block, whatever else touches the queue.
        batch = []
        while True:
            try:
                batch.append(logging_queue.get_nowait())
            except Empty:
                break

        if not batch:
            continue

        try:
            logging_callback(batch)
        except Exception as exc:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt/SystemExit still propagate, and report the
            # failure instead of hiding it.
            print(
                "Error happened while pushing data to HF. "
                f"Putting items back in queue... ({exc!r})"
            )
            for item in batch:
                logging_queue.put(item)
|
44 |
+
|
45 |
+
|
46 |
+
# Input logging is opt-in: the queue and its worker thread are only created
# when HF_API_TOKEN is set in the environment.
if getenv("HF_API_TOKEN") is not None:
    log_queue = Queue()
    # check_thread loops forever, so run it as a daemon thread; a non-daemon
    # thread would keep the interpreter alive after the main thread finishes.
    t = Thread(target=check_thread, args=(log_queue,), daemon=True)
    t.start()
|
50 |
|
51 |
|
52 |
config = PeftConfig.from_pretrained("lang-uk/dragoman")
|
|
|
71 |
|
72 |
@spaces.GPU(duration=30)
|
73 |
def translate(input_text):
|
74 |
+
# generated_text = ""
|
75 |
input_text = input_text.strip()
|
76 |
|
77 |
+
if getenv("HF_API_TOKEN") is not None:
|
78 |
+
log_queue.put([input_text])
|
79 |
+
|
80 |
input_text = f"[INST] {input_text} [/INST]"
|
81 |
inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
|
82 |
|
|
|
115 |
with open(desc_file, "r") as f:
|
116 |
model_description = f.read()
|
117 |
model_description = model_description[model_description.find("---", 1) + 5 :]
|
118 |
+
model_description = (
|
119 |
+
"""### By using this service, users are required to agree to the following terms: you agree that user input will be collected for future research and model improvements. \n\n"""
|
120 |
+
+ model_description
|
121 |
+
)
|
122 |
|
123 |
|
124 |
iface = gr.Interface(
|
|
|
132 |
label="Translated sentence",
|
133 |
),
|
134 |
examples=[
|
135 |
+
[
|
136 |
+
"How many leaves would it drop in a month of February in a non-leap year?",
|
137 |
+
],
|
138 |
[
|
139 |
"ChatGPT (Chat Generative Pre-trained Transformer) is a chatbot developed by OpenAI and launched on November 30, 2022. Based on a large language model, it enables users to refine and steer a conversation towards a desired length, format, style, level of detail, and language. Successive prompts and replies, known as prompt engineering, are considered at each conversation stage as a context.[2] ",
|
140 |
+
],
|
141 |
+
[
|
142 |
"who holds this neighborhood?",
|
143 |
+
],
|
144 |
+
|
145 |
],
|
146 |
title="Dragoman: SOTA English-Ukrainian translation model",
|
147 |
description='This demo contains a model from paper "Setting up the Data Printer with Improved English to Ukrainian Machine Translation", accepted to UNLP 2024 workshop at the LREC-COLING 2024 conference.',
|
data_logger.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from gradio import utils
|
2 |
+
import os
|
3 |
+
import csv
|
4 |
+
import huggingface_hub
|
5 |
+
|
6 |
+
|
7 |
+
def log_data(hf_token: str, dataset_name: str, private=True):
    """Prepare a dataset repo on the Hub and return a batch-logging callback.

    Creates the dataset repository (``exist_ok=True``, so an existing repo is
    accepted), clones it into ``flagged/<dataset_name>`` and pulls it once.

    Args:
        hf_token: Token passed to ``create_repo`` and ``Repository``.
        dataset_name: Repo id of the dataset; also used as the local
            sub-directory name under ``flagged``.
        private: Forwarded to ``create_repo``.

    Returns:
        A function ``log_function(data)`` that appends every row of ``data``
        to ``dragoman_logs.csv`` in the local clone, pushes the clone to the
        Hub and returns the number of CSV rows minus one.
    """
    repo_url = huggingface_hub.create_repo(
        repo_id=dataset_name,
        token=hf_token,
        private=private,
        repo_type="dataset",
        exist_ok=True,
    )
    local_dir = os.path.join("flagged", dataset_name)
    repo = huggingface_hub.Repository(
        local_dir=local_dir,
        clone_from=repo_url,
        use_auth_token=hf_token,
    )
    repo.git_pull(lfs=True)
    csv_path = os.path.join(local_dir, "dragoman_logs.csv")

    def log_function(data):
        # Pull first so rows are appended on top of the latest remote state.
        repo.git_pull(lfs=True)

        with open(csv_path, "a", newline="", encoding="utf-8") as out_file:
            csv.writer(out_file).writerows(
                utils.sanitize_list_for_csv(row) for row in data
            )

        # NOTE(review): the "- 1" presumably discounts a header row, but
        # nothing in this function writes one - confirm against the dataset.
        with open(csv_path, "r", encoding="utf-8") as in_file:
            row_total = sum(1 for _ in csv.reader(in_file)) - 1

        repo.push_to_hub(commit_message="Flagged sample #{}".format(row_total))
        return row_total

    return log_function
|