Commit 9c3e084
Parent(s): 59dc487
theQuert committed

init
Files changed:
- .gitignore (+2, -0)
- README.md (+5, -4)
- app.py (+300, -0)
- requirements.txt (+141, -0)
- util/experiments/classification.csv (+79, -0)
- util/experiments/here_comes_outputs (+1, -0)
- util/experiments/paragraphs_needed.csv (+4, -0)
.gitignore ADDED
@@ -0,0 +1,2 @@
*DS_Store
bart_model
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
 title: Event Triggered Article Updating System
-emoji:
-colorFrom:
-colorTo:
+emoji: 🤗
+colorFrom: purple
+colorTo: indigo
 sdk: gradio
 sdk_version: 3.40.1
 app_file: app.py
@@ -10,4 +10,5 @@ pinned: false
 license: mit
 ---
 
-
+# NetKUp-HF
+Event Triggered Article Updating System on HF
app.py ADDED
@@ -0,0 +1,300 @@
# -*- coding: utf-8 -*-

import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import warnings
import nltk
import random, time
import datetime
# nltk.download("stopwords")  # run once if the stopwords corpus is missing
from nltk.corpus import stopwords
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from sklearn.metrics import classification_report
import transformers
from transformers import BartForSequenceClassification, AdamW, BartTokenizer, get_linear_schedule_with_warmup, pipeline, set_seed
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import TrainingArguments, Trainer
from datasets import Dataset, load_dataset, load_metric
import datasets
from dotenv import load_dotenv
from nltk.tokenize import sent_tokenize
import gradio as gr
import pyperclip
import openai
# from vicuna_generate import *
# from convert_article import *

# Data preprocessing

def text_preprocessing(s):
    """
    - Lowercase the sentence
    - Change "'t" to "not"
    - Remove "@name"
    - Isolate and remove punctuations except "?"
    - Remove other special characters
    - Remove stop words except "not" and "can"
    - Remove trailing whitespace
    """
    s = s.lower()
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'
    s = " ".join([word for word in s.split()
                  if word not in stopwords.words('english')
                  or word in ['not', 'can']])
    # Remove trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()

    return s

# NOTE: this second definition shadows the one above; the lighter variant
# below is the one actually applied at inference time.
def text_preprocessing(text):
    """
    - Remove entity mentions (e.g. '@united')
    - Correct errors (e.g. '&amp;' to '&')
    @param text (str): a string to be processed.
    @return text (str): the processed string.
    """
    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

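# A minimal sketch (hypothetical input, not from the commit) of what the
# active, lighter variant returns:
#   text_preprocessing("Thanks @united   &amp; crew!")
#   -> "Thanks & crew!"
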
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples.)

# Create the learning rate scheduler.

# Function to calculate the accuracy of our predictions vs. labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round(elapsed))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

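# Minimal sketches with hypothetical values (not from the commit):
#   flat_accuracy(np.array([[0.9, 0.1], [0.2, 0.8]]), np.array([0, 0]))  -> 0.5
#   format_time(3661)  -> '1:01:01'
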
def decode(paragraphs_needed):
    # model_ckpt = "facebook/bart-large-cnn"
    tokenizer = AutoTokenizer.from_pretrained("theQuert/NetKUp-tokenzier")
    # pipe = pipeline("summarization", model="bart-decoder", tokenizer=tokenizer)
    pipe = pipeline("summarization", model="hyesunyun/update-summarization-bart-large-longformer", tokenizer=tokenizer)
    contexts = [str(pipe(paragraph)) for paragraph in paragraphs_needed]
    return contexts

def split_article(article, trigger):
    # Replace newlines with an escaped "\c\c" marker, then split on it again:
    # the net effect is splitting the article into paragraphs on "\n".
    if article.split("\n"): article = article.replace("\n", "\\\\c\\\\c")
    paragraphs = article.replace("\\c\\c", "\c\c").split("\\\\c\\\\c")
    # Suffix the trigger so the classifier sees paragraph and news together.
    pars = [str(par) + " -- " + str(trigger) for par in paragraphs]
    # pd.DataFrame({"paragraph": pars}).to_csv("./util/experiments/input_paragraphs.csv")
    return pars

def config():
    load_dotenv()

+
def call_gpt(paragraph, trigger):
|
124 |
+
openai.api_key = os.environ.get("GPT_API")
|
125 |
+
tokenizer = BartTokenizer.from_pretrained("theQuert/NetKUp-tokenzier")
|
126 |
+
inputs_for_gpt = f"""
|
127 |
+
As an article writer, your task is to provide an updated paragraph in the length same as non-updated paragraph based on the given non-updated paragraph and a triggered news.
|
128 |
+
Non-updated paragraph:
|
129 |
+
{paragraph}
|
130 |
+
|
131 |
+
Triggered News:
|
132 |
+
{trigger}
|
133 |
+
"""
|
134 |
+
# merged_with_prompts.append(merged.strip())
|
135 |
+
# pd.DataFrame({"paragraph": merged_with_prompts}).to_csv("./experiments/paragraphs_with_prompts.csv")
|
136 |
+
|
137 |
+
completion = openai.ChatCompletion.create(
|
138 |
+
model = "gpt-3.5-turbo",
|
139 |
+
messages = [
|
140 |
+
{"role": "user", "content": inputs_for_gpt}
|
141 |
+
]
|
142 |
+
)
|
143 |
+
response = completion.choices[0].message.content
|
144 |
+
return str(response)
|
145 |
+
|
146 |
+
def call_vicuna(paragraphs_tirgger):
|
147 |
+
tokenizer = BartTokenizer.from_pretrained("theQuert/NetKUp-tokenzier")
|
148 |
+
merged_with_prompts = []
|
149 |
+
for paragraph in paragraphs:
|
150 |
+
merged = f"""
|
151 |
+
As an article writer, your task is to provide an updated paragraph in the length same as non-updated paragraph based on the given non-updated paragraph and a triggered news.
|
152 |
+
Non-updated paragraph:
|
153 |
+
{paragraph}
|
154 |
+
|
155 |
+
Triggered News:
|
156 |
+
{trigger}
|
157 |
+
"""
|
158 |
+
merged_with_prompts.append(merged.strip())
|
159 |
+
pd.DataFrame({"paragraph": merged_with_prompts}).to_csv("./util/experiments/paragraphs_with_prompts.csv")
|
160 |
+
responses = vicuna_output()
|
161 |
+
return responses
|
162 |
+
|
163 |
+
|
def main(input_article, input_trigger):
    # csv_path = "./util/experiments/input_paragraphs.csv"
    # if os.path.isfile(csv_path):
    #     os.remove(csv_path)
    modified = "TRUE"
    # device = "cuda" if torch.cuda.is_available() else "cpu"
    device = "cpu"
    # tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn', do_lower_case=True)
    tokenizer = AutoTokenizer.from_pretrained('theQuert/NetKUp-tokenzier')
    batch_size = 8
    model = torch.load("./util/bart_model", map_location=torch.device("cpu"))
    optimizer = AdamW(model.parameters(),  # unused at inference; retained from the training setup
                      lr=2e-5,
                      eps=1e-8
                      )

    # split the input article into paragraphs
    data_test = split_article(input_article, input_trigger)

    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    # torch.cuda.manual_seed_all(seed_val)

    input_ids = []
    attention_masks = []
    for sent in data_test:
        encoded_dict = tokenizer.encode_plus(
            text_preprocessing(sent),
            add_special_tokens=True,
            max_length=600,
            padding="max_length",
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    test_dataset = TensorDataset(input_ids, attention_masks)
    test_dataloader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=batch_size
    )

    # Predictions
    predictions = []
    for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():
            output = model(b_input_ids,
                           attention_mask=b_input_mask)
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()
        predictions.extend(list(pred_flat))

    # Write predictions for each paragraph
    pd.DataFrame({"target": predictions}).to_csv('./util/experiments/classification.csv', index=False)
    # single-paragraph inputs are always treated as update-needed
    if len(data_test) == 1: predictions[0] = 1

    # extract ids for update-needed paragraphs (the idx with predicted target == 1)
    pos_ids = [idx for idx in range(len(predictions)) if predictions[idx] == 1]
    neg_ids = [idx for idx in range(len(predictions)) if predictions[idx] == 0]

    # feed the positive paragraphs to the decoder
    paragraphs_needed = [data_test[idx] for idx in pos_ids]
    pd.DataFrame({"paragraph": paragraphs_needed}).to_csv("./util/experiments/paragraphs_needed.csv", index=False)

    # updated_paragraphs = decode(input_paragraph, input_trigger)
    config()
    updated_paragraphs = [call_gpt(paragraph.split(" -- ")[0], input_trigger) for paragraph in paragraphs_needed]
    # updated_paragraphs = call_vicuna(paragraphs_needed, input_trigger)

    # merge updated paragraphs with non-updated paragraphs
    paragraphs_merged = data_test.copy()
    paragraphs_merged = [str(par).split(" -- ")[0] for par in paragraphs_merged]
    for idx in range(len(pos_ids)):
        paragraphs_merged[pos_ids[idx]] = updated_paragraphs[idx]

    sep = "\n"
    # paragraphs_merged = ["".join(par.split(" -- ")[:-1]) for par in paragraphs_merged]
    updated_article = str(sep.join(paragraphs_merged))
    updated_article = updated_article.replace("[{'summary_text': '", "").replace("'}]", "").strip()
    class_res = pd.read_csv("./util/experiments/classification.csv")
    if (class_res.target.values == 0).all(): modified = "FALSE"

    if len(data_test) == 1:
        modified = "TRUE"
        updated_article = call_gpt(input_article, input_trigger)
    with open("./util/experiments/updated_article.txt", "w") as f:
        f.write(updated_article)

    # combine the predictions and paragraphs into a csv file
    merged_par_pred_df = pd.DataFrame({"paragraphs": data_test, "predictions": predictions}).to_csv("./util/experiments/par_with_class.csv")
    # return updated_article, modified, merged_par_pred_df
    modified_in_all = str(len(paragraphs_needed)) + " / " + str(len(data_test))
    return updated_article, modified_in_all

def copy_to_clipboard(t):
    with open("./util/experiments/updated_article.txt", "r") as f:
        t = f.read()
        pyperclip.copy(t)

demo = gr.Interface(
    main,
    [
        gr.Textbox(
            lines=2, label="Non-updated Article", placeholder="Input the article..."
        ),
        gr.Textbox(
            lines=2, label="Triggered News Event", placeholder="Input the triggered news event..."
        )
    ],
    [
        gr.Textbox(
            lines=25,
            label="Output",
        ),
        gr.Textbox(
            lines=1,
            label="#MODIFIED/ALL"
        ),
        # btn = gr.Button(value="Copy Updated Article to Clipboard")
        # btn.click(copy_to_clipboard)
        # gr.components.Button(value="Copy Updated Article to Clipboard", fn=copy_to_clipboard),
    ],
    title="Event Triggered Article Updating System",
    description="Powered by YTLee",
)

demo.launch()
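For reference, a minimal sketch of exercising the entry point directly, bypassing the Gradio UI (hypothetical inputs; assumes the ./util/bart_model checkpoint is present and a GPT_API key is set in .env):

    # hypothetical direct call to main() as defined in app.py above
    updated_article, modified_in_all = main(
        input_article="First paragraph.\nSecond paragraph.",
        input_trigger="Some triggered news event.",
    )
    print(modified_in_all)  # e.g. "1 / 2"
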
requirements.txt ADDED
@@ -0,0 +1,141 @@
absl-py==1.4.0
accelerate==0.15.0
aiofiles==23.2.1
aiohttp==3.8.5
aiosignal==1.3.1
altair==5.0.1
annotated-types==0.5.0
anyio==3.7.1
appdirs==1.4.4
async-timeout==4.0.3
attrs==23.1.0
bitsandbytes==0.37.0
cachetools==5.3.1
certifi==2023.7.22
charset-normalizer==3.2.0
click==8.1.6
cmake==3.27.1
contourpy==1.1.0
cycler==0.11.0
datasets
deepspeed==0.8.3
dill
docker-pycreds==0.4.0
einops==0.6.1
evaluate==0.4.0
exceptiongroup==1.1.2
fairscale==0.4.13
fastapi==0.101.0
ffmpy==0.3.1
filelock==3.12.2
fonttools==4.42.0
frozenlist==1.4.0
fsspec==2023.6.0
gitdb==4.0.10
GitPython==3.1.32
gradio==3.20.0
gradio-client==0.4.0
grpcio==1.57.0
h11==0.14.0
hjson==3.1.0
httpcore==0.17.3
httpx==0.24.1
huggingface-hub==0.13.3
idna==3.4
importlib-metadata==6.8.0
importlib-resources==6.0.1
Jinja2==3.1.2
joblib==1.3.2
jsonschema==4.19.0
jsonschema-specifications==2023.7.1
kiwisolver==1.4.4
linkify-it-py==2.0.2
lit==16.0.6
loralib==0.1.1
Markdown==3.4.4
MarkupSafe==2.1.3
matplotlib==3.7.2
mdit-py-plugins==0.3.3
mdurl==0.1.2
msgpack==1.0.5
multidict==6.0.4
multiprocess==0.70.15
networkx==3.1
ninja==1.11.1
nltk==3.6.1
numpy==1.24.4
nvitop==1.0.0
oauthlib==3.2.2
openai==0.27.8
orjson==3.9.4
packaging==23.1
pandas==2.0.3
pathtools==0.1.2
peft==0.3.0
Pillow==10.0.0
pkgutil-resolve-name==1.3.10
protobuf==4.24.0
psutil==5.9.5
py-cpuinfo==9.0.0
pyarrow==12.0.1
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycryptodome==3.18.0
pydantic==1.10.2
pydantic-core==2.4.0
pydub==0.25.1
Pygments==2.16.1
pyparsing==3.0.9
pyperclip==1.8.2
python-dateutil==2.8.2
python-dotenv==1.0.0
python-multipart==0.0.6
pytz==2023.3
PyYAML==6.0.1
ray==2.6.2
referencing==0.30.2
regex==2023.8.8
requests==2.31.0
requests-oauthlib==1.3.1
responses==0.18.0
rich==13.5.2
rpds-py==0.9.2
rsa==4.9
scikit-learn==1.3.0
scipy==1.10.1
semantic-version==2.10.0
sentencepiece==0.1.96
sentry-sdk==1.29.2
setproctitle==1.3.2
six==1.16.0
smmap==5.0.0
sniffio==1.3.0
starlette==0.27.0
tabulate==0.9.0
tensorboard==2.12.0
tensorboard-data-server==0.7.1
tensorboard-plugin-wit==1.8.1
termcolor==2.3.0
texttable==1.6.7
threadpoolctl==3.2.0
tokenizers==0.13.2
toolz==0.12.0
torch==1.13.1
torchtyping==0.1.4
torchvision==0.14.1
tqdm==4.65.0
transformers==4.28.0
triton==2.0.0
typeguard==4.1.0
typing-extensions
tzdata==2023.3
uc-micro-py==1.0.2
urllib3==2.0.4
uvicorn==0.23.2
wandb==0.13.10
websockets==11.0.3
Werkzeug==2.3.6
xxhash==3.3.0
yarl==1.9.2
zipp==3.16.2
util/experiments/classification.csv ADDED
@@ -0,0 +1,79 @@
target
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
0
0
0
0
0
0
0
0
util/experiments/here_comes_outputs ADDED
@@ -0,0 +1 @@
metadata
util/experiments/paragraphs_needed.csv ADDED
@@ -0,0 +1,4 @@
paragraph
"On 2 August there were 15 new cases of COVID-19, 2 overseas acquired. Consequently, the South-east Queensland’s lockdown was extended until 4:00pm on 8 August (Sunday). The same day, because of the extension, the Ekka agricultural show was cancelled for the second year, 5 days before it was to be open to the public from 7 August (Saturday). [ADD] <Timeline - Brisbane lockdowns> -- 'Cairns and Yarrabah enter a snap three-day lockdown after an ""unexpected"" case of COVID-19 was reported in a taxi driver from Kanimbla who was infectious in Far North Queensland for 10 days. \n'"
"On 8 August the lockdown in SE Queensland ended, though some restrictions remained in force, including mandatory wearing of masks. This was due to an ""unexpected"" case of COVID-19, a taxi driver who was infectious in the community for ten days. [ADD] <Timeline - Brisbane lockdowns> -- 'Cairns and Yarrabah enter a snap three-day lockdown after an ""unexpected"" case of COVID-19 was reported in a taxi driver from Kanimbla who was infectious in Far North Queensland for 10 days. \n'"
"On [ADD] 9 August, Cairns went into lockdown from 4:00pm for three days. The next festival event is scheduled for Saturday, 22 May 2021. <Event cancellations> -- 'Cairns and Yarrabah enter a snap three-day lockdown after an ""unexpected"" case of COVID-19 was reported in a taxi driver from Kanimbla who was infectious in Far North Queensland for 10 days. \n'"
|