Commit 9c3e084
Parent(s): 59dc487
theQuert committed

init
Files changed:
- .gitignore (+2, -0)
- README.md (+5, -4)
- app.py (+300, -0)
- requirements.txt (+141, -0)
- util/experiments/classification.csv (+79, -0)
- util/experiments/here_comes_outputs (+1, -0)
- util/experiments/paragraphs_needed.csv (+4, -0)
.gitignore ADDED
@@ -0,0 +1,2 @@
*DS_Store
bart_model
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
 title: Event Triggered Article Updating System
-emoji:
-colorFrom:
-colorTo:
+emoji: 🤗
+colorFrom: purple
+colorTo: indigo
 sdk: gradio
 sdk_version: 3.40.1
 app_file: app.py
@@ -10,4 +10,5 @@ pinned: false
 license: mit
 ---
 
-
+# NetKUp-HF
+Event Triggered Article Updating System on HF
app.py ADDED
@@ -0,0 +1,300 @@
# -*- coding: utf-8 -*-

import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import warnings
import nltk
import random, time
import datetime
# nltk.download("stopwords")  # run once if the stopwords corpus is missing
from nltk.corpus import stopwords
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from sklearn.metrics import classification_report
import transformers
from transformers import BartForSequenceClassification, AdamW, BartTokenizer, get_linear_schedule_with_warmup, pipeline, set_seed
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import TrainingArguments, Trainer
from datasets import Dataset, load_dataset, load_metric
import datasets
from dotenv import load_dotenv
from nltk.tokenize import sent_tokenize
import gradio as gr
import pyperclip
import openai
# from vicuna_generate import *
# from convert_article import *

# Data preprocessing

def text_preprocessing(s):
    """
    - Lowercase the sentence
    - Change "'t" to "not"
    - Remove "@name"
    - Isolate and remove punctuations except "?"
    - Remove other special characters
    - Remove stop words except "not" and "can"
    - Remove trailing whitespace
    """
    s = s.lower()
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'
    s = " ".join([word for word in s.split()
                  if word not in stopwords.words('english')
                  or word in ['not', 'can']])
    # Remove trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()

    return s

# NOTE: this second definition shadows the one above; the lighter variant
# below is the one actually applied at inference time.
def text_preprocessing(text):
    """
    - Remove entity mentions (e.g. '@united')
    - Correct errors (e.g. '&amp;' to '&')
    @param text (str): a string to be processed.
    @return text (str): the processed string.
    """
    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

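# A minimal sketch (hypothetical input, not from the commit) of what the
# active, lighter variant returns:
#   text_preprocessing("Thanks @united   &amp; crew!")
#   -> "Thanks & crew!"
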
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples.)

# Create the learning rate scheduler.

# Function to calculate the accuracy of our predictions vs. labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round(elapsed))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

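# Minimal sketches with hypothetical values (not from the commit):
#   flat_accuracy(np.array([[0.9, 0.1], [0.2, 0.8]]), np.array([0, 0]))  -> 0.5
#   format_time(3661)  -> '1:01:01'
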
def decode(paragraphs_needed):
    # model_ckpt = "facebook/bart-large-cnn"
    tokenizer = AutoTokenizer.from_pretrained("theQuert/NetKUp-tokenzier")
    # pipe = pipeline("summarization", model="bart-decoder", tokenizer=tokenizer)
    pipe = pipeline("summarization", model="hyesunyun/update-summarization-bart-large-longformer", tokenizer=tokenizer)
    contexts = [str(pipe(paragraph)) for paragraph in paragraphs_needed]
    return contexts

def split_article(article, trigger):
    # Replace newlines with an escaped "\c\c" marker, then split on it again:
    # the net effect is splitting the article into paragraphs on "\n".
    if article.split("\n"): article = article.replace("\n", "\\\\c\\\\c")
    paragraphs = article.replace("\\c\\c", "\c\c").split("\\\\c\\\\c")
    # Suffix the trigger so the classifier sees paragraph and news together.
    pars = [str(par) + " -- " + str(trigger) for par in paragraphs]
    # pd.DataFrame({"paragraph": pars}).to_csv("./util/experiments/input_paragraphs.csv")
    return pars

def config():
    load_dotenv()

+
def call_gpt(paragraph, trigger):
|
124 |
+
openai.api_key = os.environ.get("GPT_API")
|
125 |
+
tokenizer = BartTokenizer.from_pretrained("theQuert/NetKUp-tokenzier")
|
126 |
+
inputs_for_gpt = f"""
|
127 |
+
As an article writer, your task is to provide an updated paragraph in the length same as non-updated paragraph based on the given non-updated paragraph and a triggered news.
|
128 |
+
Non-updated paragraph:
|
129 |
+
{paragraph}
|
130 |
+
|
131 |
+
Triggered News:
|
132 |
+
{trigger}
|
133 |
+
"""
|
134 |
+
# merged_with_prompts.append(merged.strip())
|
135 |
+
# pd.DataFrame({"paragraph": merged_with_prompts}).to_csv("./experiments/paragraphs_with_prompts.csv")
|
136 |
+
|
137 |
+
completion = openai.ChatCompletion.create(
|
138 |
+
model = "gpt-3.5-turbo",
|
139 |
+
messages = [
|
140 |
+
{"role": "user", "content": inputs_for_gpt}
|
141 |
+
]
|
142 |
+
)
|
143 |
+
response = completion.choices[0].message.content
|
144 |
+
return str(response)
|
145 |
+
|
146 |
+
def call_vicuna(paragraphs_tirgger):
|
147 |
+
tokenizer = BartTokenizer.from_pretrained("theQuert/NetKUp-tokenzier")
|
148 |
+
merged_with_prompts = []
|
149 |
+
for paragraph in paragraphs:
|
150 |
+
merged = f"""
|
151 |
+
As an article writer, your task is to provide an updated paragraph in the length same as non-updated paragraph based on the given non-updated paragraph and a triggered news.
|
152 |
+
Non-updated paragraph:
|
153 |
+
{paragraph}
|
154 |
+
|
155 |
+
Triggered News:
|
156 |
+
{trigger}
|
157 |
+
"""
|
158 |
+
merged_with_prompts.append(merged.strip())
|
159 |
+
pd.DataFrame({"paragraph": merged_with_prompts}).to_csv("./util/experiments/paragraphs_with_prompts.csv")
|
160 |
+
responses = vicuna_output()
|
161 |
+
return responses
|
162 |
+
|
163 |
+
|
def main(input_article, input_trigger):
    # csv_path = "./util/experiments/input_paragraphs.csv"
    # if os.path.isfile(csv_path):
    #     os.remove(csv_path)
    modified = "TRUE"
    # device = "cuda" if torch.cuda.is_available() else "cpu"
    device = "cpu"
    # tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn', do_lower_case=True)
    tokenizer = AutoTokenizer.from_pretrained('theQuert/NetKUp-tokenzier')
    batch_size = 8
    model = torch.load("./util/bart_model", map_location=torch.device("cpu"))
    optimizer = AdamW(model.parameters(),  # unused at inference; retained from the training setup
                      lr=2e-5,
                      eps=1e-8
                      )

    # split the input article into paragraphs
    data_test = split_article(input_article, input_trigger)

    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    # torch.cuda.manual_seed_all(seed_val)

    input_ids = []
    attention_masks = []
    for sent in data_test:
        encoded_dict = tokenizer.encode_plus(
            text_preprocessing(sent),
            add_special_tokens=True,
            max_length=600,
            padding="max_length",
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    test_dataset = TensorDataset(input_ids, attention_masks)
    test_dataloader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=batch_size
    )

    # Predictions
    predictions = []
    for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():
            output = model(b_input_ids,
                           attention_mask=b_input_mask)
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()
        predictions.extend(list(pred_flat))

    # Write predictions for each paragraph
    pd.DataFrame({"target": predictions}).to_csv('./util/experiments/classification.csv', index=False)
    # single-paragraph inputs are always treated as update-needed
    if len(data_test) == 1: predictions[0] = 1

    # extract ids for update-needed paragraphs (the idx with predicted target == 1)
    pos_ids = [idx for idx in range(len(predictions)) if predictions[idx] == 1]
    neg_ids = [idx for idx in range(len(predictions)) if predictions[idx] == 0]

    # feed the positive paragraphs to the decoder
    paragraphs_needed = [data_test[idx] for idx in pos_ids]
    pd.DataFrame({"paragraph": paragraphs_needed}).to_csv("./util/experiments/paragraphs_needed.csv", index=False)

    # updated_paragraphs = decode(input_paragraph, input_trigger)
    config()
    updated_paragraphs = [call_gpt(paragraph.split(" -- ")[0], input_trigger) for paragraph in paragraphs_needed]
    # updated_paragraphs = call_vicuna(paragraphs_needed, input_trigger)

    # merge updated paragraphs with non-updated paragraphs
    paragraphs_merged = data_test.copy()
    paragraphs_merged = [str(par).split(" -- ")[0] for par in paragraphs_merged]
    for idx in range(len(pos_ids)):
        paragraphs_merged[pos_ids[idx]] = updated_paragraphs[idx]

    sep = "\n"
    # paragraphs_merged = ["".join(par.split(" -- ")[:-1]) for par in paragraphs_merged]
    updated_article = str(sep.join(paragraphs_merged))
    updated_article = updated_article.replace("[{'summary_text': '", "").replace("'}]", "").strip()
    class_res = pd.read_csv("./util/experiments/classification.csv")
    if (class_res.target.values == 0).all(): modified = "FALSE"

    if len(data_test) == 1:
        modified = "TRUE"
        updated_article = call_gpt(input_article, input_trigger)
    with open("./util/experiments/updated_article.txt", "w") as f:
        f.write(updated_article)

    # combine the predictions and paragraphs into a csv file
    merged_par_pred_df = pd.DataFrame({"paragraphs": data_test, "predictions": predictions}).to_csv("./util/experiments/par_with_class.csv")
    # return updated_article, modified, merged_par_pred_df
    modified_in_all = str(len(paragraphs_needed)) + " / " + str(len(data_test))
    return updated_article, modified_in_all

def copy_to_clipboard(t):
    with open("./util/experiments/updated_article.txt", "r") as f:
        t = f.read()
        pyperclip.copy(t)

demo = gr.Interface(
    main,
    [
        gr.Textbox(
            lines=2, label="Non-updated Article", placeholder="Input the article..."
        ),
        gr.Textbox(
            lines=2, label="Triggered News Event", placeholder="Input the triggered news event..."
        )
    ],
    [
        gr.Textbox(
            lines=25,
            label="Output",
        ),
        gr.Textbox(
            lines=1,
            label="#MODIFIED/ALL"
        ),
        # btn = gr.Button(value="Copy Updated Article to Clipboard")
        # btn.click(copy_to_clipboard)
        # gr.components.Button(value="Copy Updated Article to Clipboard", fn=copy_to_clipboard),
    ],
    title="Event Triggered Article Updating System",
    description="Powered by YTLee",
)

demo.launch()
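For reference, a minimal sketch of exercising the entry point directly, bypassing the Gradio UI (hypothetical inputs; assumes the ./util/bart_model checkpoint is present and a GPT_API key is set in .env):

    # hypothetical direct call to main() as defined in app.py above
    updated_article, modified_in_all = main(
        input_article="First paragraph.\nSecond paragraph.",
        input_trigger="Some triggered news event.",
    )
    print(modified_in_all)  # e.g. "1 / 2"
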
requirements.txt ADDED
@@ -0,0 +1,141 @@
absl-py==1.4.0
accelerate==0.15.0
aiofiles==23.2.1
aiohttp==3.8.5
aiosignal==1.3.1
altair==5.0.1
annotated-types==0.5.0
anyio==3.7.1
appdirs==1.4.4
async-timeout==4.0.3
attrs==23.1.0
bitsandbytes==0.37.0
cachetools==5.3.1
certifi==2023.7.22
charset-normalizer==3.2.0
click==8.1.6
cmake==3.27.1
contourpy==1.1.0
cycler==0.11.0
datasets
deepspeed==0.8.3
dill
docker-pycreds==0.4.0
einops==0.6.1
evaluate==0.4.0
exceptiongroup==1.1.2
fairscale==0.4.13
fastapi==0.101.0
ffmpy==0.3.1
filelock==3.12.2
fonttools==4.42.0
frozenlist==1.4.0
fsspec==2023.6.0
gitdb==4.0.10
GitPython==3.1.32
gradio==3.20.0
gradio-client==0.4.0
grpcio==1.57.0
h11==0.14.0
hjson==3.1.0
httpcore==0.17.3
httpx==0.24.1
huggingface-hub==0.13.3
idna==3.4
importlib-metadata==6.8.0
importlib-resources==6.0.1
Jinja2==3.1.2
joblib==1.3.2
jsonschema==4.19.0
jsonschema-specifications==2023.7.1
kiwisolver==1.4.4
linkify-it-py==2.0.2
lit==16.0.6
loralib==0.1.1
Markdown==3.4.4
MarkupSafe==2.1.3
matplotlib==3.7.2
mdit-py-plugins==0.3.3
mdurl==0.1.2
msgpack==1.0.5
multidict==6.0.4
multiprocess==0.70.15
networkx==3.1
ninja==1.11.1
nltk==3.6.1
numpy==1.24.4
nvitop==1.0.0
oauthlib==3.2.2
openai==0.27.8
orjson==3.9.4
packaging==23.1
pandas==2.0.3
pathtools==0.1.2
peft==0.3.0
Pillow==10.0.0
pkgutil-resolve-name==1.3.10
protobuf==4.24.0
psutil==5.9.5
py-cpuinfo==9.0.0
pyarrow==12.0.1
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycryptodome==3.18.0
pydantic==1.10.2
pydantic-core==2.4.0
pydub==0.25.1
Pygments==2.16.1
pyparsing==3.0.9
pyperclip==1.8.2
python-dateutil==2.8.2
python-dotenv==1.0.0
python-multipart==0.0.6
pytz==2023.3
PyYAML==6.0.1
ray==2.6.2
referencing==0.30.2
regex==2023.8.8
requests==2.31.0
requests-oauthlib==1.3.1
responses==0.18.0
rich==13.5.2
rpds-py==0.9.2
rsa==4.9
scikit-learn==1.3.0
scipy==1.10.1
semantic-version==2.10.0
sentencepiece==0.1.96
sentry-sdk==1.29.2
setproctitle==1.3.2
six==1.16.0
smmap==5.0.0
sniffio==1.3.0
starlette==0.27.0
tabulate==0.9.0
tensorboard==2.12.0
tensorboard-data-server==0.7.1
tensorboard-plugin-wit==1.8.1
termcolor==2.3.0
texttable==1.6.7
threadpoolctl==3.2.0
tokenizers==0.13.2
toolz==0.12.0
torch==1.13.1
torchtyping==0.1.4
torchvision==0.14.1
tqdm==4.65.0
transformers==4.28.0
triton==2.0.0
typeguard==4.1.0
typing-extensions
tzdata==2023.3
uc-micro-py==1.0.2
urllib3==2.0.4
uvicorn==0.23.2
wandb==0.13.10
websockets==11.0.3
Werkzeug==2.3.6
xxhash==3.3.0
yarl==1.9.2
zipp==3.16.2
util/experiments/classification.csv ADDED
@@ -0,0 +1,79 @@
target
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
0
0
0
0
0
0
0
0
util/experiments/here_comes_outputs ADDED
@@ -0,0 +1 @@
metadata
util/experiments/paragraphs_needed.csv ADDED
@@ -0,0 +1,4 @@
paragraph
"On 2 August there were 15 new cases of COVID-19, 2 overseas acquired. Consequently, the South-east Queensland’s lockdown was extended until 4:00pm on 8 August (Sunday). The same day, because of the extension, the Ekka agricultural show was cancelled for the second year, 5 days before it was to be open to the public from 7 August (Saturday). [ADD] <Timeline - Brisbane lockdowns> -- 'Cairns and Yarrabah enter a snap three-day lockdown after an ""unexpected"" case of COVID-19 was reported in a taxi driver from Kanimbla who was infectious in Far North Queensland for 10 days. \n'"
"On 8 August the lockdown in SE Queensland ended, though some restrictions remained in force, including mandatory wearing of masks. This was due to an ""unexpected"" case of COVID-19, a taxi driver who was infectious in the community for ten days. [ADD] <Timeline - Brisbane lockdowns> -- 'Cairns and Yarrabah enter a snap three-day lockdown after an ""unexpected"" case of COVID-19 was reported in a taxi driver from Kanimbla who was infectious in Far North Queensland for 10 days. \n'"
"On [ADD] 9 August, Cairns went into lockdown from 4:00pm for three days. The next festival event is scheduled for Saturday, 22 May 2021. <Event cancellations> -- 'Cairns and Yarrabah enter a snap three-day lockdown after an ""unexpected"" case of COVID-19 was reported in a taxi driver from Kanimbla who was infectious in Far North Queensland for 10 days. \n'"
|