elshehawy's picture
update code to work with sentence-transformers instead of simcse
ed49033
raw
history blame
3.81 kB
from metrics import calc_metrics
import gradio as gr
from openai import OpenAI
import os
from transformers import pipeline
# from dotenv import load_dotenv, find_dotenv
import huggingface_hub
import json
from simcse import SimCSE # use for gpt
from evaluate_data import store_sample_data, get_metrics_trf
from sentence_transformers import SentenceTransformer
# store_sample_data()
# with open('./data/sample_data.json', 'r') as f:
# # sample_data = [
# # {'id': "", 'text': "", 'orgs': ["", ""]}
# # ]
# sample_data = json.load(f)
# _ = load_dotenv(find_dotenv()) # read local .env file
# Authenticate against the Hugging Face Hub before any model is fetched.
# HF_TOKEN is mandatory: a missing variable raises KeyError at import time.
hf_token = os.environ['HF_TOKEN']
huggingface_hub.login(hf_token)

# Token-classification pipeline built from the fine-tuned NER checkpoint.
pipe = pipeline(
    "token-classification",
    model="elshehawy/finer-ord-transformers",
    aggregation_strategy="first",
)

# Default chat model used by get_completion().
llm_model = 'gpt-3.5-turbo-0125'

# OpenAI client; the key is looked up with .get(), so it may be None here.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
def get_completion(prompt, model=llm_model):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the only ``user`` message of the chat.
        model: Chat model identifier; defaults to the module-level
            ``llm_model``.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    # temperature=0 is sent with every request.
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
def find_orgs_gpt(sentence):
    """Ask the chat model for the organizations mentioned in ``sentence``.

    The model is instructed to answer with a JSON list of strings. Because
    chat models do not always follow the format exactly, the reply is parsed
    leniently: a reply that is not valid JSON as a whole is searched for its
    outermost ``[...]`` span (this covers replies wrapped in a markdown code
    fence or surrounded by explanatory text).

    Args:
        sentence: Text to scan; it is interpolated verbatim into the prompt.

    Returns:
        The decoded JSON value of the reply (a list when the model follows
        the requested format), or an empty list when the reply is ``None``
        or blank.

    Raises:
        json.JSONDecodeError: If no JSON value can be recovered from the
            reply.
    """
    prompt = f"""
In context of named entity recognition (NER), find all organizations in the text delimited by triple backticks.
text:
```
{sentence}
```
You should output only a list of organizations and follow this output format exactly: ["org_1", "org_2", "org_3"]
"""
    reply = get_completion(prompt)

    # A missing or blank reply is treated as "no organizations found"
    # instead of crashing inside json.loads.
    if reply is None or not reply.strip():
        return []

    try:
        return json.loads(reply)
    except json.JSONDecodeError:
        # Salvage the list from a reply that carries extra text around it.
        start = reply.find('[')
        end = reply.rfind(']')
        if start == -1 or end <= start:
            # Nothing that looks like a list: surface the original error.
            raise
        return json.loads(reply[start:end + 1])
# def find_orgs_trf(sentence):
# org_list = []
# for ent in pipe(sentence):
# if ent['entity_group'] == 'ORG':
# # message += f'\n- {ent["word"]} \t- score: {ent["score"]}'
# # message += f'\n- {ent["word"]}'# \t- score: {ent["score"]}'
# org_list.append(ent['word'])
# return list(set(org_list))
# true_orgs = [sent['orgs'] for sent in sample_data]
# predicted_orgs_gpt = [find_orgs_gpt(sent['text']) for sent in sample_data]
# predicted_orgs_trf = [find_orgs_trf(sent['text']) for sent in sample_data]
# all_metrics = {}
# sim_model = SimCSE('sentence-transformers/all-MiniLM-L6-v2')
# all_metrics['gpt'] = calc_metrics(true_orgs, predicted_orgs_gpt, sim_model)
# print('Finding all metrics trf')
# all_metrics['trf'] = get_metrics_trf()
# Sample news sentence kept as a module-level constant; nothing in the visible
# part of this file references it.
example = """
My latest exclusive for The Hill : Conservative frustration over Republican efforts to force a House vote on reauthorizing the Export - Import Bank boiled over Wednesday during a contentious GOP meeting.
"""
def find_orgs(uploaded_file):
    """Compute evaluation metrics for both models on an uploaded JSON file.

    Args:
        uploaded_file: JSON document in any form ``json.loads`` accepts.

    Returns:
        A dict with two entries: ``'trf'`` holds the result of
        ``get_metrics_trf`` on the decoded upload, and ``'gpt'`` holds the
        result of ``calc_metrics`` comparing the reference organizations
        with the ones returned by ``find_orgs_gpt``.
    """
    payload = json.loads(uploaded_file)

    # Transformer metrics are computed first, straight from the decoded upload.
    metrics = {'trf': get_metrics_trf(payload)}

    samples = store_sample_data(payload)

    # One (prediction, reference) pair per sample; the GPT call comes first
    # in the tuple so each sample is queried before its labels are read.
    pairs = [(find_orgs_gpt(item['text']), item['orgs']) for item in samples]
    gpt_orgs = [predicted for predicted, _ in pairs]
    true_orgs = [reference for _, reference in pairs]

    # Embedding model handed to calc_metrics together with the 0.85 threshold.
    encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    metrics['gpt'] = calc_metrics(true_orgs, gpt_orgs, encoder, threshold=0.85)
    return metrics
# radio_btn = gr.Radio(choices=['GPT', 'iSemantics'], value='iSemantics', label='Available models', show_label=True)
# textbox = gr.Textbox(label="Enter your text", placeholder=str(all_metrics), lines=8)
# Upload widget; type='binary' is requested so find_orgs gets the file content.
upload_btn = gr.UploadButton(
    label='Upload a json file.',
    type='binary',
)

# Single-function interface: uploaded file in, metrics rendered as text out.
iface = gr.Interface(
    fn=find_orgs,
    inputs=upload_btn,
    outputs="text",
)

# Start the app as soon as the module is executed, with sharing enabled.
iface.launch(share=True)