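"""Gradio app that compares organization (ORG) extraction by GPT-3.5 and a fine-tuned
token-classification model on an uploaded JSON dataset, reporting metrics for both.

Requires HF_TOKEN and OPENAI_API_KEY to be set in the environment.
"""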
import json
import os

import gradio as gr
import huggingface_hub
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from transformers import pipeline
# from dotenv import load_dotenv, find_dotenv
# from simcse import SimCSE  # alternative similarity model used for the GPT metrics

from evaluate_data import store_sample_data, get_metrics_trf
from metrics import calc_metrics

# store_sample_data()

# with open('./data/sample_data.json', 'r') as f:
#     # each entry: {'id': "", 'text': "", 'orgs': ["", ""]}
#     sample_data = json.load(f)

# _ = load_dotenv(find_dotenv())  # read local .env file

# Authenticate with the Hugging Face Hub; HF_TOKEN must be set in the environment.
hf_token = os.environ['HF_TOKEN']
huggingface_hub.login(hf_token)

# Token-classification pipeline for the transformer-based ORG extractor.
pipe = pipeline(
    "token-classification",
    model="elshehawy/finer-ord-transformers",
    aggregation_strategy="first",
)


llm_model = 'gpt-3.5-turbo-0125'
# openai.api_key = os.environ['OPENAI_API_KEY']

# OpenAI client; OPENAI_API_KEY must be set in the environment.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)


def get_completion(prompt, model=llm_model):
    """Send a single-message chat request and return the model's text response."""
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        messages=messages,
        model=model,
        temperature=0,
    )
    return response.choices[0].message.content


def find_orgs_gpt(sentence):
    """Ask the LLM to extract organization names from a sentence; returns a list of strings."""
    prompt = f"""
    In the context of named entity recognition (NER), find all organizations in the text delimited by triple backticks.

    text:
    ```
    {sentence}
    ```
    You should output only a list of organizations and follow this output format exactly: ["org_1", "org_2", "org_3"]
    """

    sent_orgs_str = get_completion(prompt)
    # The model is instructed to answer with a JSON-style list, so parse it directly.
    sent_orgs = json.loads(sent_orgs_str)

    return sent_orgs


    
# def find_orgs_trf(sentence):
#     org_list = []
#     for ent in pipe(sentence):
#         if ent['entity_group'] == 'ORG':
#             org_list.append(ent['word'])
#     return list(set(org_list))


# true_orgs = [sent['orgs'] for sent in sample_data]

# predicted_orgs_gpt = [find_orgs_gpt(sent['text']) for sent in sample_data]
# predicted_orgs_trf = [find_orgs_trf(sent['text']) for sent in sample_data]

# all_metrics = {}

# sim_model = SimCSE('sentence-transformers/all-MiniLM-L6-v2')
# all_metrics['gpt'] = calc_metrics(true_orgs, predicted_orgs_gpt, sim_model)
# print('Finding all metrics trf')
# all_metrics['trf'] = get_metrics_trf()



# Example sentence for manual testing (not wired into the interface).
example = """
My latest exclusive for The Hill : Conservative frustration over Republican efforts to force a House vote on reauthorizing the Export - Import Bank boiled over Wednesday during a contentious GOP meeting.

"""
def find_orgs(uploaded_file):
    """Evaluate both models on the uploaded dataset and return their metrics."""
    uploaded_data = json.loads(uploaded_file)

    all_metrics = {}
    all_metrics['trf'] = get_metrics_trf(uploaded_data)

    sample_data = store_sample_data(uploaded_data)
    # with open('./data/sample_data.json', 'r') as f:
    #     sample_data = json.load(f)

    gpt_orgs, true_orgs = [], []
    for sent in sample_data:
        gpt_orgs.append(find_orgs_gpt(sent['text']))
        true_orgs.append(sent['orgs'])

    # sim_model = SimCSE('sentence-transformers/all-MiniLM-L6-v2')
    sim_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    all_metrics['gpt'] = calc_metrics(true_orgs, gpt_orgs, sim_model, threshold=0.85)

    return all_metrics


# radio_btn = gr.Radio(choices=['GPT', 'iSemantics'], value='iSemantics', label='Available models', show_label=True)
# textbox = gr.Textbox(label="Enter your text", placeholder=str(all_metrics), lines=8)
upload_btn = gr.UploadButton(label='Upload a JSON file.', type='binary')

iface = gr.Interface(fn=find_orgs, inputs=upload_btn, outputs="text")
iface.launch(share=True)