akdeniz27 committed on
Commit
898dd55
1 Parent(s): 4682a5d

First commit

Browse files
Files changed (4) hide show
  1. app.py +72 -0
  2. predict.py +113 -0
  3. requirements.txt +4 -0
  4. test.json +0 -0
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModelForQuestionAnswering, AutoTokenizer
2
+ import streamlit as st
3
+ import json
4
+ from predict import run_prediction
5
+
6
# Use the wide page layout for the demo.
st.set_page_config(layout="wide")

# Checkpoints offered in the sidebar model picker.
model_list = [
    'roberta-base-cuad',
    'roberta-large-cuad',
    'deberta-xlarge-cuad',
]

st.sidebar.header("Select CUAD Model")
model_checkpoint = st.sidebar.radio("", model_list)

# Reference links shown underneath the model picker, in this order.
for _link_line in (
    "Project: https://www.atticusprojectai.org/cuad",
    "Git Hub: https://github.com/TheAtticusProject/cuad",
    "CUAD Dataset: https://huggingface.co/datasets/cuad",
):
    st.sidebar.write(_link_line)
17
+
18
@st.cache(allow_output_mutation=True)
def load_model():
    """Load the question-answering model and tokenizer for the selected checkpoint.

    Reads the module-level ``model_checkpoint`` chosen in the sidebar.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer is requested with
        ``use_fast=False``.
    """
    qa_model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
    qa_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=False)
    return (qa_model, qa_tokenizer)
23
+
24
@st.cache(allow_output_mutation=True)
def load_questions():
    """Return the question strings attached to the first contract in ``test.json``.

    Only the ``qas`` list of the first paragraph of the first entry under
    ``data`` is read.

    Returns:
        A list with the ``question`` value of every entry in that ``qas`` list,
        in file order.
    """
    # JSON text is UTF-8; without an explicit encoding open() falls back to the
    # platform locale, which can mis-decode non-ASCII characters (e.g. on Windows).
    with open('test.json', encoding='utf-8') as json_file:
        data = json.load(json_file)

    first_paragraph_qas = data['data'][0]['paragraphs'][0]['qas']
    return [qa['question'] for qa in first_paragraph_qas]
34
+
35
@st.cache(allow_output_mutation=True)
def load_contracts():
    """Return the text of every contract in ``test.json``.

    For each entry under ``data`` the ``context`` of its first paragraph is
    taken and all runs of whitespace are collapsed to single spaces.

    Returns:
        A list of whitespace-normalized contract strings, in file order.
    """
    # JSON text is UTF-8; without an explicit encoding open() falls back to the
    # platform locale, which can mis-decode non-ASCII characters (e.g. on Windows).
    with open('test.json', encoding='utf-8') as json_file:
        data = json.load(json_file)

    return [
        ' '.join(entry['paragraphs'][0]['context'].split())
        for entry in data['data']
    ]
45
+
46
# Load (and cache) the selected checkpoint and the bundled sample data.
# NOTE(review): run_prediction() below loads its own model from the path it is
# given; the objects returned here are not passed to it.
model, tokenizer = load_model()
questions = load_questions()
contracts = load_contracts()

contract = contracts[0]

st.header("Contract Understanding Atticus Dataset (CUAD) Demo")
st.write("Based on https://github.com/marshmellow77/cuad-demo")

question = st.selectbox('Choose one of the 41 queries from the CUAD dataset:', questions)

contract_type = st.radio("Select Contract", ("Sample Contract", "New Contract"))
if contract_type == "Sample Contract":
    # Bound the slider by the number of loaded contracts instead of relying on
    # Streamlit's default 0-100 range, which may not match the data.
    last_contract_index = len(contracts) - 1
    if last_contract_index > 0:
        sample_contract_num = st.slider(
            "Select Sample Contract #",
            min_value=0,
            max_value=last_contract_index,
            value=0,
        )
    else:
        # A slider needs min < max; with a single contract there is nothing to pick.
        sample_contract_num = 0
    contract = contracts[sample_contract_num]
    with st.expander(f"Sample Contract #{sample_contract_num}"):
        st.write(contract)
else:
    contract = st.text_area("Input New Contract", "", height=256)

Run_Button = st.button("Run", key=None)
if Run_Button and contract and question:
    # run_prediction() enumerates its first argument, so the single question is
    # wrapped in a list; a bare string would be split into characters.
    # The checkpoint selected in the sidebar is used rather than a path that
    # only exists on the original author's machine.
    predictions = run_prediction([question], contract, model_checkpoint)
    # Examples are registered with qas_id=str(index), so the only question
    # is looked up under '0'.
    answer = predictions['0']
    st.write("Answer: " + answer.strip())
predict.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import time
3
+ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
4
+
5
+ from transformers import (
6
+ AutoConfig,
7
+ AutoModelForQuestionAnswering,
8
+ AutoTokenizer,
9
+ squad_convert_examples_to_features
10
+ )
11
+
12
+ from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample
13
+ from transformers.data.metrics.squad_metrics import compute_predictions_logits
14
+
15
def run_prediction(question_texts, context_text, model_path):
    """Answer one or more questions about a contract with an extractive QA model.

    Args:
        question_texts: A list of question strings. A single string is also
            accepted and treated as a one-element list.
        context_text: The contract text in which answers are searched.
        model_path: Path or identifier handed to ``from_pretrained`` for the
            config, tokenizer and model.

    Returns:
        The value returned by ``compute_predictions_logits``. Each question is
        registered with ``qas_id=str(index)``; the result is expected to be
        keyed by that id (see the transformers SQuAD metrics documentation).
    """
    # Hyperparameters for feature conversion and answer decoding.
    max_seq_length = 512
    doc_stride = 256
    n_best_size = 1
    max_query_length = 64
    max_answer_length = 512
    do_lower_case = False
    null_score_diff_threshold = 0.0

    # A bare string would otherwise be enumerated character by character,
    # turning every character into its own "question".
    if isinstance(question_texts, str):
        question_texts = [question_texts]

    def to_list(tensor):
        """Convert a tensor to a plain Python list on the CPU."""
        return tensor.detach().cpu().tolist()

    config = AutoConfig.from_pretrained(model_path)
    # Use the same casing setting here as in answer decoding below; the
    # original hard-coded True here while declaring do_lower_case = False.
    tokenizer = AutoTokenizer.from_pretrained(
        model_path, do_lower_case=do_lower_case, use_fast=False)
    model = AutoModelForQuestionAnswering.from_pretrained(model_path, config=config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Inference only: switch to eval mode once instead of on every batch.
    model.eval()

    # One example per question, all sharing the same contract text.
    examples = [
        SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            answers=None,
        )
        for i, question_text in enumerate(question_texts)
    ]

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []

    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            outputs = model(**inputs)

        # batch[3] holds, for each row, the index of its entry in `features`.
        example_indices = batch[3]
        batch_outputs = outputs.to_tuple()

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            # Row i of every model output; unpacked as (start, end) logits.
            start_logits, end_logits = [to_list(output[i]) for output in batch_outputs]
            all_results.append(SquadResult(unique_id, start_logits, end_logits))

    final_predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=all_results,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        do_lower_case=do_lower_case,
        output_prediction_file=None,
        output_nbest_file=None,
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=null_score_diff_threshold,
        tokenizer=tokenizer
    )

    return final_predictions
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ torch
3
+ transformers
4
+ # json is part of the Python standard library and must not be listed as a pip requirement
test.json ADDED
The diff for this file is too large to render. See raw diff