Fabrice-TIERCELIN committed
Commit: 6067d3e
Parent(s): 508030c

Upload 18 files

- llava/eval/eval_gpt_review.py +113 -0
- llava/eval/eval_gpt_review_bench.py +121 -0
- llava/eval/eval_gpt_review_visual.py +118 -0
- llava/eval/eval_pope.py +81 -0
- llava/eval/eval_science_qa.py +114 -0
- llava/eval/eval_science_qa_gpt4.py +104 -0
- llava/eval/eval_science_qa_gpt4_requery.py +149 -0
- llava/eval/eval_textvqa.py +65 -0
- llava/eval/generate_webpage_data_from_table.py +111 -0
- llava/eval/m4c_evaluator.py +334 -0
- llava/eval/model_qa.py +85 -0
- llava/eval/model_vqa.py +125 -0
- llava/eval/model_vqa_loader.py +144 -0
- llava/eval/model_vqa_mmbench.py +170 -0
- llava/eval/model_vqa_science.py +147 -0
- llava/eval/qa_baseline_gpt35.py +74 -0
- llava/eval/run_llava.py +97 -0
- llava/eval/summarize_gpt_review.py +60 -0
llava/eval/eval_gpt_review.py
ADDED
@@ -0,0 +1,113 @@
import argparse
import json
import os

import openai
import tqdm
import ray
import time

NUM_SECONDS_TO_SLEEP = 3

@ray.remote(num_cpus=4)
def get_eval(content: str, max_tokens: int):
    while True:
        try:
            response = openai.ChatCompletion.create(
                model='gpt-4',
                messages=[{
                    'role': 'system',
                    'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
                }, {
                    'role': 'user',
                    'content': content,
                }],
                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
                max_tokens=max_tokens,
            )
            break
        except openai.error.RateLimitError:
            pass
        except Exception as e:
            print(e)
        time.sleep(NUM_SECONDS_TO_SLEEP)

    print('success!')
    return response['choices'][0]['message']['content']


def parse_score(review):
    try:
        score_pair = review.split('\n')[0]
        score_pair = score_pair.replace(',', ' ')
        sp = score_pair.split(' ')
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            print('error', review)
            return [-1, -1]
    except Exception as e:
        print(e)
        print('error', review)
        return [-1, -1]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-q', '--question')
    # parser.add_argument('-a', '--answer')
    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
    parser.add_argument('-r', '--rule')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    ray.init()

    f_q = open(os.path.expanduser(args.question))
    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
    rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))

    review_file = open(f'{args.output}', 'w')

    js_list = []
    handles = []
    idx = 0
    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
        # if idx == 1:
        #     break

        ques = json.loads(ques_js)
        ans1 = json.loads(ans1_js)
        ans2 = json.loads(ans2_js)

        category = json.loads(ques_js)['category']
        if category in rule_dict:
            rule = rule_dict[category]
        else:
            rule = rule_dict['default']
        prompt = rule['prompt']
        role = rule['role']
        content = (f'[Question]\n{ques["text"]}\n\n'
                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
                   f'[System]\n{prompt}\n\n')
        js_list.append({
            'id': idx+1,
            'question_id': ques['question_id'],
            'answer1_id': ans1['answer_id'],
            'answer2_id': ans2['answer_id'],
            'category': category})
        idx += 1
        handles.append(get_eval.remote(content, args.max_tokens))
        # To avoid the rate limit set by OpenAI
        time.sleep(NUM_SECONDS_TO_SLEEP)

    reviews = ray.get(handles)
    for idx, review in enumerate(reviews):
        scores = parse_score(review)
        js_list[idx]['content'] = review
        js_list[idx]['tuple'] = scores
        review_file.write(json.dumps(js_list[idx]) + '\n')
    review_file.close()
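Not part of the upload itself, but for orientation: a minimal sketch of the record shapes this script expects, inferred from the key accesses above (all values and file roles are hypothetical placeholders).

# One line of the -q questions file, one line of each -a answers file, and the -r rule file:
question = {"question_id": 0, "text": "What is shown in the image?", "category": "conv"}
answer = {"question_id": 0, "answer_id": "abc123", "text": "A dog on a beach."}
rule = {"default": {"role": "Assistant", "prompt": "Rate the two answers on a 1-10 scale..."}}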
llava/eval/eval_gpt_review_bench.py
ADDED
@@ -0,0 +1,121 @@
import argparse
import json
import os

import openai
import time

NUM_SECONDS_TO_SLEEP = 0.5


def get_eval(content: str, max_tokens: int):
    while True:
        try:
            response = openai.ChatCompletion.create(
                model='gpt-4-0314',
                messages=[{
                    'role': 'system',
                    'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
                }, {
                    'role': 'user',
                    'content': content,
                }],
                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
                max_tokens=max_tokens,
            )
            break
        except openai.error.RateLimitError:
            pass
        except Exception as e:
            print(e)
        time.sleep(NUM_SECONDS_TO_SLEEP)

    return response['choices'][0]['message']['content']


def parse_score(review):
    try:
        score_pair = review.split('\n')[0]
        score_pair = score_pair.replace(',', ' ')
        sp = score_pair.split(' ')
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            print('error', review)
            return [-1, -1]
    except Exception as e:
        print(e)
        print('error', review)
        return [-1, -1]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-q', '--question')
    parser.add_argument('-c', '--context')
    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
    parser.add_argument('-r', '--rule')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    f_q = open(os.path.expanduser(args.question))
    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
    rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))

    if os.path.isfile(os.path.expanduser(args.output)):
        cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
    else:
        cur_reviews = []

    review_file = open(f'{args.output}', 'a')

    context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
    image_to_context = {context['image']: context for context in context_list}

    handles = []
    idx = 0
    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
        ques = json.loads(ques_js)
        ans1 = json.loads(ans1_js)
        ans2 = json.loads(ans2_js)

        inst = image_to_context[ques['image']]

        if isinstance(inst['caption'], list):
            cap_str = '\n'.join(inst['caption'])
        else:
            cap_str = inst['caption']

        category = 'llava_bench_' + json.loads(ques_js)['category']
        if category in rule_dict:
            rule = rule_dict[category]
        else:
            assert False, f"Visual QA category not found in rule file: {category}."
        prompt = rule['prompt']
        role = rule['role']
        content = (f'[Context]\n{cap_str}\n\n'
                   f'[Question]\n{ques["text"]}\n\n'
                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
                   f'[System]\n{prompt}\n\n')
        cur_js = {
            'id': idx+1,
            'question_id': ques['question_id'],
            'answer1_id': ans1.get('answer_id', ans1['question_id']),
            'answer2_id': ans2.get('answer_id', ans2['answer_id']),
            'category': category
        }
        if idx >= len(cur_reviews):
            review = get_eval(content, args.max_tokens)
            scores = parse_score(review)
            cur_js['content'] = review
            cur_js['tuple'] = scores
            review_file.write(json.dumps(cur_js) + '\n')
            review_file.flush()
        else:
            print(f'Skipping {idx} as we already have it.')
        idx += 1
        print(idx)
    review_file.close()
llava/eval/eval_gpt_review_visual.py
ADDED
@@ -0,0 +1,118 @@
import argparse
import json
import os

import openai
import time

NUM_SECONDS_TO_SLEEP = 0.5


def get_eval(content: str, max_tokens: int):
    while True:
        try:
            response = openai.ChatCompletion.create(
                model='gpt-4-0314',
                messages=[{
                    'role': 'system',
                    'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
                }, {
                    'role': 'user',
                    'content': content,
                }],
                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
                max_tokens=max_tokens,
            )
            break
        except openai.error.RateLimitError:
            pass
        except Exception as e:
            print(e)
        time.sleep(NUM_SECONDS_TO_SLEEP)

    return response['choices'][0]['message']['content']


def parse_score(review):
    try:
        score_pair = review.split('\n')[0]
        score_pair = score_pair.replace(',', ' ')
        sp = score_pair.split(' ')
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            print('error', review)
            return [-1, -1]
    except Exception as e:
        print(e)
        print('error', review)
        return [-1, -1]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-q', '--question')
    parser.add_argument('-c', '--context')
    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
    parser.add_argument('-r', '--rule')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    f_q = open(os.path.expanduser(args.question))
    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
    rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))

    if os.path.isfile(os.path.expanduser(args.output)):
        cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
    else:
        cur_reviews = []

    review_file = open(f'{args.output}', 'a')

    context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
    image_to_context = {context['image']: context for context in context_list}

    handles = []
    idx = 0
    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
        ques = json.loads(ques_js)
        ans1 = json.loads(ans1_js)
        ans2 = json.loads(ans2_js)

        inst = image_to_context[ques['image']]
        cap_str = '\n'.join(inst['captions'])
        box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']])

        category = json.loads(ques_js)['category']
        if category in rule_dict:
            rule = rule_dict[category]
        else:
            assert False, f"Visual QA category not found in rule file: {category}."
        prompt = rule['prompt']
        role = rule['role']
        content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n'
                   f'[Question]\n{ques["text"]}\n\n'
                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
                   f'[System]\n{prompt}\n\n')
        cur_js = {
            'id': idx+1,
            'question_id': ques['question_id'],
            'answer1_id': ans1.get('answer_id', ans1['question_id']),
            'answer2_id': ans2.get('answer_id', ans2['answer_id']),
            'category': category
        }
        if idx >= len(cur_reviews):
            review = get_eval(content, args.max_tokens)
            scores = parse_score(review)
            cur_js['content'] = review
            cur_js['tuple'] = scores
            review_file.write(json.dumps(cur_js) + '\n')
            review_file.flush()
        else:
            print(f'Skipping {idx} as we already have it.')
        idx += 1
        print(idx)
    review_file.close()
llava/eval/eval_pope.py
ADDED
@@ -0,0 +1,81 @@
import os
import json
import argparse

def eval_pope(answers, label_file):
    label_list = [json.loads(q)['label'] for q in open(label_file, 'r')]

    for answer in answers:
        text = answer['text']

        # Only keep the first sentence
        if text.find('.') != -1:
            text = text.split('.')[0]

        text = text.replace(',', '')
        words = text.split(' ')
        if 'No' in words or 'not' in words or 'no' in words:
            answer['text'] = 'no'
        else:
            answer['text'] = 'yes'

    for i in range(len(label_list)):
        if label_list[i] == 'no':
            label_list[i] = 0
        else:
            label_list[i] = 1

    pred_list = []
    for answer in answers:
        if answer['text'] == 'no':
            pred_list.append(0)
        else:
            pred_list.append(1)

    pos = 1
    neg = 0
    yes_ratio = pred_list.count(1) / len(pred_list)

    TP, TN, FP, FN = 0, 0, 0, 0
    for pred, label in zip(pred_list, label_list):
        if pred == pos and label == pos:
            TP += 1
        elif pred == pos and label == neg:
            FP += 1
        elif pred == neg and label == neg:
            TN += 1
        elif pred == neg and label == pos:
            FN += 1

    print('TP\tFP\tTN\tFN\t')
    print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN))

    precision = float(TP) / float(TP + FP)
    recall = float(TP) / float(TP + FN)
    f1 = 2*precision*recall / (precision + recall)
    acc = (TP + TN) / (TP + TN + FP + FN)
    print('Accuracy: {}'.format(acc))
    print('Precision: {}'.format(precision))
    print('Recall: {}'.format(recall))
    print('F1 score: {}'.format(f1))
    print('Yes ratio: {}'.format(yes_ratio))
    print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) )

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--annotation-dir", type=str)
    parser.add_argument("--question-file", type=str)
    parser.add_argument("--result-file", type=str)
    args = parser.parse_args()

    questions = [json.loads(line) for line in open(args.question_file)]
    questions = {question['question_id']: question for question in questions}
    answers = [json.loads(q) for q in open(args.result_file)]
    for file in os.listdir(args.annotation_dir):
        assert file.startswith('coco_pope_')
        assert file.endswith('.json')
        category = file[10:-5]
        cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category]
        print('Category: {}, # samples: {}'.format(category, len(cur_answers)))
        eval_pope(cur_answers, os.path.join(args.annotation_dir, file))
        print("====================================")
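As a quick sanity check of the metric arithmetic above (not part of the commit), a toy confusion matrix worked through the same formulas:

# Toy values: TP=40, FP=10, TN=45, FN=5 (100 samples)
precision = 40 / (40 + 10)                            # 0.8
recall = 40 / (40 + 5)                                # ~0.889
f1 = 2 * precision * recall / (precision + recall)    # ~0.842
acc = (40 + 45) / 100                                 # 0.85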
llava/eval/eval_science_qa.py
ADDED
@@ -0,0 +1,114 @@
import argparse
import json
import os
import re
import random


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--base-dir', type=str)
    parser.add_argument('--result-file', type=str)
    parser.add_argument('--output-file', type=str)
    parser.add_argument('--output-result', type=str)
    parser.add_argument('--split', type=str, default='test')
    parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
    return parser.parse_args()


def convert_caps(results):
    fakecaps = []
    for result in results:
        image_id = result['question_id']
        caption = result['text']
        fakecaps.append({"image_id": int(image_id), "caption": caption})
    return fakecaps


def get_pred_idx(prediction, choices, options):
    """
    Get the index (e.g. 2) from the prediction (e.g. 'C')
    """
    if prediction in options[:len(choices)]:
        return options.index(prediction)
    else:
        return -1
    return random.choice(range(len(choices)))


if __name__ == "__main__":
    args = get_args()

    base_dir = args.base_dir
    split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
    problems = json.load(open(os.path.join(base_dir, "problems.json")))
    predictions = [json.loads(line) for line in open(args.result_file)]
    predictions = {pred['question_id']: pred for pred in predictions}
    split_problems = {idx: problems[idx] for idx in split_indices}

    results = {'correct': [], 'incorrect': []}
    sqa_results = {}
    sqa_results['acc'] = None
    sqa_results['correct'] = None
    sqa_results['count'] = None
    sqa_results['results'] = {}
    sqa_results['outputs'] = {}

    for prob_id, prob in split_problems.items():
        if prob_id not in predictions:
            pred = {'text': 'FAILED', 'prompt': 'Unknown'}
            pred_text = 'FAILED'
        else:
            pred = predictions[prob_id]
            pred_text = pred['text']

        if pred_text in args.options:
            answer = pred_text
        elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ":
            answer = pred_text[0]
        else:
            pattern = re.compile(r'The answer is ([A-Z]).')
            res = pattern.findall(pred_text)
            if len(res) == 1:
                answer = res[0]  # 'A', 'B', ...
            else:
                answer = "FAILED"

        pred_idx = get_pred_idx(answer, prob['choices'], args.options)

        analysis = {
            'question_id': prob_id,
            'parsed_ans': answer,
            'ground_truth': args.options[prob['answer']],
            'question': pred['prompt'],
            'pred': pred_text,
            'is_multimodal': '<image>' in pred['prompt'],
        }

        sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options)
        sqa_results['outputs'][prob_id] = pred_text

        if pred_idx == prob['answer']:
            results['correct'].append(analysis)
        else:
            results['incorrect'].append(analysis)

    correct = len(results['correct'])
    total = len(results['correct']) + len(results['incorrect'])

    ###### IMG ######
    multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']])
    multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']])
    multimodal_total = multimodal_correct + multimodal_incorrect
    ###### IMG ######

    print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%')

    sqa_results['acc'] = correct / total * 100
    sqa_results['correct'] = correct
    sqa_results['count'] = total

    with open(args.output_file, 'w') as f:
        json.dump(results, f, indent=2)
    with open(args.output_result, 'w') as f:
        json.dump(sqa_results, f, indent=2)
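A small illustration (not from the commit, the model output string is made up) of the answer-letter extraction used above when the prediction is free-form text:

import re

pattern = re.compile(r'The answer is ([A-Z]).')
print(pattern.findall("Let us think step by step. The answer is C."))  # ['C']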
llava/eval/eval_science_qa_gpt4.py
ADDED
@@ -0,0 +1,104 @@
import argparse
import json
import os
import re
import random
from collections import defaultdict


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--base-dir', type=str)
    parser.add_argument('--gpt4-result', type=str)
    parser.add_argument('--our-result', type=str)
    parser.add_argument('--split', type=str, default='test')
    parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
    return parser.parse_args()


def convert_caps(results):
    fakecaps = []
    for result in results:
        image_id = result['question_id']
        caption = result['text']
        fakecaps.append({"image_id": int(image_id), "caption": caption})
    return fakecaps


def get_pred_idx(prediction, choices, options):
    """
    Get the index (e.g. 2) from the prediction (e.g. 'C')
    """
    if prediction in options[:len(choices)]:
        return options.index(prediction)
    else:
        return random.choice(range(len(choices)))


if __name__ == "__main__":
    args = get_args()

    base_dir = args.base_dir
    split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
    problems = json.load(open(os.path.join(base_dir, "problems.json")))
    our_predictions = [json.loads(line) for line in open(args.our_result)]
    our_predictions = {pred['question_id']: pred for pred in our_predictions}
    split_problems = {idx: problems[idx] for idx in split_indices}

    gpt4_predictions = json.load(open(args.gpt4_result))['outputs']

    results = defaultdict(lambda: 0)

    for prob_id, prob in split_problems.items():
        if prob_id not in our_predictions:
            continue
        if prob_id not in gpt4_predictions:
            continue
        our_pred = our_predictions[prob_id]['text']
        gpt4_pred = gpt4_predictions[prob_id]

        pattern = re.compile(r'The answer is ([A-Z]).')
        our_res = pattern.findall(our_pred)
        if len(our_res) == 1:
            our_answer = our_res[0]  # 'A', 'B', ...
        else:
            our_answer = "FAILED"
        gpt4_res = pattern.findall(gpt4_pred)
        if len(gpt4_res) == 1:
            gpt4_answer = gpt4_res[0]  # 'A', 'B', ...
        else:
            gpt4_answer = "FAILED"

        our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options)
        gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options)

        if gpt4_answer == 'FAILED':
            results['gpt4_failed'] += 1
            # continue
            gpt4_pred_idx = our_pred_idx
            # if our_pred_idx != prob['answer']:
            #     print(our_predictions[prob_id]['prompt'])
            #     print('-----------------')
            #     print(f'LECTURE: {prob["lecture"]}')
            #     print(f'SOLUTION: {prob["solution"]}')
            #     print('=====================')
        else:
            # continue
            pass
        # gpt4_pred_idx = our_pred_idx

        if gpt4_pred_idx == prob['answer']:
            results['correct'] += 1
        else:
            results['incorrect'] += 1

        if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']:
            results['correct_upperbound'] += 1

    correct = results['correct']
    total = results['correct'] + results['incorrect']
    print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%')
    print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%')
    print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%')
llava/eval/eval_science_qa_gpt4_requery.py
ADDED
@@ -0,0 +1,149 @@
import argparse
import json
import os
import re
import random
from collections import defaultdict


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--base-dir', type=str)
    parser.add_argument('--gpt4-result', type=str)
    parser.add_argument('--requery-result', type=str)
    parser.add_argument('--our-result', type=str)
    parser.add_argument('--output-result', type=str)
    parser.add_argument('--split', type=str, default='test')
    parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
    return parser.parse_args()


def convert_caps(results):
    fakecaps = []
    for result in results:
        image_id = result['question_id']
        caption = result['text']
        fakecaps.append({"image_id": int(image_id), "caption": caption})
    return fakecaps


def get_pred_idx(prediction, choices, options):
    """
    Get the index (e.g. 2) from the prediction (e.g. 'C')
    """
    if prediction in options[:len(choices)]:
        return options.index(prediction)
    else:
        return random.choice(range(len(choices)))


if __name__ == "__main__":
    args = get_args()

    base_dir = args.base_dir
    split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
    problems = json.load(open(os.path.join(base_dir, "problems.json")))
    our_predictions = [json.loads(line) for line in open(args.our_result)]
    our_predictions = {pred['question_id']: pred for pred in our_predictions}
    split_problems = {idx: problems[idx] for idx in split_indices}

    requery_predictions = [json.loads(line) for line in open(args.requery_result)]
    requery_predictions = {pred['question_id']: pred for pred in requery_predictions}

    gpt4_predictions = json.load(open(args.gpt4_result))['outputs']

    results = defaultdict(lambda: 0)

    sqa_results = {}
    sqa_results['acc'] = None
    sqa_results['correct'] = None
    sqa_results['count'] = None
    sqa_results['results'] = {}
    sqa_results['outputs'] = {}

    for prob_id, prob in split_problems.items():
        if prob_id not in our_predictions:
            assert False
        if prob_id not in gpt4_predictions:
            assert False
        our_pred = our_predictions[prob_id]['text']
        gpt4_pred = gpt4_predictions[prob_id]
        if prob_id not in requery_predictions:
            results['missing_requery'] += 1
            requery_pred = "MISSING"
        else:
            requery_pred = requery_predictions[prob_id]['text']

        pattern = re.compile(r'The answer is ([A-Z]).')
        our_res = pattern.findall(our_pred)
        if len(our_res) == 1:
            our_answer = our_res[0]  # 'A', 'B', ...
        else:
            our_answer = "FAILED"

        requery_res = pattern.findall(requery_pred)
        if len(requery_res) == 1:
            requery_answer = requery_res[0]  # 'A', 'B', ...
        else:
            requery_answer = "FAILED"

        gpt4_res = pattern.findall(gpt4_pred)
        if len(gpt4_res) == 1:
            gpt4_answer = gpt4_res[0]  # 'A', 'B', ...
        else:
            gpt4_answer = "FAILED"

        our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options)
        gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options)
        requery_pred_idx = get_pred_idx(requery_answer, prob['choices'], args.options)

        results['total'] += 1

        if gpt4_answer == 'FAILED':
            results['gpt4_failed'] += 1
            if gpt4_pred_idx == prob['answer']:
                results['gpt4_correct'] += 1
            if our_pred_idx == prob['answer']:
                results['gpt4_ourvisual_correct'] += 1
        elif gpt4_pred_idx == prob['answer']:
            results['gpt4_correct'] += 1
            results['gpt4_ourvisual_correct'] += 1

        if our_pred_idx == prob['answer']:
            results['our_correct'] += 1

        if requery_answer == 'FAILED':
            sqa_results['results'][prob_id] = our_pred_idx
            if our_pred_idx == prob['answer']:
                results['requery_correct'] += 1
        else:
            sqa_results['results'][prob_id] = requery_pred_idx
            if requery_pred_idx == prob['answer']:
                results['requery_correct'] += 1
            else:
                print(f"""
Question ({args.options[prob['answer']]}): {our_predictions[prob_id]['prompt']}
Our ({our_answer}): {our_pred}
GPT-4 ({gpt4_answer}): {gpt4_pred}
Requery ({requery_answer}): {requery_pred}
print("=====================================")
""")

        if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']:
            results['correct_upperbound'] += 1

    total = results['total']
    print(f'Total: {total}, Our-Correct: {results["our_correct"]}, Accuracy: {results["our_correct"] / total * 100:.2f}%')
    print(f'Total: {total}, GPT-4-Correct: {results["gpt4_correct"]}, Accuracy: {results["gpt4_correct"] / total * 100:.2f}%')
    print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%')
    print(f'Total: {total}, GPT-4-OursVisual-Correct: {results["gpt4_ourvisual_correct"]}, Accuracy: {results["gpt4_ourvisual_correct"] / total * 100:.2f}%')
    print(f'Total: {total}, Requery-Correct: {results["requery_correct"]}, Accuracy: {results["requery_correct"] / total * 100:.2f}%')
    print(f'Total: {total}, Correct upper: {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%')

    sqa_results['acc'] = results["requery_correct"] / total * 100
    sqa_results['correct'] = results["requery_correct"]
    sqa_results['count'] = total

    with open(args.output_result, 'w') as f:
        json.dump(sqa_results, f, indent=2)
llava/eval/eval_textvqa.py
ADDED
@@ -0,0 +1,65 @@
import os
import argparse
import json
import re

from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--annotation-file', type=str)
    parser.add_argument('--result-file', type=str)
    parser.add_argument('--result-dir', type=str)
    return parser.parse_args()


def prompt_processor(prompt):
    if prompt.startswith('OCR tokens: '):
        pattern = r"Question: (.*?) Short answer:"
        match = re.search(pattern, prompt, re.DOTALL)
        question = match.group(1)
    elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3:
        if prompt.startswith('Reference OCR token:'):
            question = prompt.split('\n')[1]
        else:
            question = prompt.split('\n')[0]
    elif len(prompt.split('\n')) == 2:
        question = prompt.split('\n')[0]
    else:
        assert False

    return question.lower()


def eval_single(annotation_file, result_file):
    experiment_name = os.path.splitext(os.path.basename(result_file))[0]
    print(experiment_name)
    annotations = json.load(open(annotation_file))['data']
    annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations}
    results = [json.loads(line) for line in open(result_file)]

    pred_list = []
    for result in results:
        annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))]
        pred_list.append({
            "pred_answer": result['text'],
            "gt_answers": annotation['answers'],
        })

    evaluator = TextVQAAccuracyEvaluator()
    print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))


if __name__ == "__main__":
    args = get_args()

    if args.result_file is not None:
        eval_single(args.annotation_file, args.result_file)

    if args.result_dir is not None:
        for result_file in sorted(os.listdir(args.result_dir)):
            if not result_file.endswith('.jsonl'):
                print(f'Skipping {result_file}')
                continue
            eval_single(args.annotation_file, os.path.join(args.result_dir, result_file))
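For illustration only (not part of the commit, and the prompt string is invented), this is the question prompt_processor recovers from an OCR-style prompt, assuming the llava package is importable:

from llava.eval.eval_textvqa import prompt_processor

prompt = "OCR tokens: stop, main st Question: What does the sign say? Short answer:"
print(prompt_processor(prompt))  # "what does the sign say?"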
llava/eval/generate_webpage_data_from_table.py
ADDED
@@ -0,0 +1,111 @@
"""Generate json file for webpage."""
import json
import os
import re

# models = ['llama', 'alpaca', 'gpt35', 'bard']
models = ['vicuna']


def read_jsonl(path: str, key: str=None):
    data = []
    with open(os.path.expanduser(path)) as f:
        for line in f:
            if not line:
                continue
            data.append(json.loads(line))
    if key is not None:
        data.sort(key=lambda x: x[key])
        data = {item[key]: item for item in data}
    return data


def trim_hanging_lines(s: str, n: int) -> str:
    s = s.strip()
    for _ in range(n):
        s = s.split('\n', 1)[1].strip()
    return s


if __name__ == '__main__':
    questions = read_jsonl('table/question.jsonl', key='question_id')

    # alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id')
    # bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id')
    # gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id')
    # llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id')
    vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id')
    ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id')

    review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id')
    # review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id')
    # review_bard = read_jsonl('table/review/review_bard_vicuna-13b.jsonl', key='question_id')
    # review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id')
    # review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id')

    records = []
    for qid in questions.keys():
        r = {
            'id': qid,
            'category': questions[qid]['category'],
            'question': questions[qid]['text'],
            'answers': {
                # 'alpaca': alpaca_answers[qid]['text'],
                # 'llama': llama_answers[qid]['text'],
                # 'bard': bard_answers[qid]['text'],
                # 'gpt35': gpt35_answers[qid]['text'],
                'vicuna': vicuna_answers[qid]['text'],
                'ours': ours_answers[qid]['text'],
            },
            'evaluations': {
                # 'alpaca': review_alpaca[qid]['text'],
                # 'llama': review_llama[qid]['text'],
                # 'bard': review_bard[qid]['text'],
                'vicuna': review_vicuna[qid]['content'],
                # 'gpt35': review_gpt35[qid]['text'],
            },
            'scores': {
                'vicuna': review_vicuna[qid]['tuple'],
                # 'alpaca': review_alpaca[qid]['score'],
                # 'llama': review_llama[qid]['score'],
                # 'bard': review_bard[qid]['score'],
                # 'gpt35': review_gpt35[qid]['score'],
            },
        }

        # cleanup data
        cleaned_evals = {}
        for k, v in r['evaluations'].items():
            v = v.strip()
            lines = v.split('\n')
            # trim the first line if it's a pair of numbers
            if re.match(r'\d+[, ]+\d+', lines[0]):
                lines = lines[1:]
            v = '\n'.join(lines)
            cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**')

        r['evaluations'] = cleaned_evals
        records.append(r)

    # Reorder the records, this is optional
    for r in records:
        if r['id'] <= 20:
            r['id'] += 60
        else:
            r['id'] -= 20
    for r in records:
        if r['id'] <= 50:
            r['id'] += 10
        elif 50 < r['id'] <= 60:
            r['id'] -= 50
    for r in records:
        if r['id'] == 7:
            r['id'] = 1
        elif r['id'] < 7:
            r['id'] += 1

    records.sort(key=lambda x: x['id'])

    # Write to file
    with open('webpage/data.json', 'w') as f:
        json.dump({'questions': records, 'models': models}, f, indent=2)
llava/eval/m4c_evaluator.py
ADDED
@@ -0,0 +1,334 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import re

from tqdm import tqdm


class EvalAIAnswerProcessor:
    """
    Processes an answer similar to Eval AI
    copied from
    https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897
    """

    CONTRACTIONS = {
        "aint": "ain't",
        "arent": "aren't",
        "cant": "can't",
        "couldve": "could've",
        "couldnt": "couldn't",
        "couldn'tve": "couldn't've",
        "couldnt've": "couldn't've",
        "didnt": "didn't",
        "doesnt": "doesn't",
        "dont": "don't",
        "hadnt": "hadn't",
        "hadnt've": "hadn't've",
        "hadn'tve": "hadn't've",
        "hasnt": "hasn't",
        "havent": "haven't",
        "hed": "he'd",
        "hed've": "he'd've",
        "he'dve": "he'd've",
        "hes": "he's",
        "howd": "how'd",
        "howll": "how'll",
        "hows": "how's",
        "Id've": "I'd've",
        "I'dve": "I'd've",
        "Im": "I'm",
        "Ive": "I've",
        "isnt": "isn't",
        "itd": "it'd",
        "itd've": "it'd've",
        "it'dve": "it'd've",
        "itll": "it'll",
        "let's": "let's",
        "maam": "ma'am",
        "mightnt": "mightn't",
        "mightnt've": "mightn't've",
        "mightn'tve": "mightn't've",
        "mightve": "might've",
        "mustnt": "mustn't",
        "mustve": "must've",
        "neednt": "needn't",
        "notve": "not've",
        "oclock": "o'clock",
        "oughtnt": "oughtn't",
        "ow's'at": "'ow's'at",
        "'ows'at": "'ow's'at",
        "'ow'sat": "'ow's'at",
        "shant": "shan't",
        "shed've": "she'd've",
        "she'dve": "she'd've",
        "she's": "she's",
        "shouldve": "should've",
        "shouldnt": "shouldn't",
        "shouldnt've": "shouldn't've",
        "shouldn'tve": "shouldn't've",
        "somebody'd": "somebodyd",
        "somebodyd've": "somebody'd've",
        "somebody'dve": "somebody'd've",
        "somebodyll": "somebody'll",
        "somebodys": "somebody's",
        "someoned": "someone'd",
        "someoned've": "someone'd've",
        "someone'dve": "someone'd've",
        "someonell": "someone'll",
        "someones": "someone's",
        "somethingd": "something'd",
        "somethingd've": "something'd've",
        "something'dve": "something'd've",
        "somethingll": "something'll",
        "thats": "that's",
        "thered": "there'd",
        "thered've": "there'd've",
        "there'dve": "there'd've",
        "therere": "there're",
        "theres": "there's",
        "theyd": "they'd",
        "theyd've": "they'd've",
        "they'dve": "they'd've",
        "theyll": "they'll",
        "theyre": "they're",
        "theyve": "they've",
        "twas": "'twas",
        "wasnt": "wasn't",
        "wed've": "we'd've",
        "we'dve": "we'd've",
        "weve": "we've",
        "werent": "weren't",
        "whatll": "what'll",
        "whatre": "what're",
        "whats": "what's",
        "whatve": "what've",
        "whens": "when's",
        "whered": "where'd",
        "wheres": "where's",
        "whereve": "where've",
        "whod": "who'd",
        "whod've": "who'd've",
        "who'dve": "who'd've",
        "wholl": "who'll",
        "whos": "who's",
        "whove": "who've",
        "whyll": "why'll",
        "whyre": "why're",
        "whys": "why's",
        "wont": "won't",
        "wouldve": "would've",
        "wouldnt": "wouldn't",
        "wouldnt've": "wouldn't've",
        "wouldn'tve": "wouldn't've",
        "yall": "y'all",
        "yall'll": "y'all'll",
        "y'allll": "y'all'll",
        "yall'd've": "y'all'd've",
        "y'alld've": "y'all'd've",
        "y'all'dve": "y'all'd've",
        "youd": "you'd",
        "youd've": "you'd've",
        "you'dve": "you'd've",
        "youll": "you'll",
        "youre": "you're",
        "youve": "you've",
    }

    NUMBER_MAP = {
        "none": "0",
        "zero": "0",
        "one": "1",
        "two": "2",
        "three": "3",
        "four": "4",
        "five": "5",
        "six": "6",
        "seven": "7",
        "eight": "8",
        "nine": "9",
        "ten": "10",
    }
    ARTICLES = ["a", "an", "the"]
    PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
    COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)")
    PUNCTUATIONS = [
        ";",
        r"/",
        "[",
        "]",
        '"',
        "{",
        "}",
        "(",
        ")",
        "=",
        "+",
        "\\",
        "_",
        "-",
        ">",
        "<",
        "@",
        "`",
        ",",
        "?",
        "!",
    ]

    def __init__(self, *args, **kwargs):
        pass

    def word_tokenize(self, word):
        word = word.lower()
        word = word.replace(",", "").replace("?", "").replace("'s", " 's")
        return word.strip()

    def process_punctuation(self, in_text):
        out_text = in_text
        for p in self.PUNCTUATIONS:
            if (p + " " in in_text or " " + p in in_text) or (
                re.search(self.COMMA_STRIP, in_text) is not None
            ):
                out_text = out_text.replace(p, "")
            else:
                out_text = out_text.replace(p, " ")
        out_text = self.PERIOD_STRIP.sub("", out_text, re.UNICODE)
        return out_text

    def process_digit_article(self, in_text):
        out_text = []
        temp_text = in_text.lower().split()
        for word in temp_text:
            word = self.NUMBER_MAP.setdefault(word, word)
            if word not in self.ARTICLES:
                out_text.append(word)
            else:
                pass
        for word_id, word in enumerate(out_text):
            if word in self.CONTRACTIONS:
                out_text[word_id] = self.CONTRACTIONS[word]
        out_text = " ".join(out_text)
        return out_text

    def __call__(self, item):
        item = self.word_tokenize(item)
        item = item.replace("\n", " ").replace("\t", " ").strip()
        item = self.process_punctuation(item)
        item = self.process_digit_article(item)
        return item


class TextVQAAccuracyEvaluator:
    def __init__(self):
        self.answer_processor = EvalAIAnswerProcessor()

    def _compute_answer_scores(self, raw_answers):
        """
        compute the accuracy (soft score) of human answers
        """
        answers = [self.answer_processor(a) for a in raw_answers]
        assert len(answers) == 10
        gt_answers = list(enumerate(answers))
        unique_answers = set(answers)
        unique_answer_scores = {}

        for unique_answer in unique_answers:
            accs = []
            for gt_answer in gt_answers:
                other_answers = [item for item in gt_answers if item != gt_answer]
                matching_answers = [
                    item for item in other_answers if item[1] == unique_answer
                ]
                acc = min(1, float(len(matching_answers)) / 3)
                accs.append(acc)
            unique_answer_scores[unique_answer] = sum(accs) / len(accs)

        return unique_answer_scores

    def eval_pred_list(self, pred_list):
        pred_scores = []
        for entry in tqdm(pred_list):
            pred_answer = self.answer_processor(entry["pred_answer"])
            unique_answer_scores = self._compute_answer_scores(entry["gt_answers"])
            score = unique_answer_scores.get(pred_answer, 0.0)
            pred_scores.append(score)

        accuracy = sum(pred_scores) / len(pred_scores)
        return accuracy


class STVQAAccuracyEvaluator:
    def __init__(self):
        self.answer_processor = EvalAIAnswerProcessor()

    def eval_pred_list(self, pred_list):
        pred_scores = []
        for entry in pred_list:
            pred_answer = self.answer_processor(entry["pred_answer"])
            gts = [self.answer_processor(a) for a in entry["gt_answers"]]
            score = 1.0 if pred_answer in gts else 0.0
            pred_scores.append(score)

        accuracy = sum(pred_scores) / len(pred_scores)
        return accuracy


class STVQAANLSEvaluator:
    def __init__(self):
        import editdistance  # install with `pip install editdistance`

        self.get_edit_distance = editdistance.eval

    def get_anls(self, s1, s2):
        s1 = s1.lower().strip()
        s2 = s2.lower().strip()
        iou = 1 - self.get_edit_distance(s1, s2) / max(len(s1), len(s2))
        anls = iou if iou >= 0.5 else 0.0
        return anls

    def eval_pred_list(self, pred_list):
        pred_scores = []
        for entry in pred_list:
            anls = max(
                self.get_anls(entry["pred_answer"], gt) for gt in entry["gt_answers"]
            )
            pred_scores.append(anls)

        accuracy = sum(pred_scores) / len(pred_scores)
        return accuracy


class TextCapsBleu4Evaluator:
    def __init__(self):
        # The following script requires Java 1.8.0 and pycocotools installed.
        # The pycocoevalcap can be installed with pip as
        # pip install git+https://github.com/ronghanghu/coco-caption.git@python23
        # Original pycocoevalcap code is at https://github.com/tylin/coco-caption
        # but has no python3 support yet.
        try:
            from pycocoevalcap.bleu.bleu import Bleu
            from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
        except ModuleNotFoundError:
            print(
                "Please install pycocoevalcap module using "
                "pip install git+https://github.com/ronghanghu/coco-caption.git@python23"  # noqa
            )
            raise

        self.tokenizer = PTBTokenizer()
        self.scorer = Bleu(4)

    def eval_pred_list(self, pred_list):
        # Create reference and hypotheses captions.
        gts = {}
        res = {}
        for idx, entry in enumerate(pred_list):
            gts[idx] = [{"caption": a} for a in entry["gt_answers"]]
            res[idx] = [{"caption": entry["pred_answer"]}]

        gts = self.tokenizer.tokenize(gts)
        res = self.tokenizer.tokenize(res)
        score, _ = self.scorer.compute_score(gts, res)

        bleu4 = score[3]  # score is (Bleu-1, Bleu-2, Bleu-3, Bleu-4)
        return bleu4
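A minimal usage sketch (not part of the commit), assuming the llava package is importable; note that _compute_answer_scores asserts exactly 10 ground-truth answers per sample, matching the VQA annotation format:

from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator

evaluator = TextVQAAccuracyEvaluator()
# Toy sample: "two" and "2" both normalize to "2", so the soft score is 1.0.
pred_list = [{"pred_answer": "two", "gt_answers": ["two"] * 7 + ["2"] * 3}]
print(evaluator.eval_pred_list(pred_list))  # 1.0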
llava/eval/model_qa.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria
import torch
import os
import json
from tqdm import tqdm
import shortuuid

from llava.conversation import default_conversation
from llava.utils import disable_torch_init


# new stopping implementation
class KeywordsStoppingCriteria(StoppingCriteria):
    def __init__(self, keywords, tokenizer, input_ids):
        self.keywords = keywords
        self.tokenizer = tokenizer
        self.start_len = None
        self.input_ids = input_ids

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        if self.start_len is None:
            self.start_len = self.input_ids.shape[1]
        else:
            outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len:], skip_special_tokens=True)[0]
            for keyword in self.keywords:
                if keyword in outputs:
                    return True
        return False


@torch.inference_mode()
def eval_model(model_name, questions_file, answers_file):
    # Model
    disable_torch_init()
    model_name = os.path.expanduser(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(model_name,
        torch_dtype=torch.float16).cuda()


    ques_file = open(os.path.expanduser(questions_file), "r")
    ans_file = open(os.path.expanduser(answers_file), "w")
    for i, line in enumerate(tqdm(ques_file)):
        idx = json.loads(line)["question_id"]
        qs = json.loads(line)["text"]
        cat = json.loads(line)["category"]
        conv = default_conversation.copy()
        conv.append_message(conv.roles[0], qs)
        prompt = conv.get_prompt()
        inputs = tokenizer([prompt])
        input_ids = torch.as_tensor(inputs.input_ids).cuda()
        stopping_criteria = KeywordsStoppingCriteria([conv.sep], tokenizer, input_ids)
        output_ids = model.generate(
            input_ids,
            do_sample=True,
            use_cache=True,
            temperature=0.7,
            max_new_tokens=1024,
            stopping_criteria=[stopping_criteria])
        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
        try:
            index = outputs.index(conv.sep, len(prompt))
        except ValueError:
            outputs += conv.sep
            index = outputs.index(conv.sep, len(prompt))

        outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
        ans_id = shortuuid.uuid()
        ans_file.write(json.dumps({"question_id": idx,
                                   "text": outputs,
                                   "answer_id": ans_id,
                                   "model_id": model_name,
                                   "metadata": {}}) + "\n")
        ans_file.flush()
    ans_file.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
    args = parser.parse_args()

    eval_model(args.model_name, args.question_file, args.answers_file)
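For reference, a hypothetical programmatic call of the script above; the checkpoint path is a placeholder, and a CUDA GPU plus the llava package are assumed to be available:

from llava.eval.model_qa import eval_model  # assumed import path

eval_model(model_name="path/to/your-model",
           questions_file="tables/question.jsonl",
           answers_file="answer.jsonl")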
llava/eval/model_vqa.py
ADDED
@@ -0,0 +1,125 @@
import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria

from PIL import Image
import math


def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks"""
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    chunks = split_list(lst, n)
    return chunks[k]


def eval_model(args):
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    meta_pth = '/opt/data/private/metas/unsplash_ISO300-_PIL_1024_x2x4_APEX.txt'
    img_pths = []
    with open(meta_pth, 'r') as f:
        for line in f.readlines():
            img_pths.append(line.split('\t')[0])
    f.close()

    img_pths = get_chunk(img_pths, args.num_chunks, args.chunk_idx)

    # split to batch 8
    img_pths = split_list(img_pths, 8)


    questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
    ans_file = open(answers_file, "w")
    for line in tqdm(questions):
        idx = line["question_id"]
        image_file = line["image"]
        qs = line["text"]
        cur_prompt = qs
        if model.config.mm_use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

        conv = conv_templates[args.conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

        image = Image.open(os.path.join(args.image_folder, image_file))
        image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]

        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=image_tensor.unsqueeze(0).half().cuda(),
                do_sample=True if args.temperature > 0 else False,
                temperature=args.temperature,
                top_p=args.top_p,
                num_beams=args.num_beams,
                # no_repeat_ngram_size=3,
                max_new_tokens=1024,
                use_cache=True)

        input_token_len = input_ids.shape[1]
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
        outputs = outputs.strip()
        if outputs.endswith(stop_str):
            outputs = outputs[:-len(stop_str)]
        outputs = outputs.strip()

        ans_id = shortuuid.uuid()
        ans_file.write(json.dumps({"question_id": idx,
                                   "prompt": cur_prompt,
                                   "text": outputs,
                                   "answer_id": ans_id,
                                   "model_id": model_name,
                                   "metadata": {}}) + "\n")
        ans_file.flush()
    ans_file.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-folder", type=str, default="")
    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    args = parser.parse_args()

    eval_model(args)
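An illustrative programmatic equivalent of the CLI above; all paths are placeholders, the import path is assumed, and note that the hard-coded meta_pth file inside eval_model must exist (or be adapted) before this runs:

from argparse import Namespace
from llava.eval.model_vqa import eval_model  # assumed import path

args = Namespace(
    model_path="path/to/llava-checkpoint", model_base=None,
    image_folder="path/to/coco/images", question_file="tables/question.jsonl",
    answers_file="answers/llava.jsonl", conv_mode="llava_v1",
    num_chunks=1, chunk_idx=0, temperature=0.2, top_p=None, num_beams=1)
eval_model(args)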
llava/eval/model_vqa_loader.py
ADDED
@@ -0,0 +1,144 @@
import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from torch.utils.data import Dataset, DataLoader

from PIL import Image
import math


def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks"""
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    chunks = split_list(lst, n)
    return chunks[k]


# Custom dataset class
class CustomDataset(Dataset):
    def __init__(self, questions, image_folder, tokenizer, image_processor, model_config):
        self.questions = questions
        self.image_folder = image_folder
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.model_config = model_config

    def __getitem__(self, index):
        line = self.questions[index]
        image_file = line["image"]
        qs = line["text"]
        if self.model_config.mm_use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

        conv = conv_templates[args.conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
        image_tensor = process_images([image], self.image_processor, self.model_config)[0]

        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')

        return input_ids, image_tensor

    def __len__(self):
        return len(self.questions)


# DataLoader
def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4):
    assert batch_size == 1, "batch_size must be 1"
    dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config)
    data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
    return data_loader


def eval_model(args):
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
    ans_file = open(answers_file, "w")

    if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
        args.conv_mode = args.conv_mode + '_mmtag'
        print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')

    data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config)

    for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)):
        idx = line["question_id"]
        cur_prompt = line["text"]

        stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2
        input_ids = input_ids.to(device='cuda', non_blocking=True)

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True),
                do_sample=True if args.temperature > 0 else False,
                temperature=args.temperature,
                top_p=args.top_p,
                num_beams=args.num_beams,
                max_new_tokens=128,
                use_cache=True)

        input_token_len = input_ids.shape[1]
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
        outputs = outputs.strip()
        if outputs.endswith(stop_str):
            outputs = outputs[:-len(stop_str)]
        outputs = outputs.strip()

        ans_id = shortuuid.uuid()
        ans_file.write(json.dumps({"question_id": idx,
                                   "prompt": cur_prompt,
                                   "text": outputs,
                                   "answer_id": ans_id,
                                   "model_id": model_name,
                                   "metadata": {}}) + "\n")
        # ans_file.flush()
    ans_file.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-folder", type=str, default="")
    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    args = parser.parse_args()

    eval_model(args)
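The --num-chunks/--chunk-idx flags above exist so the question file can be sharded across GPUs; a sketch of one possible launcher (module path, checkpoint path, and GPU count are assumptions):

import os
import subprocess

procs = []
for idx in range(8):  # one shard per GPU
    env = {**os.environ, "CUDA_VISIBLE_DEVICES": str(idx)}
    procs.append(subprocess.Popen(
        ["python", "-m", "llava.eval.model_vqa_loader",
         "--model-path", "path/to/llava-checkpoint",
         "--question-file", "questions.jsonl",
         "--answers-file", f"answers/chunk{idx}.jsonl",
         "--num-chunks", "8", "--chunk-idx", str(idx)],
        env=env))
for p in procs:
    p.wait()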
llava/eval/model_vqa_mmbench.py
ADDED
@@ -0,0 +1,170 @@
import argparse
import torch
import os
import json
import pandas as pd
from tqdm import tqdm
import shortuuid

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, process_images, load_image_from_base64, get_model_name_from_path

from PIL import Image
import math


all_options = ['A', 'B', 'C', 'D']


def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks"""
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    chunks = split_list(lst, n)
    return chunks[k]


def is_none(value):
    if value is None:
        return True
    if type(value) is float and math.isnan(value):
        return True
    if type(value) is str and value.lower() == 'nan':
        return True
    if type(value) is str and value.lower() == 'none':
        return True
    return False

def get_options(row, options):
    parsed_options = []
    for option in options:
        option_value = row[option]
        if is_none(option_value):
            break
        parsed_options.append(option_value)
    return parsed_options


def eval_model(args):
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    questions = pd.read_table(os.path.expanduser(args.question_file))
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
    ans_file = open(answers_file, "w")

    if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
        args.conv_mode = args.conv_mode + '_mmtag'
        print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')

    for index, row in tqdm(questions.iterrows(), total=len(questions)):
        options = get_options(row, all_options)
        cur_option_char = all_options[:len(options)]

        if args.all_rounds:
            num_rounds = len(options)
        else:
            num_rounds = 1

        for round_idx in range(num_rounds):
            idx = row['index']
            question = row['question']
            hint = row['hint']
            image = load_image_from_base64(row['image'])
            if not is_none(hint):
                question = hint + '\n' + question
            for option_char, option in zip(all_options[:len(options)], options):
                question = question + '\n' + option_char + '. ' + option
            qs = cur_prompt = question
            if model.config.mm_use_im_start_end:
                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
            else:
                qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

            if args.single_pred_prompt:
                if args.lang == 'cn':
                    qs = qs + '\n' + "请直接回答选项字母。"
                else:
                    qs = qs + '\n' + "Answer with the option's letter from the given choices directly."

            conv = conv_templates[args.conv_mode].copy()
            conv.append_message(conv.roles[0], qs)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()

            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

            image_tensor = process_images([image], image_processor, model.config)[0]
            # image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]

            stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor.unsqueeze(0).half().cuda(),
                    do_sample=True if args.temperature > 0 else False,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    num_beams=args.num_beams,
                    # no_repeat_ngram_size=3,
                    max_new_tokens=1024,
                    use_cache=True)

            input_token_len = input_ids.shape[1]
            n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
            if n_diff_input_output > 0:
                print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
            outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
            outputs = outputs.strip()
            if outputs.endswith(stop_str):
                outputs = outputs[:-len(stop_str)]
            outputs = outputs.strip()

            ans_id = shortuuid.uuid()
            ans_file.write(json.dumps({"question_id": idx,
                                       "round_id": round_idx,
                                       "prompt": cur_prompt,
                                       "text": outputs,
                                       "options": options,
                                       "option_char": cur_option_char,
                                       "answer_id": ans_id,
                                       "model_id": model_name,
                                       "metadata": {}}) + "\n")
            ans_file.flush()

            # rotate options
            options = options[1:] + options[:1]
            cur_option_char = cur_option_char[1:] + cur_option_char[:1]
    ans_file.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-folder", type=str, default="")
    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    parser.add_argument("--all-rounds", action="store_true")
    parser.add_argument("--single-pred-prompt", action="store_true")
    parser.add_argument("--lang", type=str, default="en")
    args = parser.parse_args()

    eval_model(args)
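When --all-rounds is set, each MMBench question is asked once per answer choice, with the options rotated between rounds while the letter labels stay fixed; a standalone illustration of the rotation at the end of the loop above (the option texts are made up):

all_option_chars = ["A", "B", "C"]
options = ["cat", "dog", "bird"]
for round_idx in range(len(options)):
    # how the prompt labels the (rotated) choices in each round
    print(round_idx, list(zip(all_option_chars, options)))
    options = options[1:] + options[:1]
# round 0: A=cat B=dog C=bird / round 1: A=dog B=bird C=cat / round 2: A=bird B=cat C=dog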
llava/eval/model_vqa_science.py
ADDED
@@ -0,0 +1,147 @@
import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria

from PIL import Image
import math


def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks"""
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    chunks = split_list(lst, n)
    return chunks[k]


def eval_model(args):
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    questions = json.load(open(os.path.expanduser(args.question_file), "r"))
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
    ans_file = open(answers_file, "w")
    for i, line in enumerate(tqdm(questions)):
        idx = line["id"]
        question = line['conversations'][0]
        qs = question['value'].replace('<image>', '').strip()
        cur_prompt = qs

        if 'image' in line:
            image_file = line["image"]
            image = Image.open(os.path.join(args.image_folder, image_file))
            image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
            images = image_tensor.unsqueeze(0).half().cuda()
            if getattr(model.config, 'mm_use_im_start_end', False):
                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
            else:
                qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
            cur_prompt = '<image>' + '\n' + cur_prompt
        else:
            images = None

        if args.single_pred_prompt:
            qs = qs + '\n' + "Answer with the option's letter from the given choices directly."
            cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly."

        conv = conv_templates[args.conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = [KeywordsStoppingCriteria(keywords, tokenizer, input_ids)] if conv.version == "v0" else None

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=images,
                do_sample=True if args.temperature > 0 else False,
                temperature=args.temperature,
                max_new_tokens=1024,
                use_cache=True,
                stopping_criteria=stopping_criteria,
            )

        input_token_len = input_ids.shape[1]
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
        outputs = outputs.strip()
        if outputs.endswith(stop_str):
            outputs = outputs[:-len(stop_str)]
        outputs = outputs.strip()

        # prompt for answer
        if args.answer_prompter:
            outputs_reasoning = outputs
            input_ids = tokenizer_image_token(prompt + outputs_reasoning + ' ###\nANSWER:', tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=images,
                    do_sample=True if args.temperature > 0 else False,
                    temperature=args.temperature,
                    max_new_tokens=64,
                    use_cache=True,
                    stopping_criteria=stopping_criteria)

            input_token_len = input_ids.shape[1]
            n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
            if n_diff_input_output > 0:
                print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
            outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
            outputs = outputs.strip()
            if outputs.endswith(stop_str):
                outputs = outputs[:-len(stop_str)]
            outputs = outputs.strip()
            outputs = outputs_reasoning + '\n The answer is ' + outputs

        ans_id = shortuuid.uuid()
        ans_file.write(json.dumps({"question_id": idx,
                                   "prompt": cur_prompt,
                                   "text": outputs,
                                   "answer_id": ans_id,
                                   "model_id": model_name,
                                   "metadata": {}}) + "\n")
        ans_file.flush()
    ans_file.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-folder", type=str, default="")
    parser.add_argument("--question-file", type=str, default="tables/question.json")
    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v0")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--answer-prompter", action="store_true")
    parser.add_argument("--single-pred-prompt", action="store_true")
    args = parser.parse_args()

    eval_model(args)
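Each record in the ScienceQA question file consumed above only needs the fields the loop actually reads ('id', 'conversations', and optionally 'image'); a minimal illustrative entry, with made-up values:

example_question = {
    "id": "4123",                 # question id written back to the answers file
    "image": "4123/image.png",    # optional; text-only questions omit this key
    "conversations": [
        # only the 'value' field of the first turn is read by the script
        {"from": "human",
         "value": "<image>\nWhich property do these objects share?\nA. hard\nB. soft"}
    ],
}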
llava/eval/qa_baseline_gpt35.py
ADDED
@@ -0,0 +1,74 @@
"""Generate answers with GPT-3.5"""
# Note: you need to be using OpenAI Python v0.27.0 for the code below to work
import argparse
import json
import os
import time
import concurrent.futures

import openai
import tqdm
import shortuuid

MODEL = 'gpt-3.5-turbo'
MODEL_ID = 'gpt-3.5-turbo:20230327'

def get_answer(question_id: int, question: str, max_tokens: int):
    ans = {
        'answer_id': shortuuid.uuid(),
        'question_id': question_id,
        'model_id': MODEL_ID,
    }
    for _ in range(3):
        try:
            response = openai.ChatCompletion.create(
                model=MODEL,
                messages=[{
                    'role': 'system',
                    'content': 'You are a helpful assistant.'
                }, {
                    'role': 'user',
                    'content': question,
                }],
                max_tokens=max_tokens,
            )
            ans['text'] = response['choices'][0]['message']['content']
            return ans
        except Exception as e:
            print('[ERROR]', e)
            ans['text'] = '#ERROR#'
            time.sleep(1)
    return ans


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ChatGPT answer generation.')
    parser.add_argument('-q', '--question')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    questions_dict = {}
    with open(os.path.expanduser(args.question)) as f:
        for line in f:
            if not line:
                continue
            q = json.loads(line)
            questions_dict[q['question_id']] = q['text']

    answers = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
        futures = []
        for qid, question in questions_dict.items():
            future = executor.submit(get_answer, qid, question, args.max_tokens)
            futures.append(future)

        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            answers.append(future.result())

    answers.sort(key=lambda x: x['question_id'])

    with open(os.path.expanduser(args.output), 'w') as f:
        table = [json.dumps(ans) for ans in answers]
        f.write('\n'.join(table))
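A hypothetical direct call of get_answer() above; the import path is assumed, openai==0.27.x must be installed with OPENAI_API_KEY set in the environment, and the question text is made up:

from llava.eval.qa_baseline_gpt35 import get_answer  # assumed import path

ans = get_answer(1, "List three common causes of overfitting.", max_tokens=256)
print(ans["text"])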
llava/eval/run_llava.py
ADDED
@@ -0,0 +1,97 @@
import argparse
import torch

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria

from PIL import Image

import requests
from PIL import Image
from io import BytesIO


def load_image(image_file):
    if image_file.startswith('http') or image_file.startswith('https'):
        response = requests.get(image_file)
        image = Image.open(BytesIO(response.content)).convert('RGB')
    else:
        image = Image.open(image_file).convert('RGB')
    return image


def eval_model(args):
    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)

    qs = args.query
    if model.config.mm_use_im_start_end:
        qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
    else:
        qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

    if 'llama-2' in model_name.lower():
        conv_mode = "llava_llama_2"
    elif "v1" in model_name.lower():
        conv_mode = "llava_v1"
    elif "mpt" in model_name.lower():
        conv_mode = "mpt"
    else:
        conv_mode = "llava_v0"

    if args.conv_mode is not None and conv_mode != args.conv_mode:
        print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode))
    else:
        args.conv_mode = conv_mode

    conv = conv_templates[args.conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    image = load_image(args.image_file)
    image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda()

    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            do_sample=True,
            temperature=0.2,
            max_new_tokens=1024,
            use_cache=True,
            stopping_criteria=[stopping_criteria])

    input_token_len = input_ids.shape[1]
    n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
    if n_diff_input_output > 0:
        print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
    outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
    outputs = outputs.strip()
    if outputs.endswith(stop_str):
        outputs = outputs[:-len(stop_str)]
    outputs = outputs.strip()
    print(outputs)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-file", type=str, required=True)
    parser.add_argument("--query", type=str, required=True)
    parser.add_argument("--conv-mode", type=str, default=None)
    args = parser.parse_args()

    eval_model(args)
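An illustrative single-image query through the file above; the import path, checkpoint path, and image URL are placeholders, and a CUDA GPU plus the llava package are assumed:

from argparse import Namespace
from llava.eval.run_llava import eval_model  # assumed import path

eval_model(Namespace(
    model_path="path/to/llava-checkpoint",
    model_base=None,
    image_file="https://example.com/cat.jpg",
    query="Describe this image in one sentence.",
    conv_mode=None))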
llava/eval/summarize_gpt_review.py
ADDED
@@ -0,0 +1,60 @@
import json
import os
from collections import defaultdict

import numpy as np

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-d', '--dir', default=None)
    parser.add_argument('-v', '--version', default=None)
    parser.add_argument('-s', '--select', nargs='*', default=None)
    parser.add_argument('-f', '--files', nargs='*', default=[])
    parser.add_argument('-i', '--ignore', nargs='*', default=[])
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()

    if args.ignore is not None:
        args.ignore = [int(x) for x in args.ignore]

    if len(args.files) > 0:
        review_files = args.files
    else:
        review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)]

    for review_file in sorted(review_files):
        config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '')
        if args.select is not None and any(x not in config for x in args.select):
            continue
        if '0613' in config:
            version = '0613'
        else:
            version = '0314'
        if args.version is not None and args.version != version:
            continue
        scores = defaultdict(list)
        print(config)
        with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f:
            for review_str in f:
                review = json.loads(review_str)
                if review['question_id'] in args.ignore:
                    continue
                if 'category' in review:
                    scores[review['category']].append(review['tuple'])
                    scores['all'].append(review['tuple'])
                else:
                    if 'tuple' in review:
                        scores['all'].append(review['tuple'])
                    else:
                        scores['all'].append(review['score'])
        for k, v in sorted(scores.items()):
            stats = np.asarray(v).mean(0).tolist()
            stats = [round(x, 3) for x in stats]
            # print(k, stats, round(stats[1]/stats[0]*100, 1))
            print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1))
        print('=================================')
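For reference, the review files summarized above hold one JSON object per line; a minimal record with the fields this script reads (interpreting the two 'tuple' entries as the per-assistant scores whose means and ratio are printed is an assumption based on the arithmetic above):

import json

record = {"question_id": 1, "category": "conv", "tuple": [8.0, 7.5]}
print(json.dumps(record))
# A directory of such *.jsonl files would then be summarized with:
#   python llava/eval/summarize_gpt_review.py -d path/to/reviews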