# Convert Flickr30K Entities annotations (Sentences text + Annotations XML boxes)
# into an ODVG-format jsonl file.
import xml.etree.ElementTree as ET | |
import jsonlines | |
import random | |
from tqdm import tqdm | |
import argparse | |
import os | |
import glob | |
def get_sentence_data(fn):
    """
    Parse one sentence file from the Flickr30K Entities dataset.

    input:
      fn - full path to the sentence file

    output:
      a list with one dict per sentence:
        sentence - the sentence with bracket markup stripped
        phrases  - list of dicts, one per annotated phrase:
            phrase           - the annotated phrase text
            first_word_index - index of the phrase's first word in the sentence
            phrase_id        - identifier string for the phrase
            phrase_type      - list of coarse categories for the phrase
    """
    with open(fn, 'r') as f:
        raw_lines = f.read().split('\n')

    parsed = []
    for line in raw_lines:
        if not line:
            continue

        words = []
        phrase_starts = []
        phrase_texts = []
        phrase_ids = []
        phrase_types = []
        buffer = []
        inside_phrase = False

        for token in line.split():
            if not inside_phrase:
                if token.startswith('['):
                    # Opening marker looks like "[/EN#<id>/<type>[/<type>...]";
                    # it carries metadata only and contributes no words.
                    inside_phrase = True
                    phrase_starts.append(len(words))
                    fields = token.split('/')
                    phrase_ids.append(fields[1][3:])  # drop the "EN#" prefix
                    phrase_types.append(fields[2:])
                else:
                    words.append(token)
            else:
                if token.endswith(']'):
                    # Closing bracket ends the current phrase span.
                    inside_phrase = False
                    token = token[:-1]
                    buffer.append(token)
                    phrase_texts.append(' '.join(buffer))
                    buffer = []
                else:
                    buffer.append(token)
                words.append(token)

        entry = {'sentence': ' '.join(words), 'phrases': []}
        for start, text, pid, ptype in zip(phrase_starts, phrase_texts,
                                           phrase_ids, phrase_types):
            entry['phrases'].append({'first_word_index': start,
                                     'phrase': text,
                                     'phrase_id': pid,
                                     'phrase_type': ptype})
        parsed.append(entry)
    return parsed
def get_annotations(fn):
    """
    Parse one annotation XML file from the Flickr30K Entities dataset.

    input:
      fn - full path to the annotation XML file

    output:
      dict with the fields:
        filename - image filename from the XML
        width/height/depth - image dimensions (from the <size> element)
        scene - identifiers annotated as pertaining to the whole scene
        nobox - identifiers annotated as not visible in the image
        boxes - maps identifier -> list of [xmin, ymin, xmax, ymax] boxes
                (coordinates shifted to be 0-based)
    """
    root = ET.parse(fn).getroot()
    info = {'filename': root.findall('filename')[0].text,
            'boxes': {}, 'scene': [], 'nobox': []}

    # <size> children are width/height/depth; store each as an int.
    for dim in root.findall('size')[0]:
        info[dim.tag] = int(dim.text)

    for obj in root.findall('object'):
        bnd = obj.findall('bndbox')
        # An <object> may carry several <name> ids; each id shares the
        # object's box (or its nobndbox/scene flags when no box exists).
        for name_node in obj.findall('name'):
            identifier = name_node.text
            if bnd:
                # XML coordinates are 1-based; subtract 1 to make them 0-based.
                coords = [int(bnd[0].findall(tag)[0].text) - 1
                          for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
                info['boxes'].setdefault(identifier, []).append(coords)
            else:
                if int(obj.findall('nobndbox')[0].text) > 0:
                    info['nobox'].append(identifier)
                if int(obj.findall('scene')[0].text) > 0:
                    info['scene'].append(identifier)
    return info
def gen_record(sd, an):
    """
    Combine one parsed sentence (from get_sentence_data) with its image
    annotations (from get_annotations) into a single ODVG grounding record.

    Returns None (after printing a notice) when no phrase in the sentence
    has a bounding box in the annotations.
    """
    matched = []
    for phrase in sd["phrases"]:
        boxes = an["boxes"].get(phrase["phrase_id"])
        if boxes is None:
            # Phrase was annotated as scene/nobox - nothing to ground.
            continue
        matched.extend({"phrase": phrase["phrase"], "bbox": box}
                       for box in boxes)
    if not matched:
        print("no phrase regions")
        return None
    return {
        "filename": an["filename"],
        "height": an["height"],
        "width": an["width"],
        "grounding": {
            "caption": sd["sentence"],
            "regions": matched,
        },
    }
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="flickr30k entities to ODVG List.")
    parser.add_argument("--root", type=str, default="", help="Source anno root")
    parser.add_argument("--output_file", type=str, default="flickr30k_entities_odvg.jsonl")
    parser.add_argument("--osoi", action="store_true", default=False)
    args = parser.parse_args()
    print(args)

    sentence_list = sorted(glob.glob(os.path.join(args.root, "Sentences", "*")))
    annotation_list = sorted(glob.glob(os.path.join(args.root, "Annotations", "*")))
    # Files pair up positionally after sorting; a length mismatch means the
    # dataset layout is broken, so fail fast instead of silently mispairing.
    if len(sentence_list) != len(annotation_list):
        raise RuntimeError(
            "Sentences/Annotations count mismatch: "
            f"{len(sentence_list)} vs {len(annotation_list)}")

    odvg_anno = []
    for sent_fn, anno_fn in tqdm(zip(sentence_list, annotation_list),
                                 total=len(annotation_list)):
        sds = get_sentence_data(sent_fn)
        an = get_annotations(anno_fn)
        if args.osoi:
            # "one sentence per image": keep a single random caption.
            # Guard against an empty sentence file, which would previously
            # crash random.randint(0, -1).
            if sds:
                x = gen_record(random.choice(sds), an)
                if x:
                    odvg_anno.append(x)
        else:
            for sd in sds:
                x = gen_record(sd, an)
                if x:
                    odvg_anno.append(x)

    with jsonlines.open(args.output_file, mode="w") as fwriter:
        fwriter.write_all(odvg_anno)