import argparse
import glob
import os
import random
import xml.etree.ElementTree as ET


def get_sentence_data(fn):
    """
    Parses a sentence file from the Flickr30K Entities dataset

    input:
      fn - full file path to the sentence file to parse

    output:
      a list of dictionaries for each sentence with the following fields:
          sentence - the original sentence (annotation markup removed)
          phrases - a list of dictionaries for each phrase with the
                    following fields:
                      first_word_index - the position of the first word of
                                         the phrase in the sentence
                      phrase - the text of the annotated phrase
                      phrase_id - an identifier for this phrase
                      phrase_type - a list of the coarse categories this
                                    phrase belongs to

    Empty lines in the file are skipped.  A phrase that is opened with '['
    but never closed with ']' contributes its words to the sentence but is
    not reported in `phrases`.
    """
    with open(fn, 'r') as f:
        sentences = f.read().split('\n')

    annotations = []
    for sentence in sentences:
        if not sentence:
            continue

        words = []
        phrases = []
        current_phrase = []
        in_phrase = False
        # Metadata of the phrase currently being read; set by the opening tag.
        first_word_index = None
        phrase_id = None
        phrase_type = None

        for token in sentence.split():
            if in_phrase:
                if token.endswith(']'):
                    # Closing bracket: strip it and emit the finished phrase.
                    in_phrase = False
                    token = token[:-1]
                    current_phrase.append(token)
                    phrases.append({
                        'first_word_index': first_word_index,
                        'phrase': ' '.join(current_phrase),
                        'phrase_id': phrase_id,
                        'phrase_type': phrase_type,
                    })
                    current_phrase = []
                else:
                    current_phrase.append(token)
                # Words inside a phrase are still part of the plain sentence.
                words.append(token)
            elif token.startswith('['):
                # Opening tag, e.g. "[/EN#283585/people": it is markup only,
                # so it is not added to the sentence words.
                in_phrase = True
                first_word_index = len(words)
                parts = token.split('/')
                phrase_id = parts[1][3:]  # drop the 3-character id prefix
                phrase_type = parts[2:]
            else:
                words.append(token)

        annotations.append({'sentence': ' '.join(words), 'phrases': phrases})

    return annotations


def get_annotations(fn):
    """
    Parses the xml files in the Flickr30K Entities dataset

    input:
      fn - full file path to the annotations file to parse

    output:
      dictionary with the following fields:
          filename - text of the <filename> element
          scene - list of identifiers which were annotated as
                  pertaining to the whole scene
          nobox - list of identifiers which were annotated as
                  not being visible in the image
          boxes - a dictionary where the fields are identifiers
                  and the values are its list of boxes in the
                  [xmin ymin xmax ymax] format
          plus one integer field per child of the <size> element, keyed by
          the child's tag (gen_record reads 'height' and 'width')
    """
    root = ET.parse(fn).getroot()
    filename = root.findall('filename')[0].text
    size_container = root.findall('size')[0]
    anno_info = {'filename': filename, 'boxes': {}, 'scene': [], 'nobox': []}
    for size_element in size_container:
        anno_info[size_element.tag] = int(size_element.text)

    for object_container in root.findall('object'):
        box_container = object_container.findall('bndbox')
        for names in object_container.findall('name'):
            box_id = names.text
            if box_container:
                box = box_container[0]
                # Every coordinate is stored shifted down by one (presumably
                # converting 1-based pixel coordinates to 0-based).
                coords = [
                    int(box.findall(tag)[0].text) - 1
                    for tag in ('xmin', 'ymin', 'xmax', 'ymax')
                ]
                anno_info['boxes'].setdefault(box_id, []).append(coords)
            else:
                nobndbox = int(object_container.findall('nobndbox')[0].text)
                if nobndbox > 0:
                    anno_info['nobox'].append(box_id)

                scene = int(object_container.findall('scene')[0].text)
                if scene > 0:
                    anno_info['scene'].append(box_id)

    return anno_info


def gen_record(sd, an):
    """
    Builds one output record from a parsed sentence and its image annotation

    input:
      sd - one sentence dictionary as returned by get_sentence_data
      an - the annotation dictionary as returned by get_annotations

    output:
      a dictionary with the fields filename, height, width and grounding
      (caption plus one region per box of every phrase that has boxes), or
      None (after printing a notice) when no phrase of the sentence has a box
    """
    boxes = an["boxes"]
    regions = [
        {"phrase": ph["phrase"], "bbox": box}
        for ph in sd["phrases"]
        if ph["phrase_id"] in boxes
        for box in boxes[ph["phrase_id"]]
    ]
    if not regions:
        print("no phrase regions")
        return None
    return {
        "filename": an["filename"],
        "height": an["height"],
        "width": an["width"],
        "grounding": {
            "caption": sd["sentence"],
            "regions": regions,
        },
    }


def _stem(path):
    """Returns the file name of `path` without directory and extension."""
    return os.path.splitext(os.path.basename(path))[0]


def pair_annotation_files(root):
    """
    Matches every annotation file with the sentence file of the same name

    input:
      root - directory containing the "Sentences" and "Annotations" folders

    output:
      a list of (sentence_path, annotation_path) tuples ordered by annotation
      path.  Files are matched on their name without extension instead of on
      their position in the two directory listings, so a missing or extra
      file cannot shift captions onto the wrong image.  Annotation files
      without a sentence file are reported and skipped.
    """
    sentence_paths = sorted(glob.glob(os.path.join(root, "Sentences", "*")))
    annotation_paths = sorted(glob.glob(os.path.join(root, "Annotations", "*")))
    sentences_by_stem = {_stem(path): path for path in sentence_paths}

    pairs = []
    for annotation_path in annotation_paths:
        sentence_path = sentences_by_stem.get(_stem(annotation_path))
        if sentence_path is None:
            print("no sentence file for", annotation_path)
            continue
        pairs.append((sentence_path, annotation_path))
    return pairs


def build_records(root, osoi=False, progress=None):
    """
    Converts every sentence/annotation pair under `root` into records

    input:
      root - directory containing the "Sentences" and "Annotations" folders
      osoi - when true, only one randomly chosen sentence per annotation
             file is converted instead of all of them
      progress - optional callable wrapping the iterable of file pairs
                 (e.g. tqdm) to report progress

    output:
      list of the non-None results of gen_record
    """
    pairs = pair_annotation_files(root)
    if progress is not None:
        pairs = progress(pairs)

    records = []
    for sentence_path, annotation_path in pairs:
        sentences = get_sentence_data(sentence_path)
        annotation = get_annotations(annotation_path)
        if osoi:
            # A sentence file may be empty; there is nothing to sample then.
            selected = [random.choice(sentences)] if sentences else []
        else:
            selected = sentences
        for sd in selected:
            record = gen_record(sd, annotation)
            if record:
                records.append(record)
    return records


def main():
    """Command line entry point: parses arguments and writes the jsonl file."""
    # Only the command line conversion needs these third-party packages;
    # importing them here keeps the parsing helpers importable without them.
    import jsonlines
    from tqdm import tqdm

    parser = argparse.ArgumentParser(description="flickr30k entities to ODVG List.")
    parser.add_argument("--root", type=str, default="", help="Source anno root")
    parser.add_argument("--output_file", type=str, default="flickr30k_entities_odvg.jsonl")
    # --osoi: convert a single random sentence per annotation file.
    parser.add_argument("--osoi", action="store_true", default=False)
    args = parser.parse_args()
    print(args)

    odvg_anno = build_records(args.root, osoi=args.osoi, progress=tqdm)
    with jsonlines.open(args.output_file, mode="w") as fwriter:
        fwriter.write_all(odvg_anno)


if __name__ == "__main__":
    main()