chendl's picture
Add application file
0b7b08a
raw
history blame
5.89 kB
# These are manual mappings that are not caught by our stemming rules or that
# would be done incorrectly by our automatic stemming rules. In detail,
# the keys of the _MANUAL_MATCHES dict contain the original word and the values
# contain the transformation of the word expected by the OKVQA stemming rule.
# These manual rules were found by checking the `raw_answers` and the `answers`
# fields of the released OKVQA dataset and checking all things that were not
# properly mapped by our automatic rules. In particular, some of the mappings
# are identities, e.g. christmas -> christmas, which would otherwise be
# incorrectly singularized by our inflection.singularize.
import re
import nltk
from nltk.corpus.reader import VERB
import inflection
# Lookup table of hand-written stems: key = token as it appears in the input,
# value = the stem to emit for it. OKVQAStemmer.stem checks this table first,
# so a token found here never reaches the "-ing" lemmatization or the
# plural-noun singularization branches. Identity entries (e.g. "police",
# "christmas") exist precisely to keep those automatic rules away from the word.
_MANUAL_MATCHES = {
"police": "police",
"las": "las",
"vegas": "vegas",
"yes": "yes",
"jeans": "jean",
"hell's": "hell",
"domino's": "domino",
"morning": "morn",
"clothes": "cloth",
"are": "are",
"riding": "ride",
"leaves": "leaf",
"dangerous": "danger",
"clothing": "cloth",
"texting": "text",
"kiting": "kite",
"firefighters": "firefight",
"ties": "tie",
"married": "married",
"teething": "teeth",
"gloves": "glove",
"tennis": "tennis",
"dining": "dine",
"directions": "direct",
"waves": "wave",
"christmas": "christmas",
"drives": "drive",
"pudding": "pud",
"coding": "code",
"plating": "plate",
"quantas": "quanta",
"hornes": "horn",
"graves": "grave",
"mating": "mate",
"paned": "pane",
"alertness": "alert",
"sunbathing": "sunbath",
"tenning": "ten",
"wetness": "wet",
"urinating": "urine",
"sickness": "sick",
"braves": "brave",
"firefighting": "firefight",
"lenses": "lens",
"reflections": "reflect",
"backpackers": "backpack",
"eatting": "eat",
"designers": "design",
"curiousity": "curious",
"playfulness": "play",
"blindness": "blind",
"hawke": "hawk",
"tomatoe": "tomato",
"rodeoing": "rodeo",
"brightness": "bright",
"circuses": "circus",
"skateboarders": "skateboard",
"staring": "stare",
"electronics": "electron",
"electicity": "elect",
"mountainous": "mountain",
"socializing": "social",
"hamburgers": "hamburg",
"caves": "cave",
"transitions": "transit",
"wading": "wade",
"creame": "cream",
"toileting": "toilet",
"sautee": "saute",
"buildings": "build",
"belongings": "belong",
"stockings": "stock",
"walle": "wall",
"cumulis": "cumuli",
"travelers": "travel",
"conducter": "conduct",
"browsing": "brows",
"pooping": "poop",
"haircutting": "haircut",
"toppings": "top",
"hearding": "heard",
"sunblocker": "sunblock",
"bases": "base",
"markings": "mark",
"mopeds": "mope",
"kindergartener": "kindergarten",
"pies": "pie",
"scrapbooking": "scrapbook",
"couponing": "coupon",
"meetings": "meet",
"elevators": "elev",
"lowes": "low",
"men's": "men",
"childrens": "children",
"shelves": "shelve",
"paintings": "paint",
"raines": "rain",
"paring": "pare",
"expressions": "express",
"routes": "rout",
"pease": "peas",
"vastness": "vast",
"awning": "awn",
"boy's": "boy",
"drunkenness": "drunken",
"teasing": "teas",
"conferences": "confer",
"ripeness": "ripe",
"suspenders": "suspend",
"earnings": "earn",
"reporters": "report",
"kid's": "kid",
"containers": "contain",
"corgie": "corgi",
"porche": "porch",
"microwaves": "microwave",
"batter's": "batter",
"sadness": "sad",
"apartments": "apart",
"oxygenize": "oxygen",
"striping": "stripe",
"purring": "pure",
"professionals": "profession",
"piping": "pipe",
"farmer's": "farmer",
"potatoe": "potato",
"emirates": "emir",
"womens": "women",
"veteran's": "veteran",
"wilderness": "wilder",
"propellers": "propel",
"alpes": "alp",
"charioteering": "chariot",
"swining": "swine",
"illness": "ill",
"crepte": "crept",
"adhesives": "adhesive",
"regent's": "regent",
"decorations": "decor",
"rabbies": "rabbi",
"overseas": "oversea",
"travellers": "travel",
"casings": "case",
"smugness": "smug",
"doves": "dove",
"nationals": "nation",
"mustange": "mustang",
"ringe": "ring",
"gondoliere": "gondolier",
"vacationing": "vacate",
"reminders": "remind",
"baldness": "bald",
"settings": "set",
"glaced": "glace",
"coniferous": "conifer",
"revelations": "revel",
"personals": "person",
"daughter's": "daughter",
"badness": "bad",
"projections": "project",
"polarizing": "polar",
"vandalizers": "vandal",
"minerals": "miner",
"protesters": "protest",
"controllers": "control",
"weddings": "wed",
"sometimes": "sometime",
"earing": "ear",
}
class OKVQAStemmer:
    """Token-level stemmer that follows the OKVQA v1.1 procedure."""

    def __init__(self):
        # Kept on the instance so one lemmatizer is reused across stem() calls.
        self._wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

    def _stem_token(self, token, tag):
        """Return the stem for one token given its POS tag.

        Rules are tried in order and the first one that applies wins:
        manual override, verb lemmatization for "-ing" tokens, then
        singularization for tokens tagged NNS/NNPS. Any other token is
        returned unchanged.
        """
        if token in _MANUAL_MATCHES:
            return _MANUAL_MATCHES[token]
        if token.endswith("ing"):
            return self._wordnet_lemmatizer.lemmatize(token, VERB)
        if tag.startswith(("NNS", "NNPS")):
            return inflection.singularize(token)
        return token

    def stem(self, input_string):
        """Tokenize and POS-tag ``input_string``, stem each token, and
        return the stemmed tokens joined by single spaces."""
        tokens = nltk.tokenize.word_tokenize(input_string)
        tagged_tokens = nltk.pos_tag(tokens)
        return " ".join(
            self._stem_token(token, tag) for token, tag in tagged_tokens
        )
# Module-level stemmer instance, created once when this module is imported and
# reused by postprocess_ok_vqa_generation for every prediction.
stemmer = OKVQAStemmer()
def postprocess_ok_vqa_generation(prediction) -> str:
    """Return ``prediction`` after stemming it with the module-level ``stemmer``."""
    return stemmer.stem(prediction)