# Scraped page header (not part of the module): Spaces: Sleeping
# Buckwalter transliteration -> Arabic script, one ASCII character per
# Unicode code point.  'O', 'W' and 'I' are alternate keys that map to the
# same code points as '>', '&' and '<' respectively.
BUCKWALTER_MAP = {
    # Hamza and alef variants
    '\'': '\u0621',  # hamza
    '|': '\u0622',  # alef with madda above
    '>': '\u0623',  # alef with hamza above
    'O': '\u0623',  # alef with hamza above (alternate key)
    '&': '\u0624',  # waw with hamza above
    'W': '\u0624',  # waw with hamza above (alternate key)
    '<': '\u0625',  # alef with hamza below
    'I': '\u0625',  # alef with hamza below (alternate key)
    '}': '\u0626',  # yeh with hamza above
    # Letters
    'A': '\u0627',  # alef
    'b': '\u0628',  # beh
    'p': '\u0629',  # teh marbuta
    't': '\u062A',  # teh
    'v': '\u062B',  # theh
    'j': '\u062C',  # jeem
    'H': '\u062D',  # hah
    'x': '\u062E',  # khah
    'd': '\u062F',  # dal
    '*': '\u0630',  # thal
    'r': '\u0631',  # reh
    'z': '\u0632',  # zain
    's': '\u0633',  # seen
    '$': '\u0634',  # sheen
    'S': '\u0635',  # sad
    'D': '\u0636',  # dad
    'T': '\u0637',  # tah
    'Z': '\u0638',  # zah
    'E': '\u0639',  # ain
    'g': '\u063A',  # ghain
    '_': '\u0640',  # tatweel
    'f': '\u0641',  # feh
    'q': '\u0642',  # qaf
    'k': '\u0643',  # kaf
    'l': '\u0644',  # lam
    'm': '\u0645',  # meem
    'n': '\u0646',  # noon
    'h': '\u0647',  # heh
    'w': '\u0648',  # waw
    'Y': '\u0649',  # alef maksura
    'y': '\u064A',  # yeh
    # Diacritics
    'F': '\u064B',  # fathatan
    'N': '\u064C',  # dammatan
    'K': '\u064D',  # kasratan
    'a': '\u064E',  # fatha
    'u': '\u064F',  # damma
    'i': '\u0650',  # kasra
    '~': '\u0651',  # shadda
    'o': '\u0652',  # sukun
    '`': '\u0670',  # superscript alef
    '{': '\u0671',  # alef wasla
}
# Escape sequences and the literal punctuation each one stands for.
# Keys are multi-character strings, so a lookup only matches when the
# looked-up string is exactly one of these sequences.
BUCKWALTER_UNESCAPE = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
    '-PLUS-': "+",
    '-MINUS-': "-",
}
# Punctuation characters collected as a set of single characters.
# NOTE(review): not referenced by arabic() or hebrew() in this module;
# presumably kept for callers elsewhere -- confirm before removing.
BUCKWALTER_UNCHANGED = set('.?!,"%-/:;=')
# Hebrew transliteration -> Hebrew script, one ASCII character per output
# character.  Only non-final letter forms appear here; HEBREW_SUFFIX_MAP
# holds the final forms.
HEBREW_MAP = {
    # Letters
    'A': '\u05d0',  # alef
    'B': '\u05d1',  # bet
    'G': '\u05d2',  # gimel
    'D': '\u05d3',  # dalet
    'H': '\u05d4',  # he
    'W': '\u05d5',  # vav
    'Z': '\u05d6',  # zayin
    'X': '\u05d7',  # het
    'J': '\u05d8',  # tet
    'I': '\u05d9',  # yod
    'K': '\u05db',  # kaf
    'L': '\u05dc',  # lamed
    'M': '\u05de',  # mem
    'N': '\u05e0',  # nun
    'S': '\u05e1',  # samekh
    'E': '\u05e2',  # ayin
    'P': '\u05e4',  # pe
    'C': '\u05e6',  # tsadi
    'Q': '\u05e7',  # qof
    'R': '\u05e8',  # resh
    'F': '\u05e9',  # shin
    'T': '\u05ea',  # tav
    # Digits map to themselves
    '0': '0',
    '1': '1',
    '2': '2',
    '3': '3',
    '4': '4',
    '5': '5',
    '6': '6',
    '7': '7',
    '8': '8',
    '9': '9',
    # Punctuation: 'U' and 'O' stand for a double quote and a percent sign
    'U': '"',
    'O': '%',
    '.': '.',
    ',': ',',
}
# Regular letter form -> final form of the same letter.
HEBREW_SUFFIX_MAP = {
    '\u05db': '\u05da',  # kaf -> final kaf
    '\u05de': '\u05dd',  # mem -> final mem
    '\u05e0': '\u05df',  # nun -> final nun
    '\u05e4': '\u05e3',  # pe -> final pe
    '\u05e6': '\u05e5',  # tsadi -> final tsadi
}
# "yy"-prefixed escape sequences and the literal punctuation each one
# stands for.  Keys are multi-character strings, so a lookup only matches
# when the looked-up string is exactly one of these sequences.
HEBREW_UNESCAPE = {
    "yyCLN": ":",
    "yyCM": ",",
    "yyDASH": "-",
    "yyDOT": ".",
    "yyELPS": "...",
    "yyEXCL": "!",
    "yyLRB": "(",
    "yyQM": "?",
    "yyRRB": ")",
    "yySCLN": ";",
}
def arabic(inp):
    """
    Undo Buckwalter transliteration

    See: http://languagelog.ldc.upenn.edu/myl/ldc/morph/buckwalter.html

    This code inspired by:
    https://github.com/dlwh/epic/blob/master/src/main/scala/epic/util/ArabicNormalization.scala

    Args:
        inp: A transliterated string. If it is exactly one of the keys of
            BUCKWALTER_UNESCAPE it is first replaced by the corresponding
            punctuation.

    Returns:
        The string with every character that is a key of BUCKWALTER_MAP
        replaced by its mapped value; all other characters are kept as-is.
    """
    # NOTE(review): the unescaped text still goes through BUCKWALTER_MAP, so
    # "-LCB-" -> "{" -> U+0671 and "-RCB-" -> "}" -> U+0626 rather than
    # literal braces.  Confirm whether that is intended.
    return "".join(
        BUCKWALTER_MAP.get(char, char)
        for char in BUCKWALTER_UNESCAPE.get(inp, inp))
def hebrew(inp):
    """
    Undo Hebrew transliteration

    See: http://www.phil.uu.nl/ozsl/articles/simaan02.pdf

    This code inspired by:
    https://github.com/habeanf/yap/blob/b57502364b73ef78f3510eb890319ae268eeacca/nlp/parser/xliter8/types.go

    Args:
        inp: A transliterated string. If it is exactly one of the keys of
            HEBREW_UNESCAPE it is first replaced by the corresponding
            punctuation.

    Returns:
        The string with every character that is a key of HEBREW_MAP
        replaced by its mapped value (other characters are kept as-is),
        and with the last character swapped for its HEBREW_SUFFIX_MAP
        value when it has one. An empty input yields an empty string.
    """
    out = "".join(
        HEBREW_MAP.get(char, char)
        for char in HEBREW_UNESCAPE.get(inp, inp))
    # Only the last character is eligible for a final-form substitution;
    # the `out` check guards the [-1] index against an empty result.
    if out and (out[-1] in HEBREW_SUFFIX_MAP):
        out = out[:-1] + HEBREW_SUFFIX_MAP[out[-1]]
    return out
# Lookup table from a language name to the function that undoes that
# language's transliteration.
TRANSLITERATIONS = {
    'arabic': arabic,
    'hebrew': hebrew,
}