{
"version": "1.0",
"truncation": {
"direction": "Right",
"max_length": 100,
"strategy": "LongestFirst",
"stride": 0
},
"padding": {
"strategy": {
"Fixed": 100
},
"direction": "Right",
"pad_to_multiple_of": null,
"pad_id": 3,
"pad_type_id": 0,
"pad_token": "[PAD]"
},
"added_tokens": [
{
"id": 0,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
1
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
2
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": null,
"model": {
"type": "WordLevel",
"vocab": {
"[UNK]": 0,
"[CLS]": 1,
"[SEP]": 2,
"[PAD]": 3,
"[MASK]": 4,
"}": 5,
"{": 6,
"\\": 7,
"_": 8,
"^": 9,
"(": 10,
")": 11,
"2": 12,
"1": 13,
"-": 14,
"=": 15,
",": 16,
"+": 17,
"frac": 18,
"i": 19,
"0": 20,
"x": 21,
"n": 22,
".": 23,
"d": 24,
"\\,": 25,
"a": 26,
"mu": 27,
"left": 28,
"right": 29,
"e": 30,
"k": 31,
"c": 32,
"m": 33,
"r": 34,
"p": 35,
"3": 36,
"alpha": 37,
"t": 38,
"partial": 39,
"~": 40,
"l": 41,
"A": 42,
"s": 43,
"&": 44,
"4": 45,
"j": 46,
"\\;": 47,
"g": 48,
"prime": 49,
"]": 50,
"[": 51,
"nu": 52,
"z": 53,
"pi": 54,
"|": 55,
"b": 56,
"phi": 57,
"\\\\": 58,
"mathrm": 59,
"q": 60,
"operatorname": 61,
"cal": 62,
"N": 63,
"delta": 64,
"f": 65,
"lambda": 66,
"beta": 67,
"bar": 68,
"T": 69,
"int": 70,
"array": 71,
"R": 72,
"S": 73,
"D": 74,
"L": 75,
"M": 76,
"B": 77,
"y": 78,
"sigma": 79,
"F": 80,
"theta": 81,
"/": 82,
"gamma": 83,
"h": 84,
"hat": 85,
"psi": 86,
"sqrt": 87,
"sum": 88,
"u": 89,
"H": 90,
"o": 91,
"rho": 92,
"tilde": 93,
"tau": 94,
"C": 95,
"P": 96,
"G": 97,
"V": 98,
"I": 99,
"X": 100,
"omega": 101,
"epsilon": 102,
"E": 103,
"J": 104,
"bf": 105,
"eta": 106,
"v": 107,
"xi": 108,
"Q": 109,
"Phi": 110,
"quad": 111,
"*": 112,
"5": 113,
"\\{": 114,
"vec": 115,
"begin": 116,
"end": 117,
"Gamma": 118,
"K": 119,
"infty": 120,
"\\}": 121,
"6": 122,
"U": 123,
"rangle": 124,
"dot": 125,
"W": 126,
"pm": 127,
"Lambda": 128,
"Z": 129,
"varphi": 130,
"Delta": 131,
"w": 132,
"chi": 133,
";": 134,
"8": 135,
"\\!": 136,
"Omega": 137,
"kappa": 138,
"qquad": 139,
"cdot": 140,
"Psi": 141,
"equiv": 142,
"langle": 143,
"overline": 144,
">": 145,
"<": 146,
"dagger": 147,
"zeta": 148,
"varepsilon": 149,
"cdots": 150,
"rightarrow": 151,
"O": 152,
"nabla": 153,
"Y": 154,
"ldots": 155,
":": 156,
"Sigma": 157,
"ell": 158,
"7": 159,
"mathcal": 160,
"\\:": 161,
"!": 162,
"otimes": 163,
"prod": 164,
"wedge": 165,
"9": 166,
"hspace": 167,
"Pi": 168,
"hbar": 169,
"sim": 170,
"vert": 171,
"in": 172,
"Big": 173,
"widetilde": 174,
"displaystyle": 175,
"times": 176,
"Theta": 177,
"underline": 178,
"mid": 179,
"to": 180,
"dots": 181,
"mathbf": 182,
"ast": 183,
"leq": 184,
"approx": 185,
"star": 186,
"stackrel": 187,
"perp": 188,
"widehat": 189,
"big": 190,
"vartheta": 191,
"'": 192,
"Bigr": 193,
"geq": 194,
"mp": 195,
"Bigl": 196,
"dag": 197,
"neq": 198,
"simeq": 199,
"textstyle": 200,
"circ": 201,
"bigg": 202,
"biggl": 203,
"biggr": 204,
"oint": 205,
"longrightarrow": 206,
"not": 207,
"boldmath": 208,
"bigr": 209,
"ddot": 210,
"bigl": 211,
"oplus": 212,
"put": 213,
"nonumber": 214,
"Xi": 215,
"\\|": 216,
"le": 217,
"check": 218,
"propto": 219,
"triangle": 220,
"hline": 221,
"--": 222,
"varrho": 223,
"vdots": 224,
"ge": 225,
"imath": 226,
"Bigg": 227,
"sp": 228,
"leftrightarrow": 229,
"forall": 230,
"iota": 231,
"scriptscriptstyle": 232,
"bot": 233,
"lbrack": 234,
"line": 235,
"parallel": 236,
"textrm": 237,
"scriptsize": 238,
"it": 239,
"Rightarrow": 240,
"phantom": 241,
"mapsto": 242,
"subset": 243,
"sf": 244,
"jmath": 245,
"binom": 246,
"Biggr": 247,
"Biggl": 248,
"Upsilon": 249,
"tiny": 250,
"overrightarrow": 251,
"wp": 252,
"scriptstyle": 253,
"ne": 254,
"ll": 255,
"kern": 256,
"bullet": 257,
"downarrow": 258,
"gg": 259,
"atop": 260,
"breve": 261,
"uparrow": 262,
"cong": 263,
"vee": 264,
"bigoplus": 265,
"Im": 266,
"small": 267,
"rbrack": 268,
"underbrace": 269,
"makebox": 270,
"sb": 271,
"varpi": 272,
"cap": 273,
"ddots": 274,
"mathsf": 275,
"cup": 276,
"lbrace": 277,
"rbrace": 278,
"slash": 279,
"upsilon": 280,
"\\#": 281,
"Re": 282,
"Longrightarrow": 283,
"vspace": 284,
"acute": 285,
"mit": 286,
"rightharpoonup": 287,
"supset": 288,
"raisebox": 289,
"varsigma": 290,
"Leftrightarrow": 291,
"noalign": 292,
"longleftrightarrow": 293,
"large": 294,
"circle": 295,
"bigtriangleup": 296,
"null": 297,
"Large": 298,
"footnotesize": 299,
"\"": 300,
"raise": 301,
"vphantom": 302,
"leftarrow": 303,
"protect": 304,
"Vert": 305,
"llap": 306,
"buildrel": 307,
"Longleftrightarrow": 308,
"`": 309,
"enspace": 310,
"overleftarrow": 311,
"sl": 312,
"diamond": 313,
"hfill": 314,
"rfloor": 315,
"ule": 316,
"bigotimes": 317,
"doteq": 318,
"tt": 319,
"cdotp": 320,
"textbf": 321,
"unitlength": 322,
"emptyset": 323,
"mm": 324,
"---": 325,
"cm": 326,
"mathop": 327,
"fbox": 328,
"ref": 329,
"aleph": 330,
"backslash": 331,
"\\-": 332,
"label": 333,
"sharp": 334,
"longmapsto": 335,
"overbrace": 336,
"relax": 337,
"subseteq": 338,
"textup": 339,
"mathit": 340,
"flat": 341,
"vskip": 342,
"bigcup": 343,
"Object": 344,
"ni": 345,
"object": 346,
"odot": 347,
"setlength": 348,
"\\/": 349,
"colon": 350,
"strut": 351,
"thinspace": 352,
"bigwedge": 353,
"lfloor": 354,
"smallskip": 355,
"pounds": 356,
"ominus": 357,
"land": 358,
"longleftarrow": 359,
"bmod": 360,
"\\*": 361,
"bigtriangledown": 362,
"medskip": 363,
"multicolumn": 364,
"arraystretch": 365,
"enskip": 366,
"framebox": 367,
"hookrightarrow": 368,
"hrule": 369,
"parbox": 370,
"vline": 371,
"vrule": 372,
"?": 373,
"renewcommand": 374,
"setminus": 375,
"pt": 376,
"bigcap": 377,
"hfil": 378,
"lower": 379,
"natural": 380,
"rlap": 381,
"diamondsuit": 382,
"space": 383,
"textit": 384,
"vector": 385,
"ddagger": 386,
"pmod": 387,
"texttt": 388,
"thicklines": 389,
"top": 390,
"LARGE": 391,
"sc": 392,
"smash": 393,
"triangleright": 394,
"Downarrow": 395,
"\\&": 396,
"bigcirc": 397,
"bigm": 398,
"exists": 399,
"searrow": 400,
"surd": 401,
"vdash": 402,
"arraycolsep": 403,
"hphantom": 404,
"normalsize": 405,
"oval": 406,
"special": 407,
"sqcup": 408,
"textnormal": 409,
"14": 410,
"Huge": 411,
"\\[": 412,
"\\]": 413,
"cite": 414,
"lefteqn": 415,
"mathbin": 416,
"mathrel": 417,
"mkern": 418,
"AA": 419,
"Biggm": 420,
"\\'": 421,
"footnote": 422,
"itshape": 423,
"lceil": 424,
"multiput": 425,
"sqcap": 426,
"supseteq": 427,
"textsf": 428,
"unboldmath": 429,
"16": 430,
"@": 431,
"Bigm": 432,
"Longleftarrow": 433,
"\\(": 434,
"\\)": 435,
"ae": 436,
"amalg": 437,
"asymp": 438,
"crcr": 439,
"do": 440,
"ensuremath": 441,
"hskip": 442,
"linethickness": 443,
"mathclose": 444,
"mathopen": 445,
"nulldelimiterspace": 446,
"ooalign": 447,
"prec": 448,
"qbezier": 449,
"ss": 450,
"triangleleft": 451,
"bigskip": 452,
"bigsqcup": 453,
"ddag": 454,
"fboxsep": 455,
"grave": 456,
"lgroup": 457,
"mathord": 458,
"mathtt": 459,
"nearrow": 460,
"notin": 461,
"oslash": 462,
"preceq": 463,
"protectu": 464,
"rgroup": 465,
"rightleftharpoons": 466,
"setcounter": 467,
"skew": 468,
"smallint": 469,
"smile": 470,
"succ": 471,
"succeq": 472,
"swarrow": 473,
"vcenter": 474,
"vss": 475,
"SS": 476,
"arrowvert": 477,
"atopwithdelims": 478,
"cline": 479,
"em": 480,
"footnotemark": 481,
"hss": 482,
"lq": 483,
"mathnormal": 484,
"mathstrut": 485,
"mathversion": 486,
"mskip": 487,
"nolinebreak": 488,
"ointop": 489,
"rightarrowfill": 490,
"symbol": 491,
"tabcolsep": 492,
"verb": 493,
"#": 494,
"10": 495,
"20": 496,
"23": 497,
"25": 498,
"\\\"": 499,
"\\^": 500,
"biggm": 501,
"bigvee": 502,
"brace": 503,
"brack": 504,
"coprod": 505,
"def": 506,
"dotfill": 507,
"emph": 508,
"everymath": 509,
"expandafter": 510,
"fill": 511,
"huge": 512,
"leavevmode": 513,
"mathaccent": 514,
"newcommand": 515,
"of": 516,
"overwithdelims": 517,
"protectE": 518,
"protectZ": 519,
"protecte": 520,
"protectm": 521,
"rceil": 522,
"romannumeral": 523,
"root": 524,
"scshape": 525,
"textcircled": 526,
"uppercase": 527
},
"unk_token": "[UNK]"
}
}