mhg-parsing / parse.py
nielklug's picture
init
6ed21b9
raw
history blame contribute delete
570 Bytes
import re
from MHGTagger.rnn_annotate import annotate
from Tagset_Mappings.tag_mapping import map_tags
from parsing.src.parse import run_parse
from nltk import word_tokenize
def parse_text(text):
    """Run the full pipeline on *text*: tokenize, tag, map tags, parse.

    Steps, in order:
      1. ``tokenize(text)`` splits the raw string into tokens.
      2. ``annotate(...)`` receives those tokens and returns three values;
         the first replaces the token list used from here on.
      3. ``map_tags(...)`` converts the tags returned by ``annotate``.
      4. ``run_parse(...)`` is called with the tokens and mapped tags, and
         only the first element of its result is kept.

    Returns:
        A 4-tuple ``(tokens, tags, probs, parse_tree)`` where ``tokens`` and
        ``probs`` come from ``annotate``, ``tags`` is the output of
        ``map_tags``, and ``parse_tree`` is ``run_parse(...)[0]``.
        NOTE(review): ``probs`` is presumably per-tag probabilities from the
        tagger -- confirm against ``MHGTagger.rnn_annotate``.
    """
    raw_tokens = tokenize(text)
    # annotate() hands back its own token sequence; use that one downstream.
    tagged_tokens, tagger_tags, tag_probs = annotate(raw_tokens)
    mapped_tags = map_tags(tagger_tags)
    parse_results = run_parse(tagged_tokens, mapped_tags)
    first_tree = parse_results[0]
    return tagged_tokens, mapped_tags, tag_probs, first_tree
def tokenize(text: str):
    """Pad punctuation with spaces, then split *text* with ``word_tokenize``.

    Any of the characters ``. , ; : ? ! "`` that is followed by whitespace is
    rewritten (together with any whitespace directly before it) as the
    character surrounded by single spaces. The padded string is then passed
    to NLTK's ``word_tokenize`` and its result is returned unchanged.
    """
    spacing_passes = (
        # Pass 1: punctuation followed by any whitespace character.
        r'\s*([.,;:?!"])\s',
        # Pass 2: punctuation followed by a literal space. This catches marks
        # that only became space-followed after pass 1, e.g. the '.' in
        # 'a., b' once the ',' has been padded.
        r'\s*([.,;:?!"]) ',
    )
    padded = text
    for pattern in spacing_passes:
        padded = re.sub(pattern, r' \1 ', padded)
    return word_tokenize(padded)