File size: 570 Bytes
6ed21b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import re
from MHGTagger.rnn_annotate import annotate
from Tagset_Mappings.tag_mapping import map_tags
from parsing.src.parse import run_parse 
from nltk import word_tokenize

def parse_text(text):
    """Tokenize, tag and parse *text* in one pass.

    The text is split with ``tokenize``, the tokens are passed to
    ``annotate`` (which hands back tokens, tags and a third value named
    ``probs`` here -- presumably per-tag probabilities; confirm against
    MHGTagger), the tags are converted with ``map_tags``, and the tokens
    plus converted tags are given to ``run_parse``.

    Returns:
        A 4-tuple ``(tokens, tags, probs, parse_tree)`` where ``tokens`` are
        the tokens as returned by ``annotate``, ``tags`` are the mapped tags,
        and ``parse_tree`` is the first element of ``run_parse``'s result.
    """
    words, raw_tags, probs = annotate(tokenize(text))
    mapped_tags = map_tags(raw_tags)
    parses = run_parse(words, mapped_tags)
    # Only the first entry of the parser output is exposed to callers.
    return words, mapped_tags, probs, parses[0]

def tokenize(text: str):
    """Split *text* into tokens after spacing out punctuation.

    Two regex passes rewrite each of the characters ``. , ; : ? ! "`` --
    together with any whitespace before it and the single whitespace
    character after it -- as the character surrounded by one space on each
    side. The first pass accepts any whitespace character after the mark; the
    second pass requires a literal space and therefore also reaches marks that
    only became space-followed because of the first pass (e.g. the first comma
    in ``"a,, b"``). Marks not followed by whitespace are left untouched by
    both passes.

    The padded text is then handed to nltk's ``word_tokenize`` and its result
    is returned unchanged.
    """
    padded = r' \1 '
    # Order matters: the second pattern runs on the output of the first.
    for pattern in (r'\s*([.,;:?!"])\s', r'\s*([.,;:?!"]) '):
        text = re.sub(pattern, padded, text)
    return word_tokenize(text)