enhg-parsing / parse.py
nielklug's picture
update
42dc132
raw
history blame contribute delete
736 Bytes
import functools
import re
import sys

import benepar
from huggingface_hub import hf_hub_download
# Hugging Face repository and checkpoint file holding the parser weights.
_MODEL_REPO_ID = "nielklug/enhg_parser"
_MODEL_FILENAME = 'new-convbert-german-europeana0_dev=83.03.pt'

# Ordered (old, new) substitutions applied to the stringified tree.  The
# order matters: the final '($(-' rewrite runs on text that has already had
# its '-LRB-' placeholders turned into '\('.
_TREE_REPLACEMENTS = (
    ('-LRB-', '\\('),
    ('-RRB-', '\\)'),
    ('-LSB-', '\\['),
    ('-RSB-', '\\]'),
    ('($(-', '($\\(-'),
)


@functools.lru_cache(maxsize=1)
def _load_parser():
    """Fetch the checkpoint and build the benepar parser once per process.

    The result is memoised so that repeated calls to ``parse`` reuse the same
    parser object instead of resolving the checkpoint and reloading the model
    for every sentence.
    """
    model_path = hf_hub_download(repo_id=_MODEL_REPO_ID, filename=_MODEL_FILENAME)
    return benepar.Parser(model_path)


def parse(words) -> str:
    """Parse a tokenised sentence and return its tree as a one-line string.

    Args:
        words: Iterable of token strings making up one sentence.

    Returns:
        The bracketed parse tree on a single line.  Bracket placeholders
        (``-LRB-``, ``-RRB-``, ``-LSB-``, ``-RSB-``) appear as
        backslash-escaped brackets, and the space in front of every opening
        parenthesis is removed.
    """
    parser = _load_parser()

    # Literal parentheses inside tokens would clash with the tree's own
    # bracket syntax, so swap them for placeholders before parsing.
    escaped_words = [
        word.replace('(', '-LRB-').replace(')', '-RRB-') for word in words
    ]
    input_sentence = benepar.InputSentence(words=escaped_words)
    tree = str(parser.parse(input_sentence))

    for old, new in _TREE_REPLACEMENTS:
        tree = tree.replace(old, new)

    # Put the whole parse tree on a single line.
    tree = re.sub(r'\s+', ' ', tree.strip())
    # Drop the space that precedes each opening parenthesis.
    tree = re.sub(r' \(', '(', tree)
    return tree