Spaces:
Sleeping
Sleeping
import streamlit as st | |
from parse import parse | |
from nltk import Tree | |
import pandas as pd | |
import re | |
from nltk.tree.prettyprinter import TreePrettyPrinter | |
from annotate import tag_text | |
# Page header and input widget. The text area's label doubles as the
# introductory blurb; its return value is the text the user typed
# (empty until something is entered).
st.title("ENHG parsing system (demo)")
text = st.text_area("""This is a simple demo of an Early New High German (ENHG) tagging and parsing system based on BERT language models.\n\n
Enter some ENHG text below!""")
# Sample sentences the user can copy into the input box.
st.text("""Example ENHG sentences:
1. Im anfang war das Wort / Vnd das Wort war bey Gott / vnd Gott war das Wort.
2. Darinn ain treffenliche statt, genannt Famagosta, in wölicher stat ain edler purger altz herkommens was geseßsen.""")
def process_text(text: str) -> str:
    """Split raw input text into one token per line.

    Steps, applied in order:

    1. An opening ``"`` or ``(`` is separated from a following
       non-space character.
    2. Each of ``. , ; : ? ! ) "`` is separated from a preceding
       non-space character.
    3. Trailing spaces on every line are replaced by a newline, so each
       input line ends up followed by an additional line break.
    4. Every remaining run of spaces becomes a newline.

    Args:
        text: Raw text as typed by the user; may span several lines.

    Returns:
        The text with tokens separated by newlines.
    """
    # Lookahead/lookbehind are used instead of capturing the neighbouring
    # character: a captured neighbour is consumed by the match, so runs of
    # adjacent punctuation such as `.)`, `?!` or `("` were only split at
    # every other position.
    text = re.sub(r'(["(])(?=\S)', r'\1 ', text)
    text = re.sub(r'(?<=\S)([.,;:?!)"])', r' \1', text)
    # Terminate every line (dropping its trailing spaces) with a newline.
    text = re.sub(r' *$', '\n', text, flags=re.MULTILINE)
    # One token per line.
    text = re.sub(r' +', '\n', text)
    return text
if text:
    # Tokenize the input and tag it; tag_text yields three parallel
    # sequences that are zipped into table rows below.
    tokens, tags, probs = tag_text(process_text(text))
    # create a table to show the tagged results:
    zipped = list(zip(tokens, tags, probs))
    df = pd.DataFrame(zipped, columns=['Token', 'Tag', 'Prob.'])

    parse_tree = parse(tokens)

    # Convert the bracket parse tree into an NLTK Tree.
    # The literal three-character sequences `$\(` and `$\)` are renamed so
    # that they do not contain bracket characters. Backslashes are written
    # as `\\` because `\(`, `\)`, `\_`, `\$` and `\*` are invalid escape
    # sequences in a normal string literal; the runtime values are
    # unchanged.
    # NOTE(review): if the parser emits `$(` without a backslash, these
    # replacements never match — confirm against parse() output.
    mod_tree = str(parse_tree).replace("$\\(", "$LRB").replace("$\\)", "$RRB")
    # Drop every run of "-word" suffixes before building the drawing.
    # NOTE(review): the pattern is applied to the whole string, so it also
    # removes hyphen-prefixed parts of leaf tokens — confirm that is
    # acceptable for the rendered tree.
    t = Tree.fromstring(re.sub(r'(-\w+)+', '', mod_tree))
    tree_svg = TreePrettyPrinter(t).svg(nodecolor='black', leafcolor='black', funccolor='black')

    col1 = st.columns(1)[0]
    col1.header("POS tagging result:")
    col1.table(df)

    col2 = st.columns(1)[0]
    col2.header("Parsing result:")
    # Backslash-escape `_`, `$` and `*` before writing the bracket string.
    col2.write(mod_tree.replace('_', '\\_').replace('$', '\\$').replace('*', '\\*'))
    # Display the graph in the Streamlit app
    # NOTE(review): newer Streamlit releases deprecate `use_column_width`;
    # check the installed version before switching parameters.
    col2.image(tree_svg, use_column_width=True)