# Spaces:
# Runtime error
# Runtime error
"""
Dashboard for showcasing extraction of text metrics with textdescriptives.
"""
from io import StringIO | |
import pandas as pd | |
import streamlit as st | |
import textdescriptives as td | |
from data_viewer import DataViewer | |
from process_text import text_to_metrics | |
from options import ( | |
all_model_size_options_pretty_to_short, | |
available_model_size_options, | |
language_options, | |
metrics_options, | |
) | |
################
# Introduction #
################

# Page header: a wide column for the title next to a narrow one for the logo.
title_column, logo_column = st.columns([9, 2])
title_column.title("Extract Text Statistics")
logo_column.image(
    "https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png",
    width=125,
)

# What the app does, including the version of the imported package.
app_summary = (
    "Calculate a large variety of statistics from text via the "
    "[**TextDescriptives**](https://github.com/HLasse/TextDescriptives) python package "
    f"(v/{td.__version__}) and download the results as a .csv file. "
    "Includes descriptive statistics and metrics related to readability, "
    "information theory, text coherence and text quality."
)
st.write(app_summary)

# Where to find the source code and where to send feedback.
source_and_feedback = (
    "The source code for this application can be found on [**GitHub**](https://github.com/HLasse/TextDescriptives_app). "
    "If you have feedback, please open an [issue](https://github.com/HLasse/textdescriptives_app/issues)."
)
st.write(source_and_feedback)

# Citation for the package, rendered as small caption text.
citation = (
    "Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
    "calculating a large variety of metrics from text. [Journal of Open Source Software, 8(84), "
    "5153, https://doi.org/10.21105/joss.05153](https://doi.org/10.21105/joss.05153)"
)
st.caption(citation)
############
# Settings #
############

# Text pre-filled in the input box when the user chooses to enter text.
DEFAULT_TEXT = (
    "Hello, morning dew. The grass whispers low.\n"
    "I'm here to dance. The gentle breeze does show.\n"
    "Good morning, world. The birds sing in delight.\n"
    "Let's spread our wings. The butterflies take flight.\n"
    "Nature's chorus sings, a symphony of light."
)

# The input mode is chosen outside the form; the form below shows either a
# file uploader or a text area depending on this choice.
input_choice = st.radio(
    label="Input", options=["Enter text", "Upload file(s)"], index=0, horizontal=True
)

with st.form(key="settings_form"):
    split_by_line = st.checkbox(label="Split by newline", value=True)

    # Maps a source name (the uploaded file's name, or "input" for typed
    # text) to the text that should be analysed. Stays empty until the user
    # has provided something.
    # NOTE(review): two uploads with the same file name would overwrite each
    # other here - confirm whether that can happen in practice.
    file_name_to_text_string = {}

    if input_choice == "Upload file(s)":
        uploaded_files = st.file_uploader(
            label="Choose a .txt file", type=["txt"], accept_multiple_files=True
        )
        if uploaded_files:
            # "utf-8-sig" reads ordinary UTF-8 and additionally drops a
            # leading byte-order mark, so a BOM written by some editors does
            # not end up as a stray character at the start of the text.
            file_name_to_text_string = {
                uploaded_file.name: uploaded_file.getvalue().decode("utf-8-sig")
                for uploaded_file in uploaded_files
            }
    else:
        file_name_to_text_string = {
            "input": st.text_area(
                label="Enter text", value=DEFAULT_TEXT, height=145, max_chars=None
            )
        }

    # Row of selectors
    language_column, size_column = st.columns([1, 1])

    with language_column:
        # Selection of language. The options map the displayed name to the
        # short code that is passed on to the metric extraction.
        languages = language_options()
        language_pretty = st.selectbox(
            label="Language",
            options=list(languages),
            # NOTE(review): positional default into language_options();
            # confirm it still points at the intended language if the
            # options are ever reordered.
            index=5,
            key="language_selector",
        )
        language_short = languages[language_pretty]

    with size_column:
        # Selection of model size. The sizes for all languages are offered
        # here, so the chosen size is not guaranteed to exist for the chosen
        # language.
        model_size_pretty = st.selectbox(
            label="Model Size",
            options=available_model_size_options(lang="all"),
            index=0,
            key="size_selector",
        )
        model_size_short = all_model_size_options_pretty_to_short()[model_size_pretty]

    # Multiselection of metrics; every metric is selected by default.
    all_metrics = metrics_options()
    metrics = st.multiselect(label="Metrics", options=all_metrics, default=all_metrics)

    st.write(
        "See the [**documentation**](https://hlasse.github.io/TextDescriptives/) for "
        "information on the available metrics."
    )

    # This shouldn't happen but better safe than sorry: an empty selection is
    # passed on as None instead of an empty list.
    # NOTE(review): presumably text_to_metrics treats None as "use the
    # default metrics" - confirm.
    if isinstance(metrics, list) and not metrics:
        metrics = None

    apply_settings_button = st.form_submit_button(label="Apply")
#############
# Apply NLP #
#############

# Sample code shown to the user after a successful run. It is kept at module
# level so that the displayed code starts at column 0.
REPRODUCIBILITY_SNIPPET = """
# Note: This is the code for a single text file
# The actual code is slightly more complex
# to allow processing multiple files at once
import textdescriptives as td
# Given a string of text and the settings
text = "..."
language = "..."
model_size = "..."
metrics = [...]
split_by_newline = True
# Remove whitespace from both ends of the string
text = text.strip()
# When asked, split by newlines
if split_by_newline:
    lines = text.split("\\n")
else:
    lines = [text]
# Remove empty lines
# E.g. due to consecutive newlines
lines = [l for l in lines if l]
# Extract metrics for each line
extracted_metrics = td.extract_metrics(
    text=lines,
    lang=language,
    spacy_model_size=model_size,
    metrics=metrics
)
"""

# Only run after "Apply" was clicked and there is at least one text.
if apply_settings_button and file_name_to_text_string:
    # The size selector offers the sizes of all languages, so the chosen
    # combination has to be validated against the chosen language.
    if model_size_pretty not in available_model_size_options(lang=language_short):
        st.write(
            "**Sorry!** The chosen *model size* is not available in this language. Please try another."
        )
    else:
        # The file name is only passed on for uploaded files; typed text
        # gets None instead.
        include_filename = input_choice == "Upload file(s)"

        # Extract metrics for each text and stack the per-text frames.
        output_df = pd.concat(
            [
                text_to_metrics(
                    string=string,
                    language_short=language_short,
                    model_size_short=model_size_short,
                    metrics=metrics,
                    split_by_line=split_by_line,
                    filename=filename if include_filename else None,
                )
                for filename, string in file_name_to_text_string.items()
            ],
            ignore_index=True,
        )

        ###################
        # Present Results #
        ###################

        # Create 2 columns with 1) the output header
        # and 2) a download button
        DataViewer()._header_and_download(
            header="The calculated metrics",
            data=output_df,
            file_name="text_metrics.csv",
        )

        st.write("**Note**: This data frame has been transposed for readability.")
        # After transposing, the former column labels form the index;
        # reset_index() turns them into the first column, which is named
        # "Metric". The remaining labels are the former row labels and are
        # converted to strings.
        transposed_df = output_df.transpose().reset_index()
        transposed_df.columns = ["Metric", *(str(c) for c in transposed_df.columns[1:])]
        st.dataframe(data=transposed_df, use_container_width=True)

        ############################
        # Code For Reproducibility #
        ############################

        # NOTE(review): the original indentation of this expander could not
        # be recovered; it is placed inside the success branch so that it is
        # shown together with the results - confirm this is intended.
        with st.expander("See python code"):
            st.code(REPRODUCIBILITY_SNIPPET, language="python")
#######
# FAQ #
#######

st.subheader("Frequently Asked Questions (FAQ)")

# Each entry is (question shown on the expander, markdown answer inside it).
faq_entries = (
    (
        "What does the 'Split by newline' option do?",
        """
When the `Split by newline` option is `enabled`, the metrics calculation is
performed separately for each paragraph. I.e. whenever there's a line break,
we split the text.
When this option is `disabled`, the entire text is processed at once.
""",
    ),
    (
        "Why do I get a warning/error message for certain languages or model sizes?",
        """
Some combinations of languages, model sizes, and metrics are not currently supported in the app.
While we *are* working on this, you may currently see a red box
with an error message after clicking `Apply`.
If you need this language and/or model size to work for your project,
please open an [issue](https://github.com/HLasse/textdescriptives_app/issues).
This may cause us to prioritize supporting your use case.
""",
    ),
)

# Render one collapsible section per question, in the order listed above.
for question, answer in faq_entries:
    with st.expander(question):
        st.write(answer)