HLasse's picture
feat: multifile processing
a177196
raw
history blame
6.46 kB
"""
Dashboard for showcasing extraction of text metrics with textdescriptives.
"""
from io import StringIO
import pandas as pd
import streamlit as st
import textdescriptives as td
from data_viewer import DataViewer
from process_text import text_to_metrics
from options import (
all_model_size_options_pretty_to_short,
available_model_size_options,
language_options,
metrics_options,
)
################
# Introduction #
################
# Page header: the title goes in a wide column, the package logo in a narrow one.
title_col, logo_col = st.columns([9, 2])
title_col.title("Extract Text Statistics")
logo_col.image(
    "https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png",
    width=125,
)

# Short description of the app; includes the installed TextDescriptives version.
description_text = (
    "Calculate a large variety of statistics from text via the "
    "[**TextDescriptives**](https://github.com/HLasse/TextDescriptives) python package "
    f"(v/{td.__version__}) and download the results as a .csv file. "
    "Includes descriptive statistics and metrics related to readability, "
    "information theory, text coherence and text quality."
)
st.write(description_text)

# Pointers to the source code of the app and to the issue tracker.
source_and_feedback_text = (
    "The source code for this application can be found on [**GitHub**](https://github.com/HLasse/TextDescriptives_app). "
    "If you have feedback, please open an [issue](https://github.com/HLasse/textdescriptives_app/issues)."
)
st.write(source_and_feedback_text)

# Citation for the TextDescriptives paper.
citation_text = (
    "Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
    "calculating a large variety of metrics from text. [Journal of Open Source Software, 8(84), "
    "5153, https://doi.org/10.21105/joss.05153](https://doi.org/10.21105/joss.05153)"
)
st.caption(citation_text)
############
# Settings #
############
# The input mode is chosen outside the form; every other setting is collected
# inside the form and takes effect when the "Apply" button is pressed.
input_choice = st.radio(
    label="Input", options=["Enter text", "Upload file(s)"], index=0, horizontal=True
)

with st.form(key="settings_form"):
    split_by_line = st.checkbox(label="Split by newline", value=True)

    # Maps a name for each input (the uploaded file's name, or "input" for
    # text typed into the text area) to the text that should be processed.
    file_name_to_text_string = {}

    if input_choice == "Upload file(s)":
        uploaded_files = st.file_uploader(
            label="Choose a .txt file", type=["txt"], accept_multiple_files=True
        )

        if uploaded_files:
            # "utf-8-sig" decodes plain UTF-8 and additionally drops a leading
            # byte order mark (written by some editors), so the mark does not
            # end up inside the analysed text. The decoded value is already a
            # string, so no StringIO round trip is needed.
            file_name_to_text_string = {
                file.name: file.getvalue().decode("utf-8-sig")
                for file in uploaded_files
            }
    else:
        default_text = """Hello, morning dew. The grass whispers low.
I'm here to dance. The gentle breeze does show.
Good morning, world. The birds sing in delight.
Let's spread our wings. The butterflies take flight.
Nature's chorus sings, a symphony of light."""

        file_name_to_text_string = {
            "input": st.text_area(
                label="Enter text", value=default_text, height=145, max_chars=None
            )
        }

    # Row of selectors
    col1, col2 = st.columns([1, 1])

    with col1:
        # Selection of language. The pretty-name -> short-code mapping is
        # fetched once and used both for the options and for the lookup.
        # index=5 preselects the sixth entry of that mapping
        # (NOTE(review): presumably the intended default language - confirm
        # against options.language_options).
        languages_pretty_to_short = language_options()
        language_pretty = st.selectbox(
            label="Language",
            options=list(languages_pretty_to_short.keys()),
            index=5,
            key="language_selector",
        )
        language_short = languages_pretty_to_short[language_pretty]

    with col2:
        # Selection of model size. All sizes are offered here; whether the
        # chosen size exists for the chosen language is checked after "Apply".
        model_size_pretty = st.selectbox(
            label="Model Size",
            options=available_model_size_options(lang="all"),
            index=0,
            key="size_selector",
        )
        model_size_short = all_model_size_options_pretty_to_short()[model_size_pretty]

    # Multiselection of metrics (everything is selected by default)
    metrics = st.multiselect(
        label="Metrics", options=metrics_options(), default=metrics_options()
    )

    st.write(
        "See the [**documentation**](https://hlasse.github.io/TextDescriptives/) for "
        "information on the available metrics."
    )

    # This shouldn't happen but better safe than sorry:
    # an empty selection is passed on as None instead of an empty list.
    if isinstance(metrics, list) and not metrics:
        metrics = None

    apply_settings_button = st.form_submit_button(label="Apply")
#############
# Apply NLP #
#############
# Run only after "Apply" was pressed and there is at least one text to process.
if apply_settings_button and file_name_to_text_string:
    if model_size_pretty in available_model_size_options(lang=language_short):
        # The file name is only passed along for uploaded files,
        # not for text typed into the text area.
        from_upload = "Upload" in input_choice

        # Extract metrics for each text and stack the results into one frame
        metric_frames = [
            text_to_metrics(
                string=text,
                language_short=language_short,
                model_size_short=model_size_short,
                metrics=metrics,
                split_by_line=split_by_line,
                filename=name if from_upload else None,
            )
            for name, text in file_name_to_text_string.items()
        ]
        output_df = pd.concat(metric_frames, ignore_index=True)

        ###################
        # Present Results #
        ###################

        # Header next to a download button; the download receives the
        # frame as it is before the transposition below.
        viewer = DataViewer()
        viewer._header_and_download(
            header="The calculated metrics",
            data=output_df,
            file_name="text_metrics.csv",
        )

        st.write("**Note**: This data frame has been transposed for readability.")

        # One row per metric; the first column holds the metric names and the
        # remaining column labels are converted to strings.
        output_df = output_df.T.reset_index()
        remaining_labels = [str(label) for label in output_df.columns[1:]]
        output_df.columns = ["Metric"] + remaining_labels
        st.dataframe(data=output_df, use_container_width=True)
    else:
        st.write(
            "**Sorry!** The chosen *model size* is not available in this language. Please try another."
        )
############################
# Code For Reproducibility #
############################
# Collapsible panel with a simplified, single-text version of the processing
# code. The snippet is only displayed, never executed, so its indentation has
# to be valid Python for users who copy it.
with st.expander("See python code"):
    st.code(
        """
# Note: This is the code for a single text file
# The actual code is slightly more complex
# to allow processing multiple files at once
import textdescriptives as td
# Given a string of text and the settings
text = "..."
language = "..."
model_size = "..."
metrics = [...]
split_by_newline = True
# Remove whitespace from both ends of the string
text = text.strip()
# When asked, split by newlines
if split_by_newline:
    lines = text.split("\\n")
else:
    lines = [text]
# Remove empty lines
# E.g. due to consecutive newlines
lines = [l for l in lines if l]
# Extract metrics for each line
extracted_metrics = td.extract_metrics(
    text=lines,
    lang=language,
    spacy_model_size=model_size,
    metrics=metrics
)
""",
        language="python",
    )