Spaces:

HLasse
/

textdescriptives

Runtime error

App Files Files Community

textdescriptives / app.py

HLasse

feat: multifile processing

a177196 over 1 year ago

raw

history blame

6.46 kB

	"""
	Dashboard for showcasing extraction of text metrics with textdescriptives.

	"""

	from io import StringIO

	import pandas as pd
	import streamlit as st
	import textdescriptives as td

	from data_viewer import DataViewer
	from process_text import text_to_metrics
	from options import (
	all_model_size_options_pretty_to_short,
	available_model_size_options,
	language_options,
	metrics_options,
	)

	################
	# Introduction #
	################


	# Page header: a wide column for the title next to a narrow one for the logo.
	col1, col2 = st.columns([9, 2])
	col1.title("Extract Text Statistics")
	col2.image(
	    "https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png",
	    width=125,
	)

	# Short description of the app; the package version is read from the
	# installed textdescriptives module at runtime.
	intro_text = (
	    "Calculate a large variety of statistics from text via the "
	    "[TextDescriptives](https://github.com/HLasse/TextDescriptives) python package "
	    f"(v/{td.__version__}) and download the results as a .csv file. "
	    "Includes descriptive statistics and metrics related to readability, "
	    "information theory, text coherence and text quality."
	)
	st.write(intro_text)

	# Pointers to the app's source code and issue tracker.
	feedback_text = (
	    "The source code for this application can be found on [GitHub](https://github.com/HLasse/TextDescriptives_app). "
	    "If you have feedback, please open an [issue](https://github.com/HLasse/textdescriptives_app/issues)."
	)
	st.write(feedback_text)

	# Citation for the package, rendered as a caption.
	citation_text = (
	    "Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
	    "calculating a large variety of metrics from text. [Journal of Open Source Software, 8(84), "
	    "5153, https://doi.org/10.21105/joss.05153](https://doi.org/10.21105/joss.05153)"
	)
	st.caption(citation_text)


	############
	# Settings #
	############


	# Input mode selector. It is created before (not inside) the settings form.
	input_choice = st.radio(
	    label="Input", options=["Enter text", "Upload file(s)"], index=0, horizontal=True
	)

	# All remaining settings live in one form and are submitted with "Apply".
	with st.form(key="settings_form"):
	    split_by_line = st.checkbox(label="Split by newline", value=True)

	    # Maps a source name to its raw text. Stays empty when the upload mode
	    # is selected but no file has been provided.
	    file_name_to_text_string = {}

	    if input_choice == "Upload file(s)":
	        uploaded_files = st.file_uploader(
	            label="Choose a .txt file", type=["txt"], accept_multiple_files=True
	        )

	        if uploaded_files:
	            # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also
	            # drops a leading byte order mark, so "\ufeff" does not end up
	            # glued to the first token of the text.
	            file_name_to_text_string = {
	                file.name: file.getvalue().decode("utf-8-sig")
	                for file in uploaded_files
	            }

	    else:
	        default_text = """Hello, morning dew. The grass whispers low.
	I'm here to dance. The gentle breeze does show.
	Good morning, world. The birds sing in delight.
	Let's spread our wings. The butterflies take flight.
	Nature's chorus sings, a symphony of light."""

	        # Typed text is stored under the fixed name "input".
	        file_name_to_text_string = {
	            "input": st.text_area(
	                label="Enter text", value=default_text, height=145, max_chars=None
	            )
	        }

	    # Row of selectors
	    col1, col2 = st.columns([1, 1])

	    with col1:
	        # Selection of language
	        # NOTE(review): index=5 preselects the sixth entry returned by
	        # language_options(); which language that is depends on the ordering
	        # defined in options.py - confirm when that mapping changes.
	        language_choices = language_options()
	        language_pretty = st.selectbox(
	            label="Language",
	            options=list(language_choices),
	            index=5,
	            key="language_selector",
	        )

	        language_short = language_choices[language_pretty]

	    with col2:
	        # Selection of model size. Sizes for all languages are offered here;
	        # availability for the chosen language is checked after submission.
	        model_size_pretty = st.selectbox(
	            label="Model Size",
	            options=available_model_size_options(lang="all"),
	            index=0,
	            key="size_selector",
	        )

	        model_size_short = all_model_size_options_pretty_to_short()[model_size_pretty]

	    # Multiselection of metrics (everything selected by default)
	    metrics = st.multiselect(
	        label="Metrics", options=metrics_options(), default=metrics_options()
	    )

	    st.write(
	        "See the [documentation](https://hlasse.github.io/TextDescriptives/) for "
	        "information on the available metrics."
	    )

	    # This shouldn't happen but better safe than sorry:
	    # an empty selection is passed on as None instead of [].
	    if isinstance(metrics, list) and not metrics:
	        metrics = None

	    apply_settings_button = st.form_submit_button(label="Apply")


	#############
	# Apply NLP #
	#############


	# Only run the extraction once "Apply" was pressed and there is text to process.
	if apply_settings_button and file_name_to_text_string:
	    sizes_for_language = available_model_size_options(lang=language_short)
	    if model_size_pretty in sizes_for_language:
	        # The file name is only passed along when the texts come from uploads.
	        attach_filename = "Upload" in input_choice

	        # One metrics frame per text, stacked into a single frame afterwards.
	        metric_frames = [
	            text_to_metrics(
	                string=text,
	                language_short=language_short,
	                model_size_short=model_size_short,
	                metrics=metrics,
	                split_by_line=split_by_line,
	                filename=name if attach_filename else None,
	            )
	            for name, text in file_name_to_text_string.items()
	        ]
	        output_df = pd.concat(metric_frames, ignore_index=True)

	        ###################
	        # Present Results #
	        ###################

	        # Header next to a download button for the (untransposed) results.
	        DataViewer()._header_and_download(
	            header="The calculated metrics",
	            data=output_df,
	            file_name="text_metrics.csv",
	        )

	        st.write("Note: This data frame has been transposed for readability.")

	        # Metrics become rows; the first column is renamed to "Metric" and the
	        # remaining column labels are converted to strings.
	        output_df = output_df.transpose().reset_index()
	        output_df.columns = ["Metric", *map(str, list(output_df.columns)[1:])]
	        st.dataframe(data=output_df, use_container_width=True)
	    else:
	        st.write(
	            "Sorry! The chosen model size is not available in this language. Please try another."
	        )


	############################
	# Code For Reproducibility #
	############################


	# Collapsible section showing a simplified, single-text version of the
	# processing code. The snippet is a plain string handed to st.code, so its
	# own indentation must be valid Python for users who copy it.
	with st.expander("See python code"):
	    st.code(
	        """
	# Note: This is the code for a single text file
	# The actual code is slightly more complex
	# to allow processing multiple files at once

	import textdescriptives as td

	# Given a string of text and the settings
	text = "..."
	language = "..."
	model_size = "..."
	metrics = [...]
	split_by_newline = True

	# Remove whitespace from both ends of the string
	text = text.strip()

	# When asked, split by newlines
	if split_by_newline:
	    lines = text.split("\\n")
	else:
	    lines = [text]

	# Remove empty lines
	# E.g. due to consecutive newlines
	lines = [l for l in lines if l]

	# Extract metrics for each line
	extracted_metrics = td.extract_metrics(
	    text=lines,
	    lang=language,
	    spacy_model_size=model_size,
	    metrics=metrics
	)

	""",
	        language="python",
	    )