Spaces:
Runtime error
Runtime error
feat: multifile processing
Browse files- app.py +45 -37
- data_viewer.py +9 -6
- process_text.py +66 -0
app.py
CHANGED
@@ -5,11 +5,12 @@ Dashboard for showcasing extraction of text metrics with textdescriptives.
|
|
5 |
|
6 |
from io import StringIO
|
7 |
|
8 |
-
import
|
9 |
import streamlit as st
|
10 |
import textdescriptives as td
|
11 |
|
12 |
from data_viewer import DataViewer
|
|
|
13 |
from options import (
|
14 |
all_model_size_options_pretty_to_short,
|
15 |
available_model_size_options,
|
@@ -28,7 +29,7 @@ with col1:
|
|
28 |
with col2:
|
29 |
st.image(
|
30 |
"https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png",
|
31 |
-
width=125
|
32 |
)
|
33 |
|
34 |
st.write(
|
@@ -46,8 +47,8 @@ st.write(
|
|
46 |
|
47 |
st.caption(
|
48 |
"Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
|
49 |
-
"calculating a large variety of
|
50 |
-
"
|
51 |
)
|
52 |
|
53 |
|
@@ -57,22 +58,25 @@ st.caption(
|
|
57 |
|
58 |
|
59 |
input_choice = st.radio(
|
60 |
-
label="Input", options=["Enter text", "Upload file"], index=0, horizontal=True
|
61 |
)
|
62 |
|
63 |
with st.form(key="settings_form"):
|
64 |
split_by_line = st.checkbox(label="Split by newline", value=True)
|
65 |
|
66 |
-
|
67 |
|
68 |
-
if input_choice == "Upload file":
|
69 |
-
|
70 |
-
label="Choose a .txt file", type=["txt"], accept_multiple_files=
|
71 |
)
|
72 |
|
73 |
-
if
|
74 |
# To convert to a string based IO:
|
75 |
-
|
|
|
|
|
|
|
76 |
|
77 |
else:
|
78 |
default_text = """Hello, morning dew. The grass whispers low.
|
@@ -81,9 +85,11 @@ Good morning, world. The birds sing in delight.
|
|
81 |
Let's spread our wings. The butterflies take flight.
|
82 |
Nature's chorus sings, a symphony of light."""
|
83 |
|
84 |
-
|
85 |
-
|
86 |
-
|
|
|
|
|
87 |
|
88 |
# Row of selectors
|
89 |
col1, col2 = st.columns([1, 1])
|
@@ -132,30 +138,26 @@ Nature's chorus sings, a symphony of light."""
|
|
132 |
#############
|
133 |
|
134 |
|
135 |
-
if apply_settings_button and
|
136 |
if model_size_pretty not in available_model_size_options(lang=language_short):
|
137 |
st.write(
|
138 |
"**Sorry!** The chosen *model size* is not available in this language. Please try another."
|
139 |
)
|
140 |
else:
|
141 |
-
#
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
text=string_data,
|
156 |
-
lang=language_short,
|
157 |
-
spacy_model_size=model_size_short,
|
158 |
-
metrics=metrics,
|
159 |
)
|
160 |
|
161 |
###################
|
@@ -165,13 +167,15 @@ if apply_settings_button and string_data is not None and string_data:
|
|
165 |
# Create 2 columns with 1) the output header
|
166 |
# and 2) a download button
|
167 |
DataViewer()._header_and_download(
|
168 |
-
header="The calculated metrics",
|
|
|
|
|
169 |
)
|
170 |
|
171 |
st.write("**Note**: This data frame has been transposed for readability.")
|
172 |
-
|
173 |
-
|
174 |
-
st.dataframe(data=
|
175 |
|
176 |
|
177 |
############################
|
@@ -182,6 +186,10 @@ if apply_settings_button and string_data is not None and string_data:
|
|
182 |
with st.expander("See python code"):
|
183 |
st.code(
|
184 |
"""
|
|
|
|
|
|
|
|
|
185 |
import textdescriptives as td
|
186 |
|
187 |
# Given a string of text and the settings
|
|
|
5 |
|
6 |
from io import StringIO
|
7 |
|
8 |
+
import pandas as pd
|
9 |
import streamlit as st
|
10 |
import textdescriptives as td
|
11 |
|
12 |
from data_viewer import DataViewer
|
13 |
+
from process_text import text_to_metrics
|
14 |
from options import (
|
15 |
all_model_size_options_pretty_to_short,
|
16 |
available_model_size_options,
|
|
|
29 |
with col2:
|
30 |
st.image(
|
31 |
"https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png",
|
32 |
+
width=125,
|
33 |
)
|
34 |
|
35 |
st.write(
|
|
|
47 |
|
48 |
st.caption(
|
49 |
"Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
|
50 |
+
"calculating a large variety of metrics from text. [Journal of Open Source Software, 8(84), "
|
51 |
+
"5153, https://doi.org/10.21105/joss.05153](https://doi.org/10.21105/joss.05153)"
|
52 |
)
|
53 |
|
54 |
|
|
|
58 |
|
59 |
|
60 |
input_choice = st.radio(
|
61 |
+
label="Input", options=["Enter text", "Upload file(s)"], index=0, horizontal=True
|
62 |
)
|
63 |
|
64 |
with st.form(key="settings_form"):
|
65 |
split_by_line = st.checkbox(label="Split by newline", value=True)
|
66 |
|
67 |
+
file_name_to_text_string = {}
|
68 |
|
69 |
+
if input_choice == "Upload file(s)":
|
70 |
+
uploaded_files = st.file_uploader(
|
71 |
+
label="Choose a .txt file", type=["txt"], accept_multiple_files=True
|
72 |
)
|
73 |
|
74 |
+
if uploaded_files is not None and len(uploaded_files) > 0:
|
75 |
# To convert to a string based IO:
|
76 |
+
file_name_to_text_string = {
|
77 |
+
file.name: StringIO(file.getvalue().decode("utf-8")).read()
|
78 |
+
for file in uploaded_files
|
79 |
+
}
|
80 |
|
81 |
else:
|
82 |
default_text = """Hello, morning dew. The grass whispers low.
|
|
|
85 |
Let's spread our wings. The butterflies take flight.
|
86 |
Nature's chorus sings, a symphony of light."""
|
87 |
|
88 |
+
file_name_to_text_string = {
|
89 |
+
"input": st.text_area(
|
90 |
+
label="Enter text", value=default_text, height=145, max_chars=None
|
91 |
+
)
|
92 |
+
}
|
93 |
|
94 |
# Row of selectors
|
95 |
col1, col2 = st.columns([1, 1])
|
|
|
138 |
#############
|
139 |
|
140 |
|
141 |
+
if apply_settings_button and len(file_name_to_text_string) > 0:
|
142 |
if model_size_pretty not in available_model_size_options(lang=language_short):
|
143 |
st.write(
|
144 |
"**Sorry!** The chosen *model size* is not available in this language. Please try another."
|
145 |
)
|
146 |
else:
|
147 |
+
# Extract metrics for each text
|
148 |
+
output_df = pd.concat(
|
149 |
+
[
|
150 |
+
text_to_metrics(
|
151 |
+
string=string,
|
152 |
+
language_short=language_short,
|
153 |
+
model_size_short=model_size_short,
|
154 |
+
metrics=metrics,
|
155 |
+
split_by_line=split_by_line,
|
156 |
+
filename=filename if "Upload" in input_choice else None,
|
157 |
+
)
|
158 |
+
for filename, string in file_name_to_text_string.items()
|
159 |
+
],
|
160 |
+
ignore_index=True,
|
|
|
|
|
|
|
|
|
161 |
)
|
162 |
|
163 |
###################
|
|
|
167 |
# Create 2 columns with 1) the output header
|
168 |
# and 2) a download button
|
169 |
DataViewer()._header_and_download(
|
170 |
+
header="The calculated metrics",
|
171 |
+
data=output_df,
|
172 |
+
file_name="text_metrics.csv",
|
173 |
)
|
174 |
|
175 |
st.write("**Note**: This data frame has been transposed for readability.")
|
176 |
+
output_df = output_df.transpose().reset_index()
|
177 |
+
output_df.columns = ["Metric"] + [str(c) for c in list(output_df.columns)[1:]]
|
178 |
+
st.dataframe(data=output_df, use_container_width=True)
|
179 |
|
180 |
|
181 |
############################
|
|
|
186 |
with st.expander("See python code"):
|
187 |
st.code(
|
188 |
"""
|
189 |
+
# Note: This is the code for a single text file
|
190 |
+
# The actual code is slightly more complex
|
191 |
+
# to allow processing multiple files at once
|
192 |
+
|
193 |
import textdescriptives as td
|
194 |
|
195 |
# Given a string of text and the settings
|
data_viewer.py
CHANGED
@@ -1,14 +1,17 @@
|
|
|
|
|
|
|
|
1 |
|
2 |
import streamlit as st
|
3 |
|
4 |
|
5 |
class DataViewer:
|
6 |
-
|
7 |
-
# @st.cache_data
|
8 |
def _convert_df_to_csv(self, data, **kwargs):
|
9 |
-
return data.to_csv(**kwargs).encode(
|
10 |
|
11 |
-
def _header_and_download(
|
|
|
|
|
12 |
col1, col2 = st.columns([9, 2])
|
13 |
with col1:
|
14 |
st.subheader(header)
|
@@ -16,8 +19,8 @@ class DataViewer:
|
|
16 |
st.write("")
|
17 |
st.download_button(
|
18 |
label=label,
|
19 |
-
data=self._convert_df_to_csv(data),
|
20 |
file_name=file_name,
|
21 |
key=key,
|
22 |
-
help=help
|
23 |
)
|
|
|
1 |
+
"""
|
2 |
+
Class for showing header and download button in the same row.
|
3 |
+
"""
|
4 |
|
5 |
import streamlit as st
|
6 |
|
7 |
|
8 |
class DataViewer:
|
|
|
|
|
9 |
def _convert_df_to_csv(self, data, **kwargs):
|
10 |
+
return data.to_csv(**kwargs).encode("utf-8")
|
11 |
|
12 |
+
def _header_and_download(
|
13 |
+
self, header, data, file_name, key=None, label="Download", help="Download data"
|
14 |
+
):
|
15 |
col1, col2 = st.columns([9, 2])
|
16 |
with col1:
|
17 |
st.subheader(header)
|
|
|
19 |
st.write("")
|
20 |
st.download_button(
|
21 |
label=label,
|
22 |
+
data=self._convert_df_to_csv(data, index=False),
|
23 |
file_name=file_name,
|
24 |
key=key,
|
25 |
+
help=help,
|
26 |
)
|
process_text.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
The text processing functionality.
|
3 |
+
"""
|
4 |
+
|
5 |
+
from typing import List, Optional
|
6 |
+
import streamlit as st
|
7 |
+
import pandas as pd
|
8 |
+
import textdescriptives as td
|
9 |
+
|
10 |
+
|
11 |
+
@st.cache_data
def text_to_metrics(
    string: str,
    language_short: str,
    model_size_short: str,
    metrics: List[str],
    split_by_line: bool,
    filename: Optional[str],
) -> pd.DataFrame:
    """
    Extract text metrics from a text with `textdescriptives`.

    Parameters
    ----------
    string : str
        The raw text. Leading and trailing whitespace is stripped
        before processing.
    language_short : str
        Passed to `td.extract_metrics` as `lang`.
    model_size_short : str
        Passed to `td.extract_metrics` as `spacy_model_size`.
    metrics : list of str
        Passed to `td.extract_metrics` as `metrics`.
    split_by_line : bool
        Whether to process each non-blank line of `string` as its own text.
        When `False`, the whole (stripped) string is processed as one text.
    filename : str or None
        When not `None`, the value is added to every row in a "File"
        column, which is placed first.

    Returns
    -------
    `pandas.DataFrame`
        The data frame returned by `td.extract_metrics`, optionally with
        the "File" column prepended.
    """
    # Clean and (optionally) split the text
    string = string.strip()
    if split_by_line:
        # `splitlines` also handles "\r\n" and "\r" line endings, so lines
        # from files written on Windows do not keep a trailing "\r"
        strings = string.splitlines()
    else:
        strings = [string]

    # Remove empty and whitespace-only strings
    # E.g. due to consecutive newlines
    strings = [s for s in strings if s.strip()]

    # Will automatically download the relevant model and extract all metrics
    # TODO: Download beforehand to speed up inference
    df = td.extract_metrics(
        text=strings,
        lang=language_short,
        spacy_model_size=model_size_short,
        metrics=metrics,
    )

    # Add filename
    if filename is not None:
        df["File"] = filename
        move_column_inplace(df=df, col="File", pos=0)

    return df
|
46 |
+
|
47 |
+
|
48 |
+
def move_column_inplace(df: pd.DataFrame, col: str, pos: int) -> None:
    """
    Move a column to a given column-index position.

    Taken from the `utipy` package.

    The data frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : `pandas.DataFrame`.
    col : str
        Name of column to move.
    pos : int
        Column index to move `col` to.
        Must satisfy `0 <= pos < len(df.columns)`.

    Raises
    ------
    ValueError
        If `pos` is outside the valid range. The check happens before
        the column is removed, so `df` is left untouched in that case.
    """
    # Validate with an explicit exception: an `assert` is skipped when
    # Python runs with `-O`, which would let the column be popped and lost
    # before `insert` fails on the bad position
    if not 0 <= pos < len(df.columns):
        raise ValueError(
            f"`pos` must be between 0 (incl.) and the number of columns -1. Was {pos}."
        )

    # Keep the popped Series under its own name instead of rebinding `col`
    column = df.pop(col)
    df.insert(pos, column.name, column)
|