textdescriptives / process_text.py
HLasse's picture
feat: multifile processing
a177196
"""
The text processing functionality.
"""
from typing import List, Optional
import streamlit as st
import pandas as pd
import textdescriptives as td
@st.cache_data
def text_to_metrics(
string: str,
language_short: str,
model_size_short: str,
metrics: List[str],
split_by_line: bool,
filename: Optional[str],
) -> pd.DataFrame:
# Clean and (optionally) split the text
string = string.strip()
if split_by_line:
strings = string.split("\n")
else:
strings = [string]
# Remove empty strings
# E.g. due to consecutive newlines
strings = [s for s in strings if s]
# Will automatically download the relevant model and extract all metrics
# TODO: Download beforehand to speed up inference
df = td.extract_metrics(
text=strings,
lang=language_short,
spacy_model_size=model_size_short,
metrics=metrics,
)
# Add filename
if filename is not None:
df["File"] = filename
move_column_inplace(df=df, col="File", pos=0)
return df
def move_column_inplace(df: pd.DataFrame, col: str, pos: int) -> None:
"""
Move a column to a given column-index position.
Taken from the `utipy` package.
Parameters
----------
df : `pandas.DataFrame`.
col : str
Name of column to move.
pos : int
Column index to move `col` to.
"""
assert (
0 <= pos < len(df.columns)
), f"`pos` must be between 0 (incl.) and the number of columns -1. Was {pos}."
col = df.pop(col)
df.insert(pos, col.name, col)