Spaces:

HLasse
/

textdescriptives

Runtime error

textdescriptives / process_text.py

feat: multifile processing

a177196 over 1 year ago

1.59 kB

	"""
	The text processing functionality.
	"""

	from typing import List, Optional
	import streamlit as st
	import pandas as pd
	import textdescriptives as td


	@st.cache_data
	def text_to_metrics(
	    string: str,
	    language_short: str,
	    model_size_short: str,
	    metrics: List[str],
	    split_by_line: bool,
	    filename: Optional[str],
	) -> pd.DataFrame:
	    """
	    Extract TextDescriptives metrics from a text.

	    The function is decorated with `st.cache_data`.

	    Parameters
	    ----------
	    string : str
	        The raw text to process. Surrounding whitespace is removed.
	    language_short : str
	        Forwarded to `td.extract_metrics` as `lang`.
	    model_size_short : str
	        Forwarded to `td.extract_metrics` as `spacy_model_size`.
	    metrics : list of str
	        Forwarded to `td.extract_metrics` as `metrics`.
	    split_by_line : bool
	        When true, every non-blank line of `string` is processed as
	        its own text; otherwise the whole string is a single text.
	    filename : str or None
	        When not None, a `"File"` column holding this value is added
	        and moved to the first column position.

	    Returns
	    -------
	    pandas.DataFrame
	        The data frame produced by `td.extract_metrics`, optionally
	        with the leading `"File"` column.
	    """
	    # Clean and (optionally) split the text
	    if split_by_line:
	        # Strip each line, not just the whole text: files with Windows
	        # line endings otherwise keep a trailing "\r" on every line, and
	        # whitespace-only lines would survive the empty-string filter.
	        strings = [line.strip() for line in string.split("\n")]
	    else:
	        strings = [string.strip()]

	    # Remove empty strings
	    # E.g. due to consecutive newlines
	    strings = [s for s in strings if s]

	    # Will automatically download the relevant model and extract all metrics
	    # TODO: Download beforehand to speed up inference
	    df = td.extract_metrics(
	        text=strings,
	        lang=language_short,
	        spacy_model_size=model_size_short,
	        metrics=metrics,
	    )

	    # Add filename
	    if filename is not None:
	        df["File"] = filename
	        move_column_inplace(df=df, col="File", pos=0)

	    return df


	def move_column_inplace(df: pd.DataFrame, col: str, pos: int) -> None:
	    """
	    Move a column to a given column-index position.

	    The data frame is modified in place; nothing is returned.

	    Taken from the `utipy` package.

	    Parameters
	    ----------
	    df : `pandas.DataFrame`.
	    col : str
	        Name of column to move.
	    pos : int
	        Column index to move `col` to.

	    Raises
	    ------
	    AssertionError
	        If `pos` is negative or not smaller than the number of columns.
	    """
	    # Raise explicitly instead of using `assert`: assert statements are
	    # removed under `python -O`, and an invalid `pos` would then only fail
	    # inside `insert`, after `pop` has already removed the column from `df`.
	    # The exception type and message are kept as before.
	    if not 0 <= pos < len(df.columns):
	        raise AssertionError(
	            f"`pos` must be between 0 (incl.) and the number of columns -1. Was {pos}."
	        )
	    column = df.pop(col)
	    df.insert(pos, column.name, column)