File size: 1,588 Bytes
a177196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""
The text processing functionality.
"""

from typing import List, Optional
import streamlit as st
import pandas as pd
import textdescriptives as td


@st.cache_data
def text_to_metrics(
    string: str,
    language_short: str,
    model_size_short: str,
    metrics: List[str],
    split_by_line: bool,
    filename: Optional[str],
) -> pd.DataFrame:
    """
    Extract the requested metrics from a text.

    The call is wrapped in `st.cache_data`, so Streamlit caches the result.

    Parameters
    ----------
    string : str
        The text to process. Leading and trailing whitespace is removed.
    language_short : str
        Forwarded to `textdescriptives.extract_metrics` as `lang`.
    model_size_short : str
        Forwarded to `textdescriptives.extract_metrics` as
        `spacy_model_size`.
    metrics : list of str
        Forwarded to `textdescriptives.extract_metrics` as `metrics`.
    split_by_line : bool
        Whether to treat each line of `string` as a separate text.
        Empty and whitespace-only lines are skipped.
    filename : str or None
        When not `None`, a "File" column holding this value is added
        as the first column of the output.

    Returns
    -------
    `pandas.DataFrame`
        The data frame returned by `textdescriptives.extract_metrics`,
        optionally with the "File" column in front.
    """
    # Clean and (optionally) split the text
    string = string.strip()
    pieces = string.split("\n") if split_by_line else [string]

    # Strip every piece *before* dropping the empty ones. Otherwise
    # whitespace-only lines (e.g. the "\r" left behind on blank lines of a
    # file with Windows line endings) pass the emptiness check and are
    # processed as texts of their own.
    strings = [s for s in (piece.strip() for piece in pieces) if s]

    # Will automatically download the relevant model and extract all metrics
    # TODO: Download beforehand to speed up inference
    df = td.extract_metrics(
        text=strings,
        lang=language_short,
        spacy_model_size=model_size_short,
        metrics=metrics,
    )

    # Add filename as the first column
    if filename is not None:
        df["File"] = filename
        move_column_inplace(df=df, col="File", pos=0)

    return df


def move_column_inplace(df: pd.DataFrame, col: str, pos: int) -> None:
    """
    Relocate one column of a data frame to a new column index.

    The data frame is modified in place; nothing is returned.

    Taken from the `utipy` package.

    Parameters
    ----------
    df : `pandas.DataFrame`.
        Data frame whose column is moved.
    col : str
        Name of the column to move.
    pos : int
        Column index that `col` should end up at. Must be at least 0
        and smaller than the current number of columns.
    """
    n_columns = len(df.columns)
    assert (
        pos >= 0 and pos < n_columns
    ), f"`pos` must be between 0 (incl.) and the number of columns -1. Was {pos}."

    # Take the column out of the frame, then put it back at the new index.
    moved = df.pop(col)
    df.insert(loc=pos, column=moved.name, value=moved)