Spaces:
Runtime error
Runtime error
import logging | |
from functools import partial | |
from typing import Optional | |
import pandas as pd | |
import typer | |
from bokeh.plotting import output_file as bokeh_output_file | |
from bokeh.plotting import save | |
from embedding_lenses.dimensionality_reduction import ( | |
get_tsne_embeddings, | |
get_umap_embeddings, | |
) | |
from embedding_lenses.embedding import load_model | |
from perplexity_lenses import REGISTRY_DATASET | |
from perplexity_lenses.data import ( | |
documents_df_to_sentences_df, | |
hub_dataset_to_dataframe, | |
) | |
from perplexity_lenses.engine import ( | |
DIMENSIONALITY_REDUCTION_ALGORITHMS, | |
DOCUMENT_TYPES, | |
EMBEDDING_MODELS, | |
LANGUAGES, | |
PERPLEXITY_MODELS, | |
SEED, | |
generate_plot, | |
) | |
from perplexity_lenses.perplexity import KenlmModel | |
from perplexity_lenses.visualization import draw_histogram | |
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |
app = typer.Typer() | |
def main( | |
dataset: str = typer.Option( | |
"mc4", help="The name of the hub dataset or local csv/tsv file." | |
), | |
dataset_config: Optional[str] = typer.Option( | |
"es", | |
help="The configuration of the hub dataset, if any. Does not apply to local csv/tsv files.", | |
), | |
dataset_split: Optional[str] = typer.Option( | |
"train", help="The dataset split. Does not apply to local csv/tsv files." | |
), | |
text_column: str = typer.Option("text", help="The text field name."), | |
language: str = typer.Option( | |
"es", help=f"The language of the text. Options: {LANGUAGES}" | |
), | |
doc_type: str = typer.Option( | |
"sentence", | |
help=f"Whether to embed at the sentence or document level. Options: {DOCUMENT_TYPES}.", | |
), | |
sample: int = typer.Option(1000, help="Maximum number of examples to use."), | |
perplexity_model: str = typer.Option( | |
"wikipedia", | |
help=f"Dataset on which the perplexity model was trained on. Options: {PERPLEXITY_MODELS}", | |
), | |
dimensionality_reduction: str = typer.Option( | |
DIMENSIONALITY_REDUCTION_ALGORITHMS[0], | |
help=f"Whether to use UMAP or t-SNE for dimensionality reduction. Options: {DIMENSIONALITY_REDUCTION_ALGORITHMS}.", | |
), | |
model_name: str = typer.Option( | |
EMBEDDING_MODELS[0], | |
help=f"The sentence embedding model to use. Options: {EMBEDDING_MODELS}", | |
), | |
output_file: str = typer.Option( | |
"perplexity", help="The name of the output visualization files." | |
), | |
): | |
""" | |
Perplexity Lenses: Visualize text embeddings in 2D using colors to represent perplexity values. | |
""" | |
logger.info("Loading embedding model...") | |
model = load_model(model_name) | |
dimensionality_reduction_function = ( | |
partial(get_umap_embeddings, random_state=SEED) | |
if dimensionality_reduction.lower() == "umap" | |
else partial(get_tsne_embeddings, random_state=SEED) | |
) | |
logger.info("Loading KenLM model...") | |
kenlm_model = KenlmModel.from_pretrained( | |
perplexity_model.lower(), | |
language, | |
lower_case=True, | |
remove_accents=True, | |
normalize_numbers=True, | |
punctuation=1, | |
) | |
logger.info("Loading dataset...") | |
if dataset.endswith(".csv") or dataset.endswith(".tsv"): | |
df = pd.read_csv(dataset, sep="\t" if dataset.endswith(".tsv") else ",") | |
if doc_type.lower() == "sentence": | |
df = documents_df_to_sentences_df(df, text_column, sample, seed=SEED) | |
df["perplexity"] = df[text_column].map(kenlm_model.get_perplexity) | |
else: | |
df = hub_dataset_to_dataframe( | |
dataset, | |
dataset_config, | |
dataset_split, | |
sample, | |
text_column, | |
kenlm_model, | |
seed=SEED, | |
doc_type=doc_type, | |
) | |
# Round perplexity | |
df["perplexity"] = df["perplexity"].round().astype(int) | |
logger.info( | |
f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}" | |
) | |
plot, plot_registry = generate_plot( | |
df, | |
text_column, | |
"perplexity", | |
None, | |
dimensionality_reduction_function, | |
model, | |
seed=SEED, | |
hub_dataset=dataset, | |
) | |
logger.info("Saving plots") | |
bokeh_output_file(f"{output_file}.html") | |
save(plot) | |
if dataset == REGISTRY_DATASET: | |
bokeh_output_file(f"{output_file}_registry.html") | |
save(plot_registry) | |
fig = draw_histogram(df["perplexity"].values) | |
fig.savefig(f"{output_file}_histogram.png") | |
logger.info("Done") | |
if __name__ == "__main__": | |
app() | |