"""Evaluate the nordic-lid fastText models and write per-language scores.

For each model the script prints fastText's own test metrics and a
scikit-learn classification report for the train/validation/test splits,
then appends a Markdown table of test-set scores to ``scores.md``.
"""
import io
from collections import defaultdict
from pathlib import Path

import fasttext
import numpy as np
import pandas as pd
import requests
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm

# Registers ``progress_apply`` on pandas objects.
tqdm.pandas()

# NOTE: DataFrame.to_markdown needs the optional dependency:
#!pip install tabulate

ISO_639_3_URL = (
    "https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab"
)
TATOEBA_STATS_URL = "https://tatoeba.org/en/stats/sentences_by_language"
PREDICTION_COLUMN = "nordic-lid"
UNKNOWN_LABEL = "---"
SPLITS = ("train", "validation", "test")

# Language code -> language name, taken from the ISO 639-3 table.
_iso_response = requests.get(ISO_639_3_URL, timeout=60)
# Fail loudly instead of parsing an HTTP error page as a TSV table.
_iso_response.raise_for_status()
names = (
    pd.read_csv(io.StringIO(_iso_response.text), sep="\t")
    .set_index("Id")
    .rename(columns={"Ref_Name": "name"})[["name"]]
    .to_dict()["name"]
)

# Code -> name pairs from the first table on the Tatoeba statistics page;
# the code is read from the column pandas labels "Unnamed: 2".
tato_names = (
    pd.read_html(TATOEBA_STATS_URL)[0]
    .rename(columns={"Unnamed: 2": "code", "Language": "name"})
    .set_index("code")[["name"]]
    .to_dict()["name"]
)
# Tatoeba entries override ISO entries that share a code.
names.update(tato_names)


def pandas_classification_report(y_true, y_pred, labels=None):
    """Return a classification report as a DataFrame.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels.
        labels: Labels to report, in row order. When ``None``, the sorted
            union of the labels found in ``y_true`` and ``y_pred`` is used.

    Returns:
        A DataFrame with one row per label followed by ``accuracy``,
        ``weighted avg`` and ``macro avg`` rows, and the columns
        ``precision``, ``recall``, ``f1-score`` and ``support``. The
        accuracy row holds NaN for precision and recall and the accuracy
        value under ``f1-score``. All three summary rows carry the summed
        support of the reported labels.
    """
    if labels is None:
        # Without explicit labels the columns would be named 0..n-1; use
        # the sorted union so the rows are named after the classes.
        labels = sorted(set(y_true) | set(y_pred))

    per_label = precision_recall_fscore_support(
        y_true=y_true, y_pred=y_pred, labels=labels)
    weighted_avg = list(precision_recall_fscore_support(
        y_true=y_true, y_pred=y_pred, labels=labels, average='weighted'))
    macro_avg = list(precision_recall_fscore_support(
        y_true=y_true, y_pred=y_pred, labels=labels, average='macro'))
    accuracy = [
        np.nan, np.nan, accuracy_score(y_true=y_true, y_pred=y_pred), np.nan]

    metric_names = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(per_label), index=metric_names, columns=labels)

    # The averaged calls return no support, so fill in the total.
    total = class_report_df.loc['support'].sum()
    weighted_avg[-1] = total
    macro_avg[-1] = total
    accuracy[-1] = total

    class_report_df['accuracy'] = accuracy
    class_report_df['weighted avg'] = weighted_avg
    class_report_df['macro avg'] = macro_avg

    report = class_report_df.T
    report["support"] = report["support"].astype(int)
    return report


def _evaluate_splits(model, suffix=""):
    """Print metrics for every split and return (languages, test frame).

    Reads ``<split><suffix>.csv`` for predictions and hands
    ``<split><suffix>.txt`` to ``model.test``. The languages of the train
    split define the known labels; any other prediction is reported as
    ``UNKNOWN_LABEL``.
    """
    title_tag = suffix.replace("_", " ").upper()  # "" or " ALL"
    langs = None
    label_map = None
    frame = None
    for split in SPLITS:
        frame = pd.read_csv(f"{split}{suffix}.csv")
        if label_map is None:
            # The first split is the train split.
            langs = frame["lang"].unique().tolist()
            label_map = defaultdict(lambda: UNKNOWN_LABEL, zip(langs, langs))
        # The text is passed to predict() with newlines replaced by spaces;
        # the language code is the last three characters of the top label.
        frame[PREDICTION_COLUMN] = frame["text"].progress_apply(
            lambda text: label_map[
                model.predict(text.replace("\n", " "))[0][0][-3:]]
        )
        print(f"{split.upper()}{title_tag}")
        print(model.test(f"{split}{suffix}.txt"))
        print(classification_report(
            frame["lang"], frame[PREDICTION_COLUMN], digits=4))
    return langs, frame


def _language_table(langs):
    """Return a frame of language names indexed by ISO-639-3 code."""
    langs_df = pd.DataFrame({"ISO-639-3": langs}).sort_values("ISO-639-3")
    langs_df["Language"] = langs_df["ISO-639-3"].apply(names.__getitem__)
    return langs_df.set_index("ISO-639-3")


score_sections = []
for model_name in ("nordic-lid.bin", "nordic-lid_all.bin"):
    print(f" ------------ {model_name} ------------ ")
    model = fasttext.load_model(model_name)

    # Every model is evaluated on the base splits; the "_all" model is
    # additionally evaluated on the "_all" splits, which then supply the
    # scores that go into the table.
    langs, test = _evaluate_splits(model)
    if "_all" in model_name:
        langs, test = _evaluate_splits(model, suffix="_all")

    # Gold labels go first (y_true), predictions second (y_pred); the
    # reverse order swaps precision with recall and reports predicted
    # counts as support.
    report_df = pandas_classification_report(
        test["lang"], test[PREDICTION_COLUMN], sorted(langs))
    scores = report_df.join(_language_table(langs))
    scores.columns = [col.title() for col in scores.columns]
    scores.index.name = "ISO-639-3"
    metric_columns = [col for col in scores.columns if col != "Language"]
    scores = scores[["Language"] + metric_columns]
    table = scores.reset_index().to_markdown(index=False, floatfmt='.4f')
    score_sections.append(f"## {model_name}\n\n{table}\n\n")

scores_text = "".join(score_sections)
print()
print(scores_text)
# Language names may contain non-ASCII characters; do not depend on the
# platform's default encoding.
Path("./scores.md").write_text(scores_text, encoding="utf-8")