| | from collections import defaultdict |
| |
|
| | import fasttext |
| | import pandas as pd |
| | from sklearn.metrics import classification_report |
| | from tqdm import tqdm; tqdm.pandas() |
| | |
| | import io |
| | from pathlib import Path |
| | import numpy as np |
| | import pandas as pd |
| | import requests |
| | from sklearn.metrics import accuracy_score |
| | from sklearn.metrics import classification_report |
| | from sklearn.metrics import precision_recall_fscore_support |
| |
|
| |
|
# Build a lookup from ISO 639-3 code to a human-readable language name.
# Base mapping comes from the official SIL code table (tab-separated).
iso_tab_text = requests.get(
    "https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab"
).text
iso_df = pd.read_csv(io.StringIO(iso_tab_text), sep="\t")
names = dict(zip(iso_df["Id"], iso_df["Ref_Name"]))

# Tatoeba's per-language stats page carries codes/names not in the SIL table
# (and its names take precedence where both exist).
tato_df = pd.read_html("https://tatoeba.org/en/stats/sentences_by_language")[0]
tato_names = dict(zip(tato_df["Unnamed: 2"], tato_df["Language"]))
names.update(tato_names)
| |
|
| | |
| | |
| | |
| | |
| |
|
| |
|
def pandas_classification_report(y_true, y_pred, labels=None):
    """Build a classification report as a pandas DataFrame.

    Rows are the class labels plus 'accuracy', 'weighted avg' and
    'macro avg'; columns are precision / recall / f1-score / support
    (support cast to int).

    Parameters
    ----------
    y_true : array-like
        Gold labels.
    y_pred : array-like
        Predicted labels.
    labels : list, optional
        Explicit label order for the per-class rows. When None, labels are
        derived from the data (previously None produced integer row names,
        making the report unusable without an explicit list).

    Returns
    -------
    pd.DataFrame
        One row per label plus the three summary rows.
    """
    if labels is None:
        # Derive a stable, sorted label order so rows are properly named.
        labels = sorted(set(y_true) | set(y_pred))
    metrics_summary = precision_recall_fscore_support(
        y_true=y_true,
        y_pred=y_pred,
        labels=labels)
    weighted_avg = list(precision_recall_fscore_support(
        y_true=y_true,
        y_pred=y_pred,
        labels=labels,
        average='weighted'))
    macro_avg = list(precision_recall_fscore_support(
        y_true=y_true,
        y_pred=y_pred,
        labels=labels,
        average='macro'))
    # Mirror sklearn's text report: accuracy sits in the f1-score slot,
    # the other metric slots stay NaN.
    accuracy = [np.nan, np.nan, accuracy_score(y_true=y_true, y_pred=y_pred), np.nan]
    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index,
        columns=labels)

    # Averaged calls return support=None; replace it with the total count
    # so the summary rows carry the overall number of samples.
    support = class_report_df.loc['support']
    total = support.sum()
    weighted_avg[-1] = total
    macro_avg[-1] = total
    accuracy[-1] = total

    class_report_df['accuracy'] = accuracy
    class_report_df['weighted avg'] = weighted_avg
    class_report_df['macro avg'] = macro_avg
    report = class_report_df.T
    report["support"] = report["support"].astype(int)
    return report
| |
|
| |
|
def _label_map_for(langs):
    """Map each known language code to itself; unseen predictions → '---'."""
    mapping = defaultdict(lambda: "---")
    for code in langs:
        mapping[code] = code
    return mapping


def _add_predictions(df, model, mapping):
    """Attach a 'nordic-lid' column with the model's predicted language.

    fastText returns labels such as '__label__nob'; the last three characters
    are taken as the ISO 639-3 code (assumes 3-letter codes — TODO confirm
    all training labels follow that convention). Newlines are stripped because
    fastText predicts on single lines.
    """
    df["nordic-lid"] = df.progress_apply(
        lambda row: mapping[model.predict(row["text"].replace("\n", " "))[0][0][-3:]],
        axis=1,
    )
    return df


def _evaluate(model, suffix=""):
    """Evaluate `model` on the train/validation/test splits.

    Reads '<split><suffix>.csv' / '<split><suffix>.txt' for each split,
    printing fastText's built-in test numbers and a sklearn classification
    report. Returns (test DataFrame with predictions, training languages).
    """
    heading_suffix = suffix.replace("_", " ").upper()  # "" or " ALL"
    train = pd.read_csv(f"train{suffix}.csv")
    langs = train.lang.unique().tolist()
    mapping = _label_map_for(langs)
    df = train
    for split in ("train", "validation", "test"):
        if split != "train":
            df = pd.read_csv(f"{split}{suffix}.csv")
        df = _add_predictions(df, model, mapping)
        print(f"{split.upper()}{heading_suffix}")
        print(model.test(f"{split}{suffix}.txt"))
        print(classification_report(df["lang"], df["nordic-lid"], digits=4))
    return df, langs


scores_text = ""
for model_name in ("nordic-lid.bin", "nordic-lid_all.bin"):
    print(
        f"""
------------
{model_name}
------------
""")
    model = fasttext.load_model(model_name)

    # Every model is evaluated on the base splits; the *_all model is also
    # evaluated on the extended splits, and those feed the score table below.
    test, langs = _evaluate(model)
    if "_all" in model_name:
        test, langs = _evaluate(model, "_all")

    # Human-readable names for the table, sorted by ISO code.
    langs_df = pd.DataFrame({"ISO-639-3": langs}).sort_values("ISO-639-3")
    langs_df["Language"] = langs_df["ISO-639-3"].apply(names.__getitem__)
    langs_df = langs_df.set_index("ISO-639-3")

    # Fix: gold labels are y_true and model output is y_pred — the previous
    # call had them swapped, which flipped precision/recall and based support
    # on prediction counts instead of gold counts in scores.md.
    report_df = pandas_classification_report(test["lang"], test["nordic-lid"], sorted(langs))
    scores = report_df.join(langs_df)  # summary rows get no Language (NaN)
    scores.columns = map(str.title, scores.columns)
    scores.index.name = "ISO-639-3"
    scores = scores[["Language"] + [col.title() for col in scores.columns if col != "Language"]]
    scores_text += f"## {model_name}\n\n{scores.reset_index().to_markdown(index=False, floatfmt='.4f')}\n\n"

    print()

print(scores_text)
Path("./scores.md").write_text(scores_text)
| |
|