versae committed on Nov 18, 2022

Commit

486585a

•

1 Parent(s): 6b50e33

First full version of the models

Browse files

Files changed (27) hide show

.gitattributes +18 -0
code/create_fasttext_data.py +39 -0
code/create_models.py +10 -0
code/create_tatoeba_data.py +36 -0
code/prepare_data.sh +23 -0
code/test_models.py +145 -0
data/test.csv +3 -0
data/test.txt +3 -0
data/test_all.csv +3 -0
data/test_all.txt +3 -0
data/test_tatoeba.csv +3 -0
data/test_tatoeba.txt +3 -0
data/train.csv +3 -0
data/train.txt +3 -0
data/train_all.csv +3 -0
data/train_all.txt +3 -0
data/train_tatoeba.csv +3 -0
data/train_tatoeba.txt +3 -0
data/validation.csv +3 -0
data/validation.txt +3 -0
data/validation_all.csv +3 -0
data/validation_all.txt +3 -0
data/validation_tatoeba.csv +3 -0
data/validation_tatoeba.txt +3 -0
nordic-lid.159.bin +3 -0
nordic-lid.bin +3 -0
scores.md +188 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,21 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/validation_all.csv filter=lfs diff=lfs merge=lfs -text
+data/validation_all.txt filter=lfs diff=lfs merge=lfs -text
+data/validation_tatoeba.csv filter=lfs diff=lfs merge=lfs -text
+data/test.txt filter=lfs diff=lfs merge=lfs -text
+data/train.csv filter=lfs diff=lfs merge=lfs -text
+data/train_tatoeba.csv filter=lfs diff=lfs merge=lfs -text
+data/test_tatoeba.csv filter=lfs diff=lfs merge=lfs -text
+data/validation_tatoeba.txt filter=lfs diff=lfs merge=lfs -text
+data/test_all.csv filter=lfs diff=lfs merge=lfs -text
+data/test_all.txt filter=lfs diff=lfs merge=lfs -text
+data/test.csv filter=lfs diff=lfs merge=lfs -text
+data/validation.csv filter=lfs diff=lfs merge=lfs -text
+data/test_tatoeba.txt filter=lfs diff=lfs merge=lfs -text
+data/train_all.csv filter=lfs diff=lfs merge=lfs -text
+data/train_tatoeba.txt filter=lfs diff=lfs merge=lfs -text
+data/train_all.txt filter=lfs diff=lfs merge=lfs -text
+data/train.txt filter=lfs diff=lfs merge=lfs -text
+data/validation.txt filter=lfs diff=lfs merge=lfs -text

code/create_fasttext_data.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from pathlib import Path
+import pandas as pd
+train = (pd.concat([pd.read_csv(p, sep="\t", names=["_text", "lang"]) for p in Path("/nfsmounts/datastore/langid2").rglob("*_train.tsv")])
+    .assign(text=lambda x: x["_text"].str[6:].str.strip())
+    .drop("_text", axis=1)
+    .query("'xxx' not in text")
+    .sample(frac=1)
+    .reset_index()
+    .drop('index', axis=1)
+)
+validation = (pd.concat([pd.read_csv(p, sep="\t", names=["_text", "lang"]) for p in Path("/nfsmounts/datastore/langid2").rglob("*_dev.tsv")])
+    .assign(text=lambda x: x["_text"].str[6:].str.strip())
+    .drop("_text", axis=1)
+    .query("'xxx' not in text")
+    .sample(frac=1)
+    .reset_index()
+    .drop('index', axis=1)
+)
+test = (pd.concat([pd.read_csv(p, sep="\t", names=["_text", "lang"]) for p in Path("/nfsmounts/datastore/langid2").rglob("*_test.tsv")])
+    .assign(text=lambda x: x["_text"].str[6:].str.strip())
+    .drop("_text", axis=1)
+    .query("'xxx' not in text")
+    .sample(frac=1)
+    .reset_index()
+    .drop('index', axis=1)
+)
+train.to_csv("train.csv", index=False)
+validation.to_csv("validation.csv", index=False)
+test.to_csv("test.csv", index=False)
+Path("train.txt").write_text("\n".join(train.apply(lambda row: f"__label__{row['lang']} {row['text']}".replace('\n', ' '), axis=1).values))
+Path("validation.txt").write_text("\n".join(validation.apply(lambda row: f"__label__{row['lang']} {row['text']}".replace('\n', ' '), axis=1).values))
+Path("test.txt").write_text("\n".join(test.apply(lambda row: f"__label__{row['lang']} {row['text']}".replace('\n', ' '), axis=1).values))

code/create_models.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import fasttext
+model = fasttext.train_supervised(input='train.txt', autotuneValidationFile='validation.txt', autotuneDuration=60*60)
+model.save_model("nordic-lid.bin")
+print(model.test("test.txt"))
+model_all = fasttext.train_supervised(input='train_all.txt', autotuneValidationFile='validation_all.txt', autotuneDuration=60*60)
+model_all.save_model("nordic-lid_all.bin")
+print(model_all.test("test.txt"))
+print(model_all.test("test_all.txt"))

code/create_tatoeba_data.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import tarfile
+from pathlib import Path
+import pandas as pd
+import requests
+from sklearn.model_selection import train_test_split
+if not Path("sentences.csv").exists():
+    print("Downloading data")
+    link = "http://downloads.tatoeba.org/exports/sentences.tar.bz2"
+    with requests.get(link , stream=True) as rx, tarfile.open(fileobj=rx.raw, mode="r|bz2") as tarobj:
+        tarobj.extractall("./")
+print("Preparing sentences")
+sents = pd.read_csv("sentences.csv", sep="\t", names=["index", "lang", "text"]).dropna().drop("index", axis=1)
+sents = sents[sents.lang != "\\N"]
+sents = sents[sents.groupby("lang")["lang"].transform("size") > 500]
+sents = sents.groupby("lang").apply(lambda group: group.sample(11_000, replace=True)).droplevel("lang").drop_duplicates()
+sents = sents.sample(frac=1).reset_index().drop('index', axis=1)
+lang_count = len(sents.lang.unique())
+print(f"Splitting sentences in {lang_count} languages")
+train, validation_test = train_test_split(sents, stratify=sents.lang, test_size=0.1)
+validation, test = train_test_split(validation_test, stratify=validation_test.lang, test_size=0.5)
+print("Writing files")
+train.to_csv("train_tatoeba.csv", index=False)
+validation.to_csv("validation_tatoeba.csv", index=False)
+test.to_csv("test_tatoeba.csv", index=False)
+Path("train_tatoeba.txt").write_text("\n".join(train.apply(lambda row: f"__label__{row['lang']} {row['text']}".replace('\n', ' '), axis=1).values))
+Path("validation_tatoeba.txt").write_text("\n".join(validation.apply(lambda row: f"__label__{row['lang']} {row['text']}".replace('\n', ' '), axis=1).values))
+Path("test_tatoeba.txt").write_text("\n".join(test.apply(lambda row: f"__label__{row['lang']} {row['text']}".replace('\n', ' '), axis=1).values))
+print("Done")

code/prepare_data.sh ADDED Viewed

	@@ -0,0 +1,23 @@

+#wget http://downloads.tatoeba.org/exports/sentences.tar.bz2
+#bunzip2 sentences.tar.bz2
+#tar xvf sentences.tar
+#awk -F"\t" '{print"__label__"$2" "$3}' < sentences.csv | shuf > all.txt
+#head -3 all.txt
+#head -n 10000 all.txt > validation_tatoeba.txt
+#tail -n +10001 all.txt > train_tatoeba.txt
+# Build the per-source splits, then merge them into the *_all files.
+python create_fasttext_data.py
+python create_tatoeba_data.py
+# The inputs are listed explicitly: a glob such as train*.txt also matches the
+# train_all.txt output itself, so a re-run could feed a stale or half-written
+# merged file back into the merge.
+cat train.txt train_tatoeba.txt | shuf > train_all.txt
+cat validation.txt validation_tatoeba.txt | shuf > validation_all.txt
+cat test.txt test_tatoeba.txt | shuf > test_all.txt
+python <<EOF
+from pathlib import Path
+import pandas as pd
+# Turn every merged fastText file back into a lang/text CSV.
+for split in ("train", "validation", "test"):
+    lines = Path(f"{split}_all.txt").read_text(encoding="utf-8").split("\n")
+    # line[9:] drops the 9-character "__label__" prefix. partition() always
+    # yields a (lang, text) pair, even for a line that has a label but no text.
+    records = [line[9:].partition(" ")[::2] for line in lines if line]
+    pd.DataFrame.from_records(records, columns=["lang", "text"]).to_csv(f"{split}_all.csv", index=False)
+EOF

code/test_models.py ADDED Viewed

	@@ -0,0 +1,145 @@

+from collections import defaultdict
+import fasttext
+import pandas as pd
+from sklearn.metrics import classification_report
+from tqdm import tqdm; tqdm.pandas()
+#!pip install tabulate
+import io
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import requests
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import classification_report
+from sklearn.metrics import precision_recall_fscore_support
+names = pd.read_csv(
+    io.StringIO(requests.get("https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab").text
+), sep="\t").set_index("Id").rename(
+    columns={"Ref_Name": "name"}
+)[["name"]].to_dict()["name"]
+tato_names = pd.read_html(
+    "https://tatoeba.org/en/stats/sentences_by_language"
+)[0].rename(
+    columns={"Unnamed: 2": "code", "Language": "name"}
+).set_index("code")[["name"]].to_dict()["name"]
+names.update(tato_names)
+# langs = pd.read_csv("train.csv").lang.unique().tolist()
+# langs_df = pd.DataFrame({"ISO-639-3": langs}).sort_values("ISO-639-3")
+# langs_df["Language"] = langs_df["ISO-639-3"].apply(names.__getitem__)
+# langs_df = langs_df.set_index("ISO-639-3")
+def pandas_classification_report(y_true, y_pred, labels=None):
+    metrics_summary = precision_recall_fscore_support(
+            y_true=y_true,
+            y_pred=y_pred,
+            labels=labels)
+    weighted_avg = list(precision_recall_fscore_support(
+            y_true=y_true,
+            y_pred=y_pred,
+            labels=labels,
+            average='weighted'))
+    macro_avg = list(precision_recall_fscore_support(
+            y_true=y_true,
+            y_pred=y_pred,
+            labels=labels,
+            average='macro'))
+    accuracy = [np.nan, np.nan, accuracy_score(y_true=y_true, y_pred=y_pred), np.nan]
+    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
+    class_report_df = pd.DataFrame(
+        list(metrics_summary),
+        index=metrics_sum_index,
+        columns=labels)
+    support = class_report_df.loc['support']
+    total = support.sum()
+    weighted_avg[-1] = total
+    macro_avg[-1] = total
+    accuracy[-1] = total
+    class_report_df['accuracy'] = accuracy
+    class_report_df['weighted avg'] = weighted_avg
+    class_report_df['macro avg'] = macro_avg
+    report = class_report_df.T
+    report["support"] = report["support"].astype(int)
+    return report
+scores_text = ""
+for model_name in ("nordic-lid.bin", "nordic-lid_all.bin"):
+    print(
+f"""
+------------
+{model_name}
+------------
+""")
+    model = fasttext.load_model(model_name)
+    train = pd.read_csv("train.csv")
+    ddict = defaultdict(lambda: "---")
+    for k in train.lang.unique().tolist():
+        ddict[k] = k
+    train["nordic-lid"] = train.progress_apply(lambda row: ddict[model.predict(row["text"].replace("\n", " "))[0][0][-3:]], axis=1)
+    print("TRAIN")
+    print(model.test("train.txt"))
+    print(classification_report(train["lang"], train["nordic-lid"], digits=4))
+    val = pd.read_csv("validation.csv")
+    val["nordic-lid"] = val.progress_apply(lambda row: ddict[model.predict(row["text"].replace("\n", " "))[0][0][-3:]], axis=1)
+    print("VALIDATION")
+    print(model.test("validation.txt"))
+    print(classification_report(val["lang"], val["nordic-lid"], digits=4))
+    test = pd.read_csv("test.csv")
+    test["nordic-lid"] = test.progress_apply(lambda row: ddict[model.predict(row["text"].replace("\n", " "))[0][0][-3:]], axis=1)
+    print("TEST")
+    print(model.test("test.txt"))
+    print(classification_report(test["lang"], test["nordic-lid"], digits=4))
+    if "_all" in model_name:
+        train = pd.read_csv("train_all.csv")
+        ddict = defaultdict(lambda: "---")
+        for k in train.lang.unique().tolist():
+            ddict[k] = k
+        train["nordic-lid"] = train.progress_apply(lambda row: ddict[model.predict(row["text"].replace("\n", " "))[0][0][-3:]], axis=1)
+        print("TRAIN ALL")
+        print(model.test("train_all.txt"))
+        print(classification_report(train["lang"], train["nordic-lid"], digits=4))
+        val = pd.read_csv("validation_all.csv")
+        val["nordic-lid"] = val.progress_apply(lambda row: ddict[model.predict(row["text"].replace("\n", " "))[0][0][-3:]], axis=1)
+        print("VALIDATION ALL")
+        print(model.test("validation_all.txt"))
+        print(classification_report(val["lang"], val["nordic-lid"], digits=4))
+        test = pd.read_csv("test_all.csv")
+        test["nordic-lid"] = test.progress_apply(lambda row: ddict[model.predict(row["text"].replace("\n", " "))[0][0][-3:]], axis=1)
+        print("TEST ALL")
+        print(model.test("test_all.txt"))
+        print(classification_report(test["lang"], test["nordic-lid"], digits=4))
+        langs = pd.read_csv("train_all.csv").lang.unique().tolist()
+    else:
+        langs = pd.read_csv("train.csv").lang.unique().tolist()
+    langs_df = pd.DataFrame({"ISO-639-3": langs}).sort_values("ISO-639-3")
+    langs_df["Language"] = langs_df["ISO-639-3"].apply(names.__getitem__)
+    langs_df = langs_df.set_index("ISO-639-3")
+    report_df = pandas_classification_report(test["nordic-lid"], test["lang"], sorted(langs))
+    scores = report_df.join(langs_df)
+    scores.columns = map(str.title, scores.columns)
+    scores.index.name = "ISO-639-3"
+    scores = scores[["Language"] + [col.title() for col in scores.columns if col != "Language"]]
+    scores_text += f"## {model_name}\n\n{scores.reset_index().to_markdown(index=False, floatfmt='.4f')}\n\n"
+    print()
+print(scores_text)
+Path("./scores.md").write_text(scores_text)

data/test.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e74990101c32ce2127b5746ddb94aed6fd2a58602dd43d5e0ba258ab9325b1a5
+size 580011

data/test.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:50a7ad73a0065eb0d501c6608a0374148a5af0aacf5f3f558a932e9610985a71
+size 624477

data/test_all.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a98db9e35920fbf002bae9195d8006cc63370360001f6ea41090155c70cda2c
+size 2449952

data/test_all.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:73b8c4cc9a757a963b76c7b8e1e15370644a8967ef07237657b25336dda1955a
+size 2831836

data/test_tatoeba.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:630ca5569ae61e8b3a9a525b59cf966cc00520fa2507c0c5e3a589cc5f2c14e0
+size 1869943

data/test_tatoeba.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:05ea1e7d20fc6252e2bc151d20f3be616e6ebf58ae2e043ca983f5d96276e7e7
+size 2207358

data/train.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:beefe8b0a9ba61cf55c9ef0885d36170ea4fb1f8c1487f8c8e131b7f15c598ec
+size 8483510

data/train.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8d82958dfc4736b74801c21b090596d3da3aa60bea4f54d706f71753f1bbe8dd
+size 9136984

data/train_all.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5dbf35164649f49e3f230b54978032b764d364dcb51b7f6f98276039661377f4
+size 42158820

data/train_all.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:74f8c470418d15e87bd6c1ab4f3e40b5838546682d79692463b7f7f275a67914
+size 48886432

data/train_tatoeba.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:06120b49c7dbbd1a63739a30f5fb305a39da8477297768c4374470b508dd70d8
+size 33675312

data/train_tatoeba.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:752a82a66e0002440eb4a035838d69c3cd9294ca28094318a584def7cab4b4dd
+size 39749447

data/validation.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e9d303938403699700be43133e06038b9c0f38954b875337cb29be89b94a39a8
+size 577673

data/validation.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:064db1e9bbf1013a6b505dc1e9c66171ddbfe831f433ebc16a9f1c024c4683a1
+size 622232

data/validation_all.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c16ff730a83e409b494b71b77ebfa0c94498218a9574863b597343c3dd222cd2
+size 2462876

data/validation_all.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:28c998fca8199809f8ac1b387b33de9002422af0f1bd2e6150c85ab2a7df1f71
+size 2844912

data/validation_tatoeba.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96fc539c28120cc3a98b0a006c2cd36b7f1917987a3c3d6bab7a1c920595c546
+size 1885205

data/validation_tatoeba.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8123f84395ff12491e0b5623fc36b07f977aa935d00e6aedbd5e5857e32336f6
+size 2222679

nordic-lid.159.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:90835f286b3ecd1538b15c8f82a2dd461ecb5614903741767173a2c1ba8e6d48
+size 747823000

nordic-lid.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a65c608ed4f0bfe05694ed75fd7b1643110a215cfd39fbc82a40f3374aaaf26e
+size 273830875

scores.md ADDED Viewed

	@@ -0,0 +1,188 @@

+## nordic-lid.bin
+| ISO-639-3    | Language          |   Precision |   Recall |   F1-Score |   Support |
+|:-------------|:------------------|------------:|---------:|-----------:|----------:|
+| dan          | Danish            |      0.9720 |   0.9838 |     0.9779 |       494 |
+| eng          | English           |      0.9980 |   0.9940 |     0.9960 |       502 |
+| fao          | Faroese           |      0.9920 |   0.9940 |     0.9930 |       499 |
+| fin          | Finnish           |      1.0000 |   1.0000 |     1.0000 |       500 |
+| isl          | Icelandic         |      0.9900 |   0.9920 |     0.9910 |       499 |
+| nno          | Norwegian Nynorsk |      0.9920 |   0.9861 |     0.9890 |       503 |
+| nob          | Norwegian Bokmål  |      0.9840 |   0.9743 |     0.9791 |       505 |
+| sma          | Southern Sami     |      0.9800 |   0.9703 |     0.9751 |       101 |
+| sme          | Northern Sami     |      1.0000 |   0.9921 |     0.9960 |       504 |
+| smj          | Lule Sami         |      0.9920 |   0.9960 |     0.9940 |       498 |
+| smn          | Inari Sami        |      0.9950 |   1.0000 |     0.9975 |       199 |
+| sms          | Skolt Sami        |      0.9900 |   0.9950 |     0.9925 |       199 |
+| swe          | Swedish           |      0.9860 |   0.9920 |     0.9890 |       497 |
+| accuracy     | nan               |    nan      | nan      |     0.9905 |      5500 |
+| weighted avg | nan               |      0.9906 |   0.9905 |     0.9905 |      5500 |
+| macro avg    | nan               |      0.9901 |   0.9900 |     0.9900 |      5500 |
+## nordic-lid_all.bin
+| ISO-639-3    | Language                    |   Precision |   Recall |   F1-Score |   Support |
+|:-------------|:----------------------------|------------:|---------:|-----------:|----------:|
+| afr          | Afrikaans                   |      0.9476 |   0.9476 |     0.9476 |       191 |
+| ara          | Arabic                      |      0.9708 |   0.9472 |     0.9588 |       492 |
+| arq          | Algerian Arabic             |      0.9478 |   0.9237 |     0.9356 |       118 |
+| arz          | Egyptian Arabic             |      0.6316 |   0.7660 |     0.6923 |        47 |
+| asm          | Assamese                    |      0.9828 |   0.9884 |     0.9856 |       173 |
+| avk          | Kotava                      |      0.9791 |   0.9894 |     0.9842 |       189 |
+| aze          | Azerbaijani                 |      0.9707 |   0.9789 |     0.9748 |       237 |
+| bel          | Belarusian                  |      0.9892 |   0.9733 |     0.9812 |       375 |
+| ben          | Bengali                     |      0.9872 |   0.9872 |     0.9872 |       235 |
+| ber          | Berber                      |      0.8881 |   0.8388 |     0.8627 |       577 |
+| bos          | Bosnian                     |      0.1310 |   0.3333 |     0.1880 |        33 |
+| bre          | Breton                      |      0.9648 |   0.9786 |     0.9716 |       280 |
+| bua          | Buryat                      |      0.9111 |   0.9111 |     0.9111 |        45 |
+| bul          | Bulgarian                   |      0.9597 |   0.9662 |     0.9630 |       444 |
+| cat          | Catalan                     |      0.9538 |   0.9475 |     0.9507 |       305 |
+| cbk          | Chavacano                   |      0.9627 |   0.9773 |     0.9699 |       132 |
+| ceb          | Cebuano                     |      0.8205 |   0.8533 |     0.8366 |        75 |
+| ces          | Czech                       |      0.9606 |   0.9740 |     0.9672 |       500 |
+| chv          | Chuvash                     |      0.9756 |   0.9877 |     0.9816 |        81 |
+| ckb          | Central Kurdish (Soranî)    |      0.9751 |   0.9915 |     0.9832 |       355 |
+| ckt          | Chukchi                     |      0.9615 |   1.0000 |     0.9804 |        25 |
+| cmn          | Mandarin Chinese            |      0.9530 |   0.8743 |     0.9120 |       557 |
+| cor          | Cornish                     |      0.9945 |   0.9628 |     0.9784 |       188 |
+| csb          | Kashubian                   |      0.9574 |   1.0000 |     0.9783 |        45 |
+| cym          | Welsh                       |      0.9375 |   0.9615 |     0.9494 |        78 |
+| dan          | Danish                      |      0.9401 |   0.9363 |     0.9382 |      1005 |
+| deu          | German                      |      0.9853 |   0.9781 |     0.9817 |       549 |
+| dsb          | Lower Sorbian               |      0.8704 |   0.8246 |     0.8468 |        57 |
+| dtp          | Central Dusun               |      0.8881 |   0.9549 |     0.9203 |       133 |
+| ell          | Greek                       |      0.9979 |   0.9979 |     0.9979 |       475 |
+| eng          | English                     |      0.9895 |   0.9839 |     0.9867 |      1055 |
+| epo          | Esperanto                   |      0.9817 |   0.9926 |     0.9871 |       540 |
+| est          | Estonian                    |      0.9545 |   0.9711 |     0.9628 |       173 |
+| eus          | Basque                      |      0.9844 |   0.9583 |     0.9712 |       264 |
+| fao          | Faroese                     |      0.9820 |   0.9859 |     0.9840 |       498 |
+| fin          | Finnish                     |      0.9932 |   0.9780 |     0.9855 |      1045 |
+| fkv          | Kven Finnish                |      0.6154 |   0.8889 |     0.7273 |        18 |
+| fra          | French                      |      0.9871 |   0.9908 |     0.9890 |       542 |
+| frr          | North Frisian               |      0.9640 |   0.9710 |     0.9675 |       138 |
+| fry          | Frisian                     |      0.6774 |   0.9545 |     0.7925 |        22 |
+| gcf          | Guadeloupean Creole French  |      0.9619 |   1.0000 |     0.9806 |       101 |
+| gla          | Scottish Gaelic             |      0.9412 |   0.9796 |     0.9600 |        49 |
+| gle          | Irish                       |      0.9635 |   0.9778 |     0.9706 |       135 |
+| glg          | Galician                    |      0.9104 |   0.9369 |     0.9234 |       206 |
+| gos          | Gronings                    |      0.9549 |   0.9588 |     0.9569 |       243 |
+| grc          | Ancient Greek               |      0.9828 |   0.9828 |     0.9828 |        58 |
+| grn          | Guarani                     |      0.9684 |   0.9935 |     0.9808 |       154 |
+| guc          | Wayuu                       |      0.9111 |   0.9762 |     0.9425 |        42 |
+| hau          | Hausa                       |      0.9814 |   0.9953 |     0.9883 |       425 |
+| heb          | Hebrew                      |      1.0000 |   1.0000 |     1.0000 |       536 |
+| hin          | Hindi                       |      1.0000 |   0.9974 |     0.9987 |       391 |
+| hoc          | Ho                          |      0.9429 |   0.9167 |     0.9296 |        36 |
+| hrv          | Croatian                    |      0.7447 |   0.6119 |     0.6718 |       286 |
+| hrx          | Hunsrik                     |      0.8727 |   0.9231 |     0.8972 |        52 |
+| hsb          | Upper Sorbian               |      0.8400 |   0.8289 |     0.8344 |        76 |
+| hun          | Hungarian                   |      0.9853 |   0.9926 |     0.9889 |       539 |
+| hye          | Armenian                    |      1.0000 |   1.0000 |     1.0000 |       225 |
+| ido          | Ido                         |      0.9791 |   0.9563 |     0.9676 |       343 |
+| ile          | Interlingue                 |      0.9352 |   0.9416 |     0.9384 |       291 |
+| ilo          | Ilocano                     |      0.9917 |   0.9600 |     0.9756 |       125 |
+| ina          | Interlingua                 |      0.9558 |   0.9621 |     0.9589 |       449 |
+| ind          | Indonesian                  |      0.8526 |   0.8203 |     0.8361 |       423 |
+| isl          | Icelandic                   |      0.9863 |   0.9897 |     0.9880 |       871 |
+| ita          | Italian                     |      0.9817 |   0.9711 |     0.9764 |       553 |
+| jav          | Javanese                    |      0.9600 |   0.9600 |     0.9600 |        50 |
+| jbo          | Lojban                      |      1.0000 |   0.9926 |     0.9963 |       405 |
+| jpn          | Japanese                    |      0.9851 |   1.0000 |     0.9925 |       530 |
+| kab          | Kabyle                      |      0.8382 |   0.8959 |     0.8661 |       509 |
+| kat          | Georgian                    |      1.0000 |   0.9885 |     0.9942 |       260 |
+| kaz          | Kazakh                      |      0.9896 |   0.9845 |     0.9870 |       193 |
+| kha          | Khasi                       |      0.9038 |   0.9400 |     0.9216 |       100 |
+| khm          | Khmer                       |      1.0000 |   1.0000 |     1.0000 |        75 |
+| kmr          | Northern Kurdish (Kurmancî) |      0.9851 |   0.9763 |     0.9807 |       338 |
+| knc          | Central Kanuri              |      0.9719 |   0.9886 |     0.9802 |       175 |
+| kor          | Korean                      |      0.9972 |   0.9832 |     0.9902 |       358 |
+| kzj          | Coastal Kadazan             |      0.9615 |   0.9336 |     0.9474 |       241 |
+| lad          | Ladino                      |      0.7846 |   0.7969 |     0.7907 |        64 |
+| lat          | Latin                       |      0.9756 |   0.9639 |     0.9697 |       498 |
+| lfn          | Lingua Franca Nova          |      0.9745 |   0.9700 |     0.9723 |       434 |
+| lij          | Ligurian                    |      0.9333 |   0.9333 |     0.9333 |        90 |
+| lin          | Lingala                     |      0.9765 |   0.9765 |     0.9765 |       213 |
+| lit          | Lithuanian                  |      0.9864 |   0.9922 |     0.9893 |       512 |
+| ltz          | Luxembourgish               |      0.9773 |   0.9348 |     0.9556 |        46 |
+| lvs          | Latvian                     |      0.9597 |   0.9795 |     0.9695 |       146 |
+| lzh          | Literary Chinese            |      0.7692 |   0.8046 |     0.7865 |        87 |
+| mal          | Malayalam                   |      1.0000 |   1.0000 |     1.0000 |        44 |
+| mar          | Marathi                     |      0.9961 |   1.0000 |     0.9980 |       509 |
+| mhr          | Meadow Mari                 |      0.9849 |   0.9751 |     0.9800 |       201 |
+| mkd          | Macedonian                  |      0.9572 |   0.9480 |     0.9526 |       519 |
+| mon          | Mongolian                   |      0.9708 |   0.9779 |     0.9744 |       136 |
+| mus          | Muskogee (Creek)            |      0.9000 |   0.9643 |     0.9310 |        28 |
+| mya          | Burmese                     |      1.0000 |   0.9643 |     0.9818 |        28 |
+| nds          | Low German (Low Saxon)      |      0.9829 |   0.9710 |     0.9769 |       414 |
+| nld          | Dutch                       |      0.9662 |   0.9772 |     0.9717 |       527 |
+| nnb          | Nande                       |      0.9870 |   0.9870 |     0.9870 |       385 |
+| nno          | Norwegian Nynorsk           |      0.9585 |   0.9652 |     0.9619 |       575 |
+| nob          | Norwegian Bokmål            |      0.9247 |   0.9156 |     0.9201 |       912 |
+| nst          | Naga (Tangshang)            |      1.0000 |   1.0000 |     1.0000 |        39 |
+| nus          | Nuer                        |      0.9903 |   0.9903 |     0.9903 |       103 |
+| oci          | Occitan                     |      0.9672 |   0.9555 |     0.9613 |       247 |
+| orv          | Old East Slavic             |      0.9692 |   0.9692 |     0.9692 |        65 |
+| oss          | Ossetian                    |      0.9818 |   0.9926 |     0.9872 |       271 |
+| ota          | Ottoman Turkish             |      0.9204 |   0.9905 |     0.9541 |       105 |
+| pam          | Kapampangan                 |      0.9865 |   0.9865 |     0.9865 |        74 |
+| pcd          | Picard                      |      0.9552 |   0.9846 |     0.9697 |        65 |
+| pes          | Persian                     |      0.9890 |   0.9890 |     0.9890 |       455 |
+| pms          | Piedmontese                 |      0.8780 |   0.9000 |     0.8889 |        40 |
+| pol          | Polish                      |      0.9848 |   0.9829 |     0.9838 |       526 |
+| por          | Portuguese                  |      0.9687 |   0.9616 |     0.9651 |       547 |
+| prg          | Old Prussian                |      0.9800 |   0.9800 |     0.9800 |        50 |
+| rhg          | Rohingya                    |      0.9780 |   0.9944 |     0.9861 |       179 |
+| rom          | Romani                      |      0.9302 |   0.8889 |     0.9091 |        45 |
+| ron          | Romanian                    |      0.9826 |   0.9912 |     0.9869 |       457 |
+| run          | Kirundi                     |      0.9914 |   0.9665 |     0.9788 |       239 |
+| rus          | Russian                     |      0.9634 |   0.9814 |     0.9723 |       537 |
+| sah          | Yakut                       |      1.0000 |   0.9600 |     0.9796 |        50 |
+| sat          | Santali                     |      0.9942 |   0.9942 |     0.9942 |       171 |
+| sdh          | Southern Kurdish            |      0.9423 |   0.9074 |     0.9245 |        54 |
+| shi          | Tashelhit                   |      0.9706 |   0.8980 |     0.9329 |       147 |
+| slk          | Slovak                      |      0.9333 |   0.9380 |     0.9356 |       403 |
+| slv          | Slovenian                   |      0.7018 |   0.8889 |     0.7843 |        45 |
+| sma          | Southern Sami               |      0.9600 |   0.9600 |     0.9600 |       100 |
+| sme          | Northern Sami               |      0.9980 |   0.9901 |     0.9940 |       504 |
+| smj          | Lule Sami                   |      0.9820 |   0.9959 |     0.9889 |       493 |
+| smn          | Inari Sami                  |      0.9950 |   0.9900 |     0.9925 |       201 |
+| sms          | Skolt Sami                  |      0.9750 |   0.9848 |     0.9799 |       198 |
+| spa          | Spanish                     |      0.9760 |   0.9601 |     0.9680 |       551 |
+| sqi          | Albanian                    |      0.9762 |   0.9762 |     0.9762 |       126 |
+| srp          | Serbian                     |      0.8367 |   0.8216 |     0.8291 |       499 |
+| swc          | Congo Swahili               |      0.8727 |   0.8458 |     0.8591 |       454 |
+| swe          | Swedish                     |      0.9819 |   0.9819 |     0.9819 |       994 |
+| swg          | Swabian                     |      0.9694 |   0.9406 |     0.9548 |       101 |
+| swh          | Swahili                     |      0.6798 |   0.7225 |     0.7005 |       191 |
+| tat          | Tatar                       |      0.9791 |   0.9843 |     0.9817 |       381 |
+| tgl          | Tagalog                     |      0.9757 |   0.9710 |     0.9734 |       414 |
+| tha          | Thai                        |      1.0000 |   0.9910 |     0.9955 |       222 |
+| thv          | Tahaggart Tamahaq           |      0.6552 |   0.7037 |     0.6786 |        27 |
+| tig          | Tigre                       |      1.0000 |   1.0000 |     1.0000 |       181 |
+| tlh          | Klingon                     |      1.0000 |   0.9932 |     0.9966 |       442 |
+| tok          | Toki Pona                   |      1.0000 |   1.0000 |     1.0000 |       495 |
+| tpw          | Old Tupi                    |      0.8929 |   0.9259 |     0.9091 |        27 |
+| tuk          | Turkmen                     |      0.9779 |   0.9603 |     0.9690 |       277 |
+| tur          | Turkish                     |      0.9908 |   0.9541 |     0.9721 |       567 |
+| uig          | Uyghur                      |      0.9966 |   0.9900 |     0.9933 |       300 |
+| ukr          | Ukrainian                   |      0.9831 |   0.9831 |     0.9831 |       534 |
+| urd          | Urdu                        |      1.0000 |   0.9914 |     0.9957 |       116 |
+| uzb          | Uzbek                       |      0.8200 |   0.9318 |     0.8723 |        44 |
+| vie          | Vietnamese                  |      0.9977 |   0.9953 |     0.9965 |       427 |
+| vol          | Volapük                     |      0.9908 |   0.9908 |     0.9908 |       218 |
+| war          | Waray                       |      0.9307 |   0.9691 |     0.9495 |        97 |
+| wuu          | Shanghainese                |      0.8318 |   0.9036 |     0.8662 |       197 |
+| xal          | Kalmyk                      |      0.9302 |   0.9524 |     0.9412 |        42 |
+| xmf          | Mingrelian                  |      0.7419 |   0.8519 |     0.7931 |        27 |
+| yid          | Yiddish                     |      0.9971 |   1.0000 |     0.9986 |       348 |
+| yue          | Cantonese                   |      0.9004 |   0.9711 |     0.9344 |       242 |
+| zgh          | Standard Moroccan Tamazight |      0.9873 |   0.9873 |     0.9873 |       158 |
+| zlm          | Malay (Vernacular)          |      0.8488 |   0.8902 |     0.8690 |        82 |
+| zsm          | Malay                       |      0.7606 |   0.7883 |     0.7742 |       274 |
+| zza          | Zaza                        |      0.9294 |   0.9634 |     0.9461 |        82 |
+| accuracy     | nan                         |    nan      | nan      |     0.9591 |     44049 |
+| weighted avg | nan                         |      0.9604 |   0.9591 |     0.9595 |     44049 |
+| macro avg    | nan                         |      0.9371 |   0.9474 |     0.9413 |     44049 |