update lm

Browse files

Files changed (3) hide show

build_lm_processor.ipynb +18 -31
language_model/unigrams.txt +0 -0
special_tokens_map.json +1 -1

build_lm_processor.ipynb CHANGED Viewed

@@ -3,7 +3,7 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "5393aa33",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -25,32 +25,21 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "id": "2d34d3b8",
    "metadata": {},
    "outputs": [],
    "source": [
     "# KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_text_word_unigram.arpa'\n",
-    "KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_wiki_ngram.arpa'"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 3,
-   "id": "f0354cb2",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Loading the LM will be faster if you build a binary file.\n",
-      "Reading /workspace/xls-r-300m-km/vitouphy/xls-r-300m-km/language_model/km_text.arpa\n",
-      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
-      "Only 81 unigrams passed as vocabulary. Is this small or artificial data?\n",
-      "****************************************************************************************************\n"
-     ]
-    }
-   ],
    "source": [
     "processor = AutoProcessor.from_pretrained(\"vitouphy/xls-r-300m-km\")"
    ]
@@ -58,7 +47,7 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "id": "109f28e9",
    "metadata": {},
    "outputs": [
     {
@@ -78,18 +67,16 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "id": "300cec39",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Loading the LM will be faster if you build a binary file.\n",
-      "Reading /workspace/xls-r-300m-km/data/km_wiki_ngram.arpa\n",
-      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
       "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
-      "****************************************************************************************************\n"
      ]
     }
    ],
@@ -102,8 +89,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "id": "27dd8427",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -116,8 +103,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "id": "94eb248e",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -126,7 +113,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "8f9b3dcc",
    "metadata": {},
    "source": [
     "## Save Model"
@@ -135,7 +122,7 @@
   {
    "cell_type": "code",
    "execution_count": 9,
-   "id": "8b584690",
    "metadata": {},
    "outputs": [
     {
@@ -160,7 +147,7 @@
   {
    "cell_type": "code",
    "execution_count": 12,
-   "id": "3712c030",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -170,7 +157,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b5d8de20",
    "metadata": {},
    "outputs": [],
    "source": []

   {
    "cell_type": "code",
    "execution_count": 1,
+   "id": "57176d39",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 2,
+   "id": "dbc1f98a",
    "metadata": {},
    "outputs": [],
    "source": [
     "# KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_text_word_unigram.arpa'\n",
+    "# KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_wiki_ngram.arpa'\n",
+    "KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/kmwiki_5gram.binary'"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 3,
+   "id": "54d76e5f",
    "metadata": {},
+   "outputs": [],
    "source": [
     "processor = AutoProcessor.from_pretrained(\"vitouphy/xls-r-300m-km\")"
    ]
   {
    "cell_type": "code",
    "execution_count": 4,
+   "id": "c76a5c8e",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 5,
+   "id": "8b640127",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "Unigrams not provided and cannot be automatically determined from LM file (only arpa format). Decoding accuracy might be reduced.\n",
       "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
+      "No known unigrams provided, decoding results might be a lot worse.\n"
      ]
     }
    ],
   },
   {
    "cell_type": "code",
+   "execution_count": 6,
+   "id": "2560c32d",
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 7,
+   "id": "badc19a1",
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "markdown",
+   "id": "89e517c8",
    "metadata": {},
    "source": [
     "## Save Model"
   {
    "cell_type": "code",
    "execution_count": 9,
+   "id": "ed9535c8",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 12,
+   "id": "758b6f9a",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "3166a19b",
    "metadata": {},
    "outputs": [],
    "source": []

language_model/unigrams.txt CHANGED Viewed

The diff for this file is too large to render. See raw diff

special_tokens_map.json CHANGED Viewed

@@ -1 +1 @@

- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}

+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}