marinone94 commited on
Commit
48ab1cf
1 Parent(s): 6043014

fix training lm script

Browse files
Files changed (1) hide show
  1. train_n_gram_lm_with_KenLM.ipynb +63 -273
train_n_gram_lm_with_KenLM.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "metadata": {
7
  "id": "YP3vVkqYUpLx"
8
  },
@@ -14,7 +14,7 @@
14
  },
15
  {
16
  "cell_type": "code",
17
- "execution_count": 2,
18
  "metadata": {
19
  "colab": {
20
  "base_uri": "https://localhost:8080/"
@@ -22,22 +22,14 @@
22
  "id": "AWly9SmkgSwE",
23
  "outputId": "8af190ed-5037-4e3b-b91b-b5286d8e0888"
24
  },
25
- "outputs": [
26
- {
27
- "name": "stdout",
28
- "output_type": "stream",
29
- "text": [
30
- "/bin/bash: sudo: command not found\n"
31
- ]
32
- }
33
- ],
34
  "source": [
35
  "!sudo apt-get install git-lfs tree"
36
  ]
37
  },
38
  {
39
  "cell_type": "code",
40
- "execution_count": 3,
41
  "metadata": {
42
  "colab": {
43
  "base_uri": "https://localhost:8080/"
@@ -54,42 +46,42 @@
54
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
55
  "Requirement already satisfied: datasets in /workspace/.local/lib/python3.8/site-packages (2.2.2)\n",
56
  "Requirement already satisfied: transformers in /opt/conda/lib/python3.8/site-packages (4.17.0.dev0)\n",
57
- "Requirement already satisfied: packaging in /opt/conda/lib/python3.8/site-packages (from datasets) (21.3)\n",
58
- "Requirement already satisfied: responses<0.19 in /opt/conda/lib/python3.8/site-packages (from datasets) (0.18.0)\n",
59
- "Requirement already satisfied: fsspec[http]>=2021.05.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (2022.1.0)\n",
60
- "Requirement already satisfied: requests>=2.19.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (2.24.0)\n",
61
- "Requirement already satisfied: pyarrow>=6.0.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (6.0.1)\n",
62
  "Requirement already satisfied: tqdm>=4.62.1 in /opt/conda/lib/python3.8/site-packages (from datasets) (4.62.3)\n",
63
- "Requirement already satisfied: pandas in /opt/conda/lib/python3.8/site-packages (from datasets) (1.4.0)\n",
 
64
  "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.8/site-packages (from datasets) (1.19.2)\n",
65
- "Requirement already satisfied: multiprocess in /opt/conda/lib/python3.8/site-packages (from datasets) (0.70.12.2)\n",
66
  "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (0.4.0)\n",
67
- "Requirement already satisfied: xxhash in /opt/conda/lib/python3.8/site-packages (from datasets) (2.0.2)\n",
68
- "Requirement already satisfied: dill<0.3.5 in /opt/conda/lib/python3.8/site-packages (from datasets) (0.3.4)\n",
69
- "Requirement already satisfied: aiohttp in /opt/conda/lib/python3.8/site-packages (from datasets) (3.8.1)\n",
 
 
 
70
  "Requirement already satisfied: sacremoses in /opt/conda/lib/python3.8/site-packages (from transformers) (0.0.47)\n",
71
- "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.8/site-packages (from transformers) (5.4.1)\n",
72
  "Requirement already satisfied: filelock in /opt/conda/lib/python3.8/site-packages (from transformers) (3.0.12)\n",
73
  "Requirement already satisfied: tokenizers!=0.11.3,>=0.10.1 in /opt/conda/lib/python3.8/site-packages (from transformers) (0.11.4)\n",
74
  "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.8/site-packages (from transformers) (2022.1.18)\n",
 
75
  "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.8/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (4.0.1)\n",
76
  "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.8/site-packages (from packaging->datasets) (3.0.7)\n",
77
- "Requirement already satisfied: chardet<4,>=3.0.2 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (3.0.4)\n",
78
- "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (2020.12.5)\n",
79
  "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (1.25.11)\n",
 
80
  "Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (2.10)\n",
81
- "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.0)\n",
82
- "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (2.0.10)\n",
83
  "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.2)\n",
84
- "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.2)\n",
85
- "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (1.2.0)\n",
86
  "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (21.4.0)\n",
 
87
  "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (1.7.2)\n",
88
- "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.8/site-packages (from pandas->datasets) (2.8.2)\n",
 
 
89
  "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.8/site-packages (from pandas->datasets) (2021.1)\n",
90
- "Requirement already satisfied: click in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers) (8.0.3)\n",
91
  "Requirement already satisfied: six in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers) (1.15.0)\n",
92
  "Requirement already satisfied: joblib in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers) (1.1.0)\n",
 
93
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
94
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
95
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
@@ -105,7 +97,7 @@
105
  },
106
  {
107
  "cell_type": "code",
108
- "execution_count": 4,
109
  "metadata": {
110
  "colab": {
111
  "base_uri": "https://localhost:8080/"
@@ -121,24 +113,15 @@
121
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
122
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
123
  "Collecting https://github.com/kpu/kenlm/archive/master.zip\n",
124
- " Downloading https://github.com/kpu/kenlm/archive/master.zip (542 kB)\n",
125
- " |████████████████████████████████| 542 kB 3.8 MB/s \n",
126
- "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
127
  "\u001b[?25hRequirement already satisfied: pyctcdecode in /opt/conda/lib/python3.8/site-packages (0.3.0)\n",
128
- "Requirement already satisfied: hypothesis<7,>=6.14 in /opt/conda/lib/python3.8/site-packages (from pyctcdecode) (6.46.9)\n",
129
  "Requirement already satisfied: numpy<2.0.0,>=1.15.0 in /opt/conda/lib/python3.8/site-packages (from pyctcdecode) (1.19.2)\n",
 
130
  "Requirement already satisfied: pygtrie<3.0,>=2.1 in /opt/conda/lib/python3.8/site-packages (from pyctcdecode) (2.4.2)\n",
131
  "Requirement already satisfied: attrs>=19.2.0 in /opt/conda/lib/python3.8/site-packages (from hypothesis<7,>=6.14->pyctcdecode) (21.4.0)\n",
132
  "Requirement already satisfied: sortedcontainers<3.0.0,>=2.1.0 in /opt/conda/lib/python3.8/site-packages (from hypothesis<7,>=6.14->pyctcdecode) (2.4.0)\n",
133
- "Building wheels for collected packages: kenlm\n",
134
- " Building wheel for kenlm (setup.py) ... \u001b[?25ldone\n",
135
- "\u001b[?25h Created wheel for kenlm: filename=kenlm-0.0.0-cp38-cp38-linux_x86_64.whl size=2341844 sha256=7389c3819998781002180209fa8ff1711b65630ca5dc282cff4b128a9db2c0bd\n",
136
- " Stored in directory: /tmp/pip-ephem-wheel-cache-yk63c6mt/wheels/ff/08/4e/a3ddc0e786e0f3c1fcd2e7a82c4324c02fc3ae2638471406d2\n",
137
- "Successfully built kenlm\n",
138
- "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
139
- "Installing collected packages: kenlm\n",
140
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
141
- "Successfully installed kenlm-0.0.0\n",
142
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
143
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
144
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
@@ -153,7 +136,7 @@
153
  },
154
  {
155
  "cell_type": "code",
156
- "execution_count": 17,
157
  "metadata": {
158
  "colab": {
159
  "base_uri": "https://localhost:8080/",
@@ -185,7 +168,7 @@
185
  {
186
  "data": {
187
  "application/vnd.jupyter.widget-view+json": {
188
- "model_id": "c3eef7b5d70d46feaa5d3d7f1281eb82",
189
  "version_major": 2,
190
  "version_minor": 0
191
  },
@@ -205,7 +188,7 @@
205
  },
206
  {
207
  "cell_type": "code",
208
- "execution_count": 6,
209
  "metadata": {
210
  "id": "fsrpUSEBYH7g"
211
  },
@@ -216,7 +199,7 @@
216
  },
217
  {
218
  "cell_type": "code",
219
- "execution_count": 7,
220
  "metadata": {
221
  "colab": {
222
  "base_uri": "https://localhost:8080/"
@@ -229,7 +212,8 @@
229
  "name": "stdout",
230
  "output_type": "stream",
231
  "text": [
232
- "/bin/bash: sudo: command not found\n"
 
233
  ]
234
  }
235
  ],
@@ -239,7 +223,7 @@
239
  },
240
  {
241
  "cell_type": "code",
242
- "execution_count": 8,
243
  "metadata": {
244
  "colab": {
245
  "base_uri": "https://localhost:8080/"
@@ -252,16 +236,16 @@
252
  "name": "stdout",
253
  "output_type": "stream",
254
  "text": [
255
- "--2022-05-26 11:49:56-- https://kheafield.com/code/kenlm.tar.gz\n",
256
  "Resolving kheafield.com (kheafield.com)... 35.196.63.85\n",
257
  "Connecting to kheafield.com (kheafield.com)|35.196.63.85|:443... connected.\n",
258
  "HTTP request sent, awaiting response... 200 OK\n",
259
  "Length: 491888 (480K) [application/x-gzip]\n",
260
  "Saving to: ‘STDOUT’\n",
261
  "\n",
262
- "- 100%[===================>] 480.36K 799KB/s in 0.6s \n",
263
  "\n",
264
- "2022-05-26 11:49:57 (799 KB/s) - written to stdout [491888/491888]\n",
265
  "\n"
266
  ]
267
  }
@@ -757,7 +741,6 @@
757
  }
758
  ],
759
  "source": [
760
- "\n",
761
  "!kenlm/build/bin/lmplz -o 5 <\"text.txt\" > \"5gram.arpa\""
762
  ]
763
  },
@@ -815,7 +798,7 @@
815
  },
816
  {
817
  "cell_type": "code",
818
- "execution_count": 15,
819
  "metadata": {
820
  "colab": {
821
  "base_uri": "https://localhost:8080/",
@@ -892,134 +875,7 @@
892
  "id": "paV71gdAtkDC",
893
  "outputId": "c2df6859-db57-4d4a-92b0-41b54a4215bf"
894
  },
895
- "outputs": [
896
- {
897
- "data": {
898
- "application/vnd.jupyter.widget-view+json": {
899
- "model_id": "8f7c10edbad644688af3cd4e4674eac7",
900
- "version_major": 2,
901
- "version_minor": 0
902
- },
903
- "text/plain": [
904
- "Downloading: 0%| | 0.00/260 [00:00<?, ?B/s]"
905
- ]
906
- },
907
- "metadata": {},
908
- "output_type": "display_data"
909
- },
910
- {
911
- "data": {
912
- "application/vnd.jupyter.widget-view+json": {
913
- "model_id": "e2a9c7fbf0c143e3a80565e429d10095",
914
- "version_major": 2,
915
- "version_minor": 0
916
- },
917
- "text/plain": [
918
- "Downloading: 0%| | 0.00/335 [00:00<?, ?B/s]"
919
- ]
920
- },
921
- "metadata": {},
922
- "output_type": "display_data"
923
- },
924
- {
925
- "data": {
926
- "application/vnd.jupyter.widget-view+json": {
927
- "model_id": "d038c5533a9f4167a48eb3e70ebd156a",
928
- "version_major": 2,
929
- "version_minor": 0
930
- },
931
- "text/plain": [
932
- "Downloading: 0%| | 0.00/301 [00:00<?, ?B/s]"
933
- ]
934
- },
935
- "metadata": {},
936
- "output_type": "display_data"
937
- },
938
- {
939
- "data": {
940
- "application/vnd.jupyter.widget-view+json": {
941
- "model_id": "cdb0e7da895b4f449599544001e57d12",
942
- "version_major": 2,
943
- "version_minor": 0
944
- },
945
- "text/plain": [
946
- "Downloading: 0%| | 0.00/23.0 [00:00<?, ?B/s]"
947
- ]
948
- },
949
- "metadata": {},
950
- "output_type": "display_data"
951
- },
952
- {
953
- "data": {
954
- "application/vnd.jupyter.widget-view+json": {
955
- "model_id": "ada56f3555ec48a8a22d1d5a2ae81f5f",
956
- "version_major": 2,
957
- "version_minor": 0
958
- },
959
- "text/plain": [
960
- "Downloading: 0%| | 0.00/5.20k [00:00<?, ?B/s]"
961
- ]
962
- },
963
- "metadata": {},
964
- "output_type": "display_data"
965
- },
966
- {
967
- "data": {
968
- "application/vnd.jupyter.widget-view+json": {
969
- "model_id": "f8314566f4154fa990c72e90d2ea7d9a",
970
- "version_major": 2,
971
- "version_minor": 0
972
- },
973
- "text/plain": [
974
- "Downloading: 0%| | 0.00/223 [00:00<?, ?B/s]"
975
- ]
976
- },
977
- "metadata": {},
978
- "output_type": "display_data"
979
- },
980
- {
981
- "data": {
982
- "application/vnd.jupyter.widget-view+json": {
983
- "model_id": "30104479e44c4532b7dab3babcece67a",
984
- "version_major": 2,
985
- "version_minor": 0
986
- },
987
- "text/plain": [
988
- "Downloading: 0%| | 0.00/2.19G [00:00<?, ?B/s]"
989
- ]
990
- },
991
- "metadata": {},
992
- "output_type": "display_data"
993
- },
994
- {
995
- "data": {
996
- "application/vnd.jupyter.widget-view+json": {
997
- "model_id": "57e1b10e91024d00af578d9e175b3ae8",
998
- "version_major": 2,
999
- "version_minor": 0
1000
- },
1001
- "text/plain": [
1002
- "Downloading: 0%| | 0.00/78.0 [00:00<?, ?B/s]"
1003
- ]
1004
- },
1005
- "metadata": {},
1006
- "output_type": "display_data"
1007
- },
1008
- {
1009
- "data": {
1010
- "application/vnd.jupyter.widget-view+json": {
1011
- "model_id": "e48e0eb6e25c4dbb9a53d45124c80eeb",
1012
- "version_major": 2,
1013
- "version_minor": 0
1014
- },
1015
- "text/plain": [
1016
- "Downloading: 0%| | 0.00/6.03M [00:00<?, ?B/s]"
1017
- ]
1018
- },
1019
- "metadata": {},
1020
- "output_type": "display_data"
1021
- }
1022
- ],
1023
  "source": [
1024
  "from transformers import AutoProcessor\n",
1025
  "\n",
@@ -1028,7 +884,7 @@
1028
  },
1029
  {
1030
  "cell_type": "code",
1031
- "execution_count": 16,
1032
  "metadata": {
1033
  "colab": {
1034
  "base_uri": "https://localhost:8080/",
@@ -1088,92 +944,8 @@
1088
  "name": "stderr",
1089
  "output_type": "stream",
1090
  "text": [
1091
- "Cloning https://huggingface.co/marinone94/xls-r-300m-sv-robust into local empty directory.\n"
1092
  ]
1093
- },
1094
- {
1095
- "data": {
1096
- "application/vnd.jupyter.widget-view+json": {
1097
- "model_id": "4d79ace826ea4922a278dc33e8362513",
1098
- "version_major": 2,
1099
- "version_minor": 0
1100
- },
1101
- "text/plain": [
1102
- "Download file language_model/5gram.bin: 0%| | 15.6k/2.04G [00:00<?, ?B/s]"
1103
- ]
1104
- },
1105
- "metadata": {},
1106
- "output_type": "display_data"
1107
- },
1108
- {
1109
- "data": {
1110
- "application/vnd.jupyter.widget-view+json": {
1111
- "model_id": "caae4b13e94d491587e027c710efe8fb",
1112
- "version_major": 2,
1113
- "version_minor": 0
1114
- },
1115
- "text/plain": [
1116
- "Download file training_args.bin: 62%|######1 | 1.84k/2.98k [00:00<?, ?B/s]"
1117
- ]
1118
- },
1119
- "metadata": {},
1120
- "output_type": "display_data"
1121
- },
1122
- {
1123
- "data": {
1124
- "application/vnd.jupyter.widget-view+json": {
1125
- "model_id": "58d1adfe09944bfe8e0e2a8588abed90",
1126
- "version_major": 2,
1127
- "version_minor": 0
1128
- },
1129
- "text/plain": [
1130
- "Download file pytorch_model.bin: 0%| | 3.58k/1.18G [00:00<?, ?B/s]"
1131
- ]
1132
- },
1133
- "metadata": {},
1134
- "output_type": "display_data"
1135
- },
1136
- {
1137
- "data": {
1138
- "application/vnd.jupyter.widget-view+json": {
1139
- "model_id": "a209b62ddcea4cd9968172391ac53d59",
1140
- "version_major": 2,
1141
- "version_minor": 0
1142
- },
1143
- "text/plain": [
1144
- "Clean file training_args.bin: 34%|###3 | 1.00k/2.98k [00:00<?, ?B/s]"
1145
- ]
1146
- },
1147
- "metadata": {},
1148
- "output_type": "display_data"
1149
- },
1150
- {
1151
- "data": {
1152
- "application/vnd.jupyter.widget-view+json": {
1153
- "model_id": "ecf7be42d7c14242bb70bfacb59e7d1c",
1154
- "version_major": 2,
1155
- "version_minor": 0
1156
- },
1157
- "text/plain": [
1158
- "Clean file pytorch_model.bin: 0%| | 1.00k/1.18G [00:00<?, ?B/s]"
1159
- ]
1160
- },
1161
- "metadata": {},
1162
- "output_type": "display_data"
1163
- },
1164
- {
1165
- "data": {
1166
- "application/vnd.jupyter.widget-view+json": {
1167
- "model_id": "aec000d36ed74e008c56f924e8d07d34",
1168
- "version_major": 2,
1169
- "version_minor": 0
1170
- },
1171
- "text/plain": [
1172
- "Clean file language_model/5gram.bin: 0%| | 1.00k/2.04G [00:00<?, ?B/s]"
1173
- ]
1174
- },
1175
- "metadata": {},
1176
- "output_type": "display_data"
1177
  }
1178
  ],
1179
  "source": [
@@ -1184,7 +956,7 @@
1184
  },
1185
  {
1186
  "cell_type": "code",
1187
- "execution_count": 18,
1188
  "metadata": {
1189
  "id": "ZKwKxMoitoGS"
1190
  },
@@ -1196,7 +968,7 @@
1196
  },
1197
  {
1198
  "cell_type": "code",
1199
- "execution_count": 19,
1200
  "metadata": {
1201
  "colab": {
1202
  "base_uri": "https://localhost:8080/"
@@ -1209,8 +981,26 @@
1209
  "name": "stderr",
1210
  "output_type": "stream",
1211
  "text": [
1212
- "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
1213
- "Unigrams and labels don't seem to agree.\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1214
  ]
1215
  }
1216
  ],
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 34,
6
  "metadata": {
7
  "id": "YP3vVkqYUpLx"
8
  },
 
14
  },
15
  {
16
  "cell_type": "code",
17
+ "execution_count": 35,
18
  "metadata": {
19
  "colab": {
20
  "base_uri": "https://localhost:8080/"
 
22
  "id": "AWly9SmkgSwE",
23
  "outputId": "8af190ed-5037-4e3b-b91b-b5286d8e0888"
24
  },
25
+ "outputs": [],
 
 
 
 
 
 
 
 
26
  "source": [
27
  "!sudo apt-get install git-lfs tree"
28
  ]
29
  },
30
  {
31
  "cell_type": "code",
32
+ "execution_count": 36,
33
  "metadata": {
34
  "colab": {
35
  "base_uri": "https://localhost:8080/"
 
46
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
47
  "Requirement already satisfied: datasets in /workspace/.local/lib/python3.8/site-packages (2.2.2)\n",
48
  "Requirement already satisfied: transformers in /opt/conda/lib/python3.8/site-packages (4.17.0.dev0)\n",
49
+ "Requirement already satisfied: aiohttp in /opt/conda/lib/python3.8/site-packages (from datasets) (3.8.1)\n",
50
+ "Requirement already satisfied: dill<0.3.5 in /opt/conda/lib/python3.8/site-packages (from datasets) (0.3.4)\n",
 
 
 
51
  "Requirement already satisfied: tqdm>=4.62.1 in /opt/conda/lib/python3.8/site-packages (from datasets) (4.62.3)\n",
52
+ "Requirement already satisfied: xxhash in /opt/conda/lib/python3.8/site-packages (from datasets) (2.0.2)\n",
53
+ "Requirement already satisfied: responses<0.19 in /opt/conda/lib/python3.8/site-packages (from datasets) (0.18.0)\n",
54
  "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.8/site-packages (from datasets) (1.19.2)\n",
 
55
  "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (0.4.0)\n",
56
+ "Requirement already satisfied: packaging in /opt/conda/lib/python3.8/site-packages (from datasets) (21.3)\n",
57
+ "Requirement already satisfied: requests>=2.19.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (2.24.0)\n",
58
+ "Requirement already satisfied: fsspec[http]>=2021.05.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (2022.1.0)\n",
59
+ "Requirement already satisfied: multiprocess in /opt/conda/lib/python3.8/site-packages (from datasets) (0.70.12.2)\n",
60
+ "Requirement already satisfied: pandas in /opt/conda/lib/python3.8/site-packages (from datasets) (1.4.0)\n",
61
+ "Requirement already satisfied: pyarrow>=6.0.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (6.0.1)\n",
62
  "Requirement already satisfied: sacremoses in /opt/conda/lib/python3.8/site-packages (from transformers) (0.0.47)\n",
 
63
  "Requirement already satisfied: filelock in /opt/conda/lib/python3.8/site-packages (from transformers) (3.0.12)\n",
64
  "Requirement already satisfied: tokenizers!=0.11.3,>=0.10.1 in /opt/conda/lib/python3.8/site-packages (from transformers) (0.11.4)\n",
65
  "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.8/site-packages (from transformers) (2022.1.18)\n",
66
+ "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.8/site-packages (from transformers) (5.4.1)\n",
67
  "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.8/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (4.0.1)\n",
68
  "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.8/site-packages (from packaging->datasets) (3.0.7)\n",
 
 
69
  "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (1.25.11)\n",
70
+ "Requirement already satisfied: chardet<4,>=3.0.2 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (3.0.4)\n",
71
  "Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (2.10)\n",
72
+ "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (2020.12.5)\n",
 
73
  "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.2)\n",
 
 
74
  "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (21.4.0)\n",
75
+ "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (2.0.10)\n",
76
  "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (1.7.2)\n",
77
+ "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.0)\n",
78
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.2)\n",
79
+ "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (1.2.0)\n",
80
  "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.8/site-packages (from pandas->datasets) (2021.1)\n",
81
+ "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.8/site-packages (from pandas->datasets) (2.8.2)\n",
82
  "Requirement already satisfied: six in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers) (1.15.0)\n",
83
  "Requirement already satisfied: joblib in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers) (1.1.0)\n",
84
+ "Requirement already satisfied: click in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers) (8.0.3)\n",
85
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
86
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
87
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
 
97
  },
98
  {
99
  "cell_type": "code",
100
+ "execution_count": 37,
101
  "metadata": {
102
  "colab": {
103
  "base_uri": "https://localhost:8080/"
 
113
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
114
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
115
  "Collecting https://github.com/kpu/kenlm/archive/master.zip\n",
116
+ " Using cached https://github.com/kpu/kenlm/archive/master.zip (542 kB)\n",
117
+ " Preparing metadata (setup.py) ... \u001b[?25ldone\n",
 
118
  "\u001b[?25hRequirement already satisfied: pyctcdecode in /opt/conda/lib/python3.8/site-packages (0.3.0)\n",
 
119
  "Requirement already satisfied: numpy<2.0.0,>=1.15.0 in /opt/conda/lib/python3.8/site-packages (from pyctcdecode) (1.19.2)\n",
120
+ "Requirement already satisfied: hypothesis<7,>=6.14 in /opt/conda/lib/python3.8/site-packages (from pyctcdecode) (6.46.9)\n",
121
  "Requirement already satisfied: pygtrie<3.0,>=2.1 in /opt/conda/lib/python3.8/site-packages (from pyctcdecode) (2.4.2)\n",
122
  "Requirement already satisfied: attrs>=19.2.0 in /opt/conda/lib/python3.8/site-packages (from hypothesis<7,>=6.14->pyctcdecode) (21.4.0)\n",
123
  "Requirement already satisfied: sortedcontainers<3.0.0,>=2.1.0 in /opt/conda/lib/python3.8/site-packages (from hypothesis<7,>=6.14->pyctcdecode) (2.4.0)\n",
 
 
 
 
 
 
 
124
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
 
125
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
126
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
127
  "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n",
 
136
  },
137
  {
138
  "cell_type": "code",
139
+ "execution_count": 38,
140
  "metadata": {
141
  "colab": {
142
  "base_uri": "https://localhost:8080/",
 
168
  {
169
  "data": {
170
  "application/vnd.jupyter.widget-view+json": {
171
+ "model_id": "fb1fe87003eb4d6b936693d8dce9066e",
172
  "version_major": 2,
173
  "version_minor": 0
174
  },
 
188
  },
189
  {
190
  "cell_type": "code",
191
+ "execution_count": 39,
192
  "metadata": {
193
  "id": "fsrpUSEBYH7g"
194
  },
 
199
  },
200
  {
201
  "cell_type": "code",
202
+ "execution_count": 40,
203
  "metadata": {
204
  "colab": {
205
  "base_uri": "https://localhost:8080/"
 
212
  "name": "stdout",
213
  "output_type": "stream",
214
  "text": [
215
+ "E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)\n",
216
+ "E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?\n"
217
  ]
218
  }
219
  ],
 
223
  },
224
  {
225
  "cell_type": "code",
226
+ "execution_count": 28,
227
  "metadata": {
228
  "colab": {
229
  "base_uri": "https://localhost:8080/"
 
236
  "name": "stdout",
237
  "output_type": "stream",
238
  "text": [
239
+ "--2022-05-26 13:39:11-- https://kheafield.com/code/kenlm.tar.gz\n",
240
  "Resolving kheafield.com (kheafield.com)... 35.196.63.85\n",
241
  "Connecting to kheafield.com (kheafield.com)|35.196.63.85|:443... connected.\n",
242
  "HTTP request sent, awaiting response... 200 OK\n",
243
  "Length: 491888 (480K) [application/x-gzip]\n",
244
  "Saving to: ‘STDOUT’\n",
245
  "\n",
246
+ "- 100%[===================>] 480.36K 845KB/s in 0.6s \n",
247
  "\n",
248
+ "2022-05-26 13:39:12 (845 KB/s) - written to stdout [491888/491888]\n",
249
  "\n"
250
  ]
251
  }
 
741
  }
742
  ],
743
  "source": [
 
744
  "!kenlm/build/bin/lmplz -o 5 <\"text.txt\" > \"5gram.arpa\""
745
  ]
746
  },
 
798
  },
799
  {
800
  "cell_type": "code",
801
+ "execution_count": null,
802
  "metadata": {
803
  "colab": {
804
  "base_uri": "https://localhost:8080/",
 
875
  "id": "paV71gdAtkDC",
876
  "outputId": "c2df6859-db57-4d4a-92b0-41b54a4215bf"
877
  },
878
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
879
  "source": [
880
  "from transformers import AutoProcessor\n",
881
  "\n",
 
884
  },
885
  {
886
  "cell_type": "code",
887
+ "execution_count": 18,
888
  "metadata": {
889
  "colab": {
890
  "base_uri": "https://localhost:8080/",
 
944
  "name": "stderr",
945
  "output_type": "stream",
946
  "text": [
947
+ "/workspace/xls-r-300m-sv-robust/xls-r-300m-sv-robust is already a clone of https://huggingface.co/marinone94/xls-r-300m-sv-robust. Make sure you pull the latest changes with `repo.git_pull()`.\n"
948
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
949
  }
950
  ],
951
  "source": [
 
956
  },
957
  {
958
  "cell_type": "code",
959
+ "execution_count": 19,
960
  "metadata": {
961
  "id": "ZKwKxMoitoGS"
962
  },
 
968
  },
969
  {
970
  "cell_type": "code",
971
+ "execution_count": 20,
972
  "metadata": {
973
  "colab": {
974
  "base_uri": "https://localhost:8080/"
 
981
  "name": "stderr",
982
  "output_type": "stream",
983
  "text": [
984
+ "Loading the LM will be faster if you build a binary file.\n",
985
+ "Reading /workspace/xls-r-300m-sv-robust/5gram_correct.arpa\n",
986
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n"
987
+ ]
988
+ },
989
+ {
990
+ "ename": "OSError",
991
+ "evalue": "Cannot read model '5gram_correct.arpa' (End of file Byte: 0)",
992
+ "output_type": "error",
993
+ "traceback": [
994
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
995
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
996
+ "File \u001b[0;32mkenlm.pyx:139\u001b[0m, in \u001b[0;36mkenlm.Model.__init__\u001b[0;34m()\u001b[0m\n",
997
+ "\u001b[0;31mRuntimeError\u001b[0m: End of file Byte: 0",
998
+ "\nThe above exception was the direct cause of the following exception:\n",
999
+ "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
1000
+ "Input \u001b[0;32mIn [20]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpyctcdecode\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m build_ctcdecoder\n\u001b[0;32m----> 3\u001b[0m decoder \u001b[38;5;241m=\u001b[39m \u001b[43mbuild_ctcdecoder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msorted_vocab_dict\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkeys\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mkenlm_model_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m5gram_correct.arpa\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43malpha\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.5\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mbeta\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1.5\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m)\u001b[49m\n",
1001
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/pyctcdecode/decoder.py:790\u001b[0m, in \u001b[0;36mbuild_ctcdecoder\u001b[0;34m(labels, kenlm_model_path, unigrams, alpha, beta, unk_score_offset, lm_score_boundary)\u001b[0m\n\u001b[1;32m 767\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mbuild_ctcdecoder\u001b[39m(\n\u001b[1;32m 768\u001b[0m labels: List[\u001b[38;5;28mstr\u001b[39m],\n\u001b[1;32m 769\u001b[0m kenlm_model_path: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 774\u001b[0m lm_score_boundary: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m DEFAULT_SCORE_LM_BOUNDARY,\n\u001b[1;32m 775\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m BeamSearchDecoderCTC:\n\u001b[1;32m 776\u001b[0m \u001b[38;5;124;03m\"\"\"Build a BeamSearchDecoderCTC instance with main functionality.\u001b[39;00m\n\u001b[1;32m 777\u001b[0m \n\u001b[1;32m 778\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 788\u001b[0m \u001b[38;5;124;03m instance of BeamSearchDecoderCTC\u001b[39;00m\n\u001b[1;32m 789\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 790\u001b[0m kenlm_model \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01mif\u001b[39;00m kenlm_model_path \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[43mkenlm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mModel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkenlm_model_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 791\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m kenlm_model_path \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m kenlm_model_path\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.arpa\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 792\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUsing arpa instead of binary LM file, decoder instantiation might be slow.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
1002
+ "File \u001b[0;32mkenlm.pyx:142\u001b[0m, in \u001b[0;36mkenlm.Model.__init__\u001b[0;34m()\u001b[0m\n",
1003
+ "\u001b[0;31mOSError\u001b[0m: Cannot read model '5gram_correct.arpa' (End of file Byte: 0)"
1004
  ]
1005
  }
1006
  ],