zaidmehdi committed on
Commit
6c50497
•
1 Parent(s): fe78e9e

defining structure and reading data

Browse files
Files changed (1) hide show
  1. src/classifier.ipynb +252 -3
src/classifier.ipynb CHANGED
@@ -1,16 +1,265 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
- "execution_count": null,
6
  "metadata": {},
7
  "outputs": [],
8
- "source": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  }
10
  ],
11
  "metadata": {
 
 
 
 
 
12
  "language_info": {
13
- "name": "python"
 
 
 
 
 
 
 
 
 
14
  }
15
  },
16
  "nbformat": 4,
 
1
  {
2
  "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Arabic Dialect Classifier\n",
8
+ "This notebook contains the training of the classifier model."
9
+ ]
10
+ },
11
  {
12
  "cell_type": "code",
13
+ "execution_count": 1,
14
  "metadata": {},
15
  "outputs": [],
16
+ "source": [
17
+ "import pandas as pd"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "markdown",
22
+ "metadata": {},
23
+ "source": [
24
+ "## 1. Exploring the Dataset"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 5,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "df_train = pd.read_csv(\"../data/DA_train_labeled.tsv\", sep=\"\\t\")\n",
34
+ "df_test = pd.read_csv(\"../data/DA_dev_labeled.tsv\", sep=\"\\t\")"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 6,
40
+ "metadata": {},
41
+ "outputs": [
42
+ {
43
+ "data": {
44
+ "text/html": [
45
+ "<div>\n",
46
+ "<style scoped>\n",
47
+ " .dataframe tbody tr th:only-of-type {\n",
48
+ " vertical-align: middle;\n",
49
+ " }\n",
50
+ "\n",
51
+ " .dataframe tbody tr th {\n",
52
+ " vertical-align: top;\n",
53
+ " }\n",
54
+ "\n",
55
+ " .dataframe thead th {\n",
56
+ " text-align: right;\n",
57
+ " }\n",
58
+ "</style>\n",
59
+ "<table border=\"1\" class=\"dataframe\">\n",
60
+ " <thead>\n",
61
+ " <tr style=\"text-align: right;\">\n",
62
+ " <th></th>\n",
63
+ " <th>#1_tweetid</th>\n",
64
+ " <th>#2_tweet</th>\n",
65
+ " <th>#3_country_label</th>\n",
66
+ " <th>#4_province_label</th>\n",
67
+ " </tr>\n",
68
+ " </thead>\n",
69
+ " <tbody>\n",
70
+ " <tr>\n",
71
+ " <th>0</th>\n",
72
+ " <td>TRAIN_0</td>\n",
73
+ " <td>حاجة حلوة اكيد</td>\n",
74
+ " <td>Egypt</td>\n",
75
+ " <td>eg_Faiyum</td>\n",
76
+ " </tr>\n",
77
+ " <tr>\n",
78
+ " <th>1</th>\n",
79
+ " <td>TRAIN_1</td>\n",
80
+ " <td>عم بشتغلوا للشعب الاميركي اما نحن يكذبوا ويغشو...</td>\n",
81
+ " <td>Iraq</td>\n",
82
+ " <td>iq_Dihok</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>2</th>\n",
86
+ " <td>TRAIN_2</td>\n",
87
+ " <td>ابشر طال عمرك</td>\n",
88
+ " <td>Saudi_Arabia</td>\n",
89
+ " <td>sa_Ha'il</td>\n",
90
+ " </tr>\n",
91
+ " <tr>\n",
92
+ " <th>3</th>\n",
93
+ " <td>TRAIN_3</td>\n",
94
+ " <td>منطق 2017: أنا والغريب علي إبن عمي وأنا والغري...</td>\n",
95
+ " <td>Mauritania</td>\n",
96
+ " <td>mr_Nouakchott</td>\n",
97
+ " </tr>\n",
98
+ " <tr>\n",
99
+ " <th>4</th>\n",
100
+ " <td>TRAIN_4</td>\n",
101
+ " <td>شهرين وتروح والباقي غير صيف ملينا</td>\n",
102
+ " <td>Algeria</td>\n",
103
+ " <td>dz_El-Oued</td>\n",
104
+ " </tr>\n",
105
+ " </tbody>\n",
106
+ "</table>\n",
107
+ "</div>"
108
+ ],
109
+ "text/plain": [
110
+ " #1_tweetid #2_tweet \\\n",
111
+ "0 TRAIN_0 حاجة حلوة اكيد \n",
112
+ "1 TRAIN_1 عم بشتغلوا للشعب الاميركي اما نحن يكذبوا ويغشو... \n",
113
+ "2 TRAIN_2 ابشر طال عمرك \n",
114
+ "3 TRAIN_3 منطق 2017: أنا والغريب علي إبن عمي وأنا والغري... \n",
115
+ "4 TRAIN_4 شهرين وتروح والباقي غير صيف ملينا \n",
116
+ "\n",
117
+ " #3_country_label #4_province_label \n",
118
+ "0 Egypt eg_Faiyum \n",
119
+ "1 Iraq iq_Dihok \n",
120
+ "2 Saudi_Arabia sa_Ha'il \n",
121
+ "3 Mauritania mr_Nouakchott \n",
122
+ "4 Algeria dz_El-Oued "
123
+ ]
124
+ },
125
+ "execution_count": 6,
126
+ "metadata": {},
127
+ "output_type": "execute_result"
128
+ }
129
+ ],
130
+ "source": [
131
+ "df_train.head()"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": 7,
137
+ "metadata": {},
138
+ "outputs": [
139
+ {
140
+ "data": {
141
+ "text/html": [
142
+ "<div>\n",
143
+ "<style scoped>\n",
144
+ " .dataframe tbody tr th:only-of-type {\n",
145
+ " vertical-align: middle;\n",
146
+ " }\n",
147
+ "\n",
148
+ " .dataframe tbody tr th {\n",
149
+ " vertical-align: top;\n",
150
+ " }\n",
151
+ "\n",
152
+ " .dataframe thead th {\n",
153
+ " text-align: right;\n",
154
+ " }\n",
155
+ "</style>\n",
156
+ "<table border=\"1\" class=\"dataframe\">\n",
157
+ " <thead>\n",
158
+ " <tr style=\"text-align: right;\">\n",
159
+ " <th></th>\n",
160
+ " <th>#1_tweetid</th>\n",
161
+ " <th>#2_tweet</th>\n",
162
+ " <th>#3_country_label</th>\n",
163
+ " <th>#4_province_label</th>\n",
164
+ " </tr>\n",
165
+ " </thead>\n",
166
+ " <tbody>\n",
167
+ " <tr>\n",
168
+ " <th>0</th>\n",
169
+ " <td>DEV_0</td>\n",
170
+ " <td>قولنا اون لاين لا يا علي اون لاين لا</td>\n",
171
+ " <td>Egypt</td>\n",
172
+ " <td>eg_Alexandria</td>\n",
173
+ " </tr>\n",
174
+ " <tr>\n",
175
+ " <th>1</th>\n",
176
+ " <td>DEV_1</td>\n",
177
+ " <td>ههههه بايخه ههههه URL  …</td>\n",
178
+ " <td>Oman</td>\n",
179
+ " <td>om_Muscat</td>\n",
180
+ " </tr>\n",
181
+ " <tr>\n",
182
+ " <th>2</th>\n",
183
+ " <td>DEV_2</td>\n",
184
+ " <td>ربنا يخليك يا دوك ولك المثل :D</td>\n",
185
+ " <td>Lebanon</td>\n",
186
+ " <td>lb_South-Lebanon</td>\n",
187
+ " </tr>\n",
188
+ " <tr>\n",
189
+ " <th>3</th>\n",
190
+ " <td>DEV_3</td>\n",
191
+ " <td>#اوامر_ملكيه ياشباب اي واحد فيكم عنده شي يذكره...</td>\n",
192
+ " <td>Syria</td>\n",
193
+ " <td>sy_Damascus-City</td>\n",
194
+ " </tr>\n",
195
+ " <tr>\n",
196
+ " <th>4</th>\n",
197
+ " <td>DEV_4</td>\n",
198
+ " <td>شد عالخط حتى هيا اكويسه</td>\n",
199
+ " <td>Libya</td>\n",
200
+ " <td>ly_Misrata</td>\n",
201
+ " </tr>\n",
202
+ " </tbody>\n",
203
+ "</table>\n",
204
+ "</div>"
205
+ ],
206
+ "text/plain": [
207
+ " #1_tweetid #2_tweet \\\n",
208
+ "0 DEV_0 قولنا اون لاين لا يا علي اون لاين لا \n",
208
+ "1 DEV_1 ههههه بايخه ههههه URL  … \n",
209
+ "2 DEV_2 ربنا يخليك يا دوك ولك المثل :D \n",
210
+ "3 DEV_3 #اوامر_ملكيه ياشباب اي واحد فيكم عنده شي يذكره... \n",
211
+ "4 DEV_4 شد عالخط حتى هيا اكويسه \n",
213
+ "\n",
214
+ " #3_country_label #4_province_label \n",
215
+ "0 Egypt eg_Alexandria \n",
216
+ "1 Oman om_Muscat \n",
217
+ "2 Lebanon lb_South-Lebanon \n",
218
+ "3 Syria sy_Damascus-City \n",
219
+ "4 Libya ly_Misrata "
220
+ ]
221
+ },
222
+ "execution_count": 7,
223
+ "metadata": {},
224
+ "output_type": "execute_result"
225
+ }
226
+ ],
227
+ "source": [
228
+ "df_test.head()"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "markdown",
233
+ "metadata": {},
234
+ "source": [
235
+ "## 2. Training the Classifier"
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "markdown",
240
+ "metadata": {},
241
+ "source": [
242
+ "## 3. Evaluating the Performance"
243
+ ]
244
  }
245
  ],
246
  "metadata": {
247
+ "kernelspec": {
248
+ "display_name": "adc",
249
+ "language": "python",
250
+ "name": "python3"
251
+ },
252
  "language_info": {
253
+ "codemirror_mode": {
254
+ "name": "ipython",
255
+ "version": 3
256
+ },
257
+ "file_extension": ".py",
258
+ "mimetype": "text/x-python",
259
+ "name": "python",
260
+ "nbconvert_exporter": "python",
261
+ "pygments_lexer": "ipython3",
262
+ "version": "3.11.7"
263
  }
264
  },
265
  "nbformat": 4,