Spaces:

zaidmehdi
/

arabic-dialect-classifier

Sleeping

App Files Files Community

zaidmehdi committed on Feb 25

Commit

370a710

•

1 Parent(s): ba240c4

training random forest

Browse files

Files changed (1) hide show

src/classifier.ipynb +64 -7

src/classifier.ipynb CHANGED Viewed

@@ -10,7 +10,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -22,6 +22,7 @@
     "import pandas as pd\n",
     "from sklearn.ensemble import RandomForestClassifier\n",
     "from sklearn.linear_model import LogisticRegression\n",
     "import torch\n",
     "from transformers import AutoModel, AutoTokenizer"
    ]
@@ -575,7 +576,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We can try different models."
    ]
   },
   {
@@ -623,19 +626,73 @@
    ]
   },
   {
-   "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "For the Random forest, we can do a grid search to optimize the hyperparameters. We will use a 5-fold cross validation strategy."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
-   "outputs": [],
    "source": [
-    "rf_model = RandomForestClassifier()\n"
    ]
   },
   {

   },
   {
    "cell_type": "code",
+   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
     "from sklearn.ensemble import RandomForestClassifier\n",
     "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import RandomizedSearchCV\n",
     "import torch\n",
     "from transformers import AutoModel, AutoTokenizer"
    ]
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "We can try different models.  \n",
+    "\n",
+    "For the ensemble models, we can do a randomized or grid search to tune the hyperparameters. We will use a 5-fold cross-validation strategy and optimize for the macro-averaged F1 score (because we want to give equal importance to each class, regardless of how many observations each one has)."
    ]
   },
   {
    ]
   },
   {
+   "cell_type": "code",
+   "execution_count": 39,
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style>#sk-container-id-3 {color: black;}#sk-container-id-3 pre{padding: 0;}#sk-container-id-3 div.sk-toggleable {background-color: white;}#sk-container-id-3 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-3 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-3 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-3 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-3 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-3 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-3 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-3 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-3 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-3 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-3 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-3 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-3 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-3 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-3 div.sk-item {position: relative;z-index: 1;}#sk-container-id-3 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-3 div.sk-item::before, #sk-container-id-3 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-3 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-3 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-3 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-3 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-3 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-3 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-3 div.sk-label-container {text-align: center;}#sk-container-id-3 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-3 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-3\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomizedSearchCV(cv=5,\n",
+       "                   estimator=RandomForestClassifier(class_weight=&#x27;balanced&#x27;,\n",
+       "                                                    random_state=2024),\n",
+       "                   n_iter=20,\n",
+       "                   param_distributions={&#x27;max_depth&#x27;: [3, 4, 5, 6, 7, 8],\n",
+       "                                        &#x27;n_estimators&#x27;: [100, 150, 200, 250,\n",
+       "                                                         300, 400, 500]},\n",
+       "                   scoring=&#x27;f1_macro&#x27;)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" ><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">RandomizedSearchCV</label><div class=\"sk-toggleable__content\"><pre>RandomizedSearchCV(cv=5,\n",
+       "                   estimator=RandomForestClassifier(class_weight=&#x27;balanced&#x27;,\n",
+       "                                                    random_state=2024),\n",
+       "                   n_iter=20,\n",
+       "                   param_distributions={&#x27;max_depth&#x27;: [3, 4, 5, 6, 7, 8],\n",
+       "                                        &#x27;n_estimators&#x27;: [100, 150, 200, 250,\n",
+       "                                                         300, 400, 500]},\n",
+       "                   scoring=&#x27;f1_macro&#x27;)</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" ><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">estimator: RandomForestClassifier</label><div class=\"sk-toggleable__content\"><pre>RandomForestClassifier(class_weight=&#x27;balanced&#x27;, random_state=2024)</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" ><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">RandomForestClassifier</label><div class=\"sk-toggleable__content\"><pre>RandomForestClassifier(class_weight=&#x27;balanced&#x27;, random_state=2024)</pre></div></div></div></div></div></div></div></div></div></div>"
+      ],
+      "text/plain": [
+       "RandomizedSearchCV(cv=5,\n",
+       "                   estimator=RandomForestClassifier(class_weight='balanced',\n",
+       "                                                    random_state=2024),\n",
+       "                   n_iter=20,\n",
+       "                   param_distributions={'max_depth': [3, 4, 5, 6, 7, 8],\n",
+       "                                        'n_estimators': [100, 150, 200, 250,\n",
+       "                                                         300, 400, 500]},\n",
+       "                   scoring='f1_macro')"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
+    "# Seed both the forest and the search so that Restart & Run All reproduces the same result.\n",
+    "rf_model = RandomForestClassifier(class_weight=\"balanced\", random_state=2024)\n",
+    "parameters = {\n",
+    "    \"n_estimators\": [100, 150, 200, 250, 300, 400, 500],\n",
+    "    \"max_depth\": [3, 4, 5, 6, 7, 8]\n",
+    "}\n",
+    "# n_iter=20 samples 20 of the 42 grid combinations at random; without random_state on the\n",
+    "# search, a different subset (and possibly a different best model) is drawn on every run.\n",
+    "rf_search = RandomizedSearchCV(estimator=rf_model, param_distributions=parameters,\n",
+    "                               scoring=\"f1_macro\", cv=5, n_iter=20, random_state=2024)\n",
+    "rf_search.fit(X_train, y_train)"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 40,
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Best Parameters: {'n_estimators': 400, 'max_depth': 8}\n",
+      "Best Score: 0.15591886021384346\n"
+     ]
+    }
+   ],
    "source": [
+    "# Summarise the search: the winning hyperparameters and their mean 5-fold macro-F1.\n",
+    "for label, value in (\n",
+    "    (\"Best Parameters:\", rf_search.best_params_),\n",
+    "    (\"Best Score:\", rf_search.best_score_),\n",
+    "):\n",
+    "    print(label, value)"
    ]
   },
   {