Spaces:

zaidmehdi
/

arabic-dialect-classifier

Sleeping

App Files Files Community

zaidmehdi committed on Feb 26

Commit

cf5faeb

•

1 Parent(s): 370a710

grid search for xgboost

Browse files

Files changed (1) hide show

src/classifier.ipynb +83 -5

src/classifier.ipynb CHANGED Viewed

@@ -10,7 +10,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -23,8 +23,10 @@
     "from sklearn.ensemble import RandomForestClassifier\n",
     "from sklearn.linear_model import LogisticRegression\n",
     "from sklearn.model_selection import RandomizedSearchCV\n",
     "import torch\n",
-    "from transformers import AutoModel, AutoTokenizer"
    ]
   },
   {
@@ -533,6 +535,16 @@
     "    pickle.dump(data_hidden, f)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -549,7 +561,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
    "metadata": {},
    "outputs": [
     {
@@ -558,7 +570,7 @@
        "((21000, 768), (21000,))"
       ]
      },
-     "execution_count": 28,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -695,6 +707,72 @@
     "print(\"Best Score:\", rf_search.best_score_)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -719,7 +797,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.7"
   }
  },
  "nbformat": 4,

   },
   {
    "cell_type": "code",
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.ensemble import RandomForestClassifier\n",
     "from sklearn.linear_model import LogisticRegression\n",
     "from sklearn.model_selection import RandomizedSearchCV\n",
+    "from sklearn.preprocessing import LabelEncoder\n",
     "import torch\n",
+    "from transformers import AutoModel, AutoTokenizer\n",
+    "import xgboost as xgb"
    ]
   },
   {
     "    pickle.dump(data_hidden, f)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"../data/data_hidden.pkl\", \"rb\") as f:\n",
+    "    data_hidden = pickle.load(f)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
        "((21000, 768), (21000,))"
       ]
      },
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
     "print(\"Best Score:\", rf_search.best_score_)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2.3.3 XGBoost"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For XGBoost, we first need to encode the target variable."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "label_encoder = LabelEncoder()\n",
+    "y_train_encoded = label_encoder.fit_transform(y_train)\n",
+    "y_test_encoded = label_encoder.transform(y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Seed both the estimator and the search so the sampled candidates and the best score are reproducible.\n",
+    "xgb_model = xgb.XGBClassifier(device=\"cuda\", random_state=2024)\n",
+    "parameters = {\n",
+    "    \"n_estimators\" : [100, 150, 200, 300, 400, 450, 500],\n",
+    "    \"max_depth\" : [3, 4, 5, 6, 7, 8],\n",
+    "    \"learning_rate\": [0.1, 0.05, 0.01, 0.005, 0.001]\n",
+    "}\n",
+    "# Randomized search: 20 sampled configurations, 5-fold CV, scored by macro-averaged F1.\n",
+    "xgb_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=parameters,\n",
+    "                                scoring=\"f1_macro\", cv=5, n_iter=20, random_state=2024)\n",
+    "xgb_search.fit(X_train, y_train_encoded)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Best Parameters:\", xgb_search.best_params_)\n",
+    "print(\"Best Score (Macro Average F1):\", xgb_search.best_score_)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2.3.4 LightGBM"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
+   "version": "3.1.0"
   }
  },
  "nbformat": 4,