zaidmehdi committed on
Commit 76f50ce
Parent: 07f51a8

tokenizing data and extracting features from last hidden layer

Files changed (1)
1. src/classifier.ipynb +209 -14
src/classifier.ipynb CHANGED
@@ -10,12 +10,26 @@
 },
 {
 "cell_type": "code",
- "execution_count": 14,
+ "execution_count": 1,
 "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/mehdi/miniconda3/envs/adc/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
 "source": [
+ "import pickle\n",
+ "\n",
+ "from datasets import DatasetDict, Dataset\n",
 "import matplotlib.pyplot as plt\n",
- "import pandas as pd"
+ "import pandas as pd\n",
+ "import torch\n",
+ "from transformers import AutoModel, AutoTokenizer"
 ]
 },
 {
@@ -27,7 +41,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 5,
+ "execution_count": 2,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -37,7 +51,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 6,
+ "execution_count": 3,
 "metadata": {},
 "outputs": [
 {
@@ -123,7 +137,7 @@
 "4 Algeria dz_El-Oued "
 ]
 },
- "execution_count": 6,
+ "execution_count": 3,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -134,7 +148,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 7,
+ "execution_count": 4,
 "metadata": {},
 "outputs": [
 {
@@ -220,7 +234,7 @@
 "4 Libya ly_Misrata "
 ]
 },
- "execution_count": 7,
+ "execution_count": 4,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -231,7 +245,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 12,
+ "execution_count": 5,
 "metadata": {},
 "outputs": [
 {
@@ -249,7 +263,7 @@
 " dtype: int64)"
 ]
 },
- "execution_count": 12,
+ "execution_count": 5,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -267,7 +281,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 32,
+ "execution_count": 6,
 "metadata": {},
 "outputs": [
 {
@@ -276,7 +290,7 @@
 "Text(0.5, 1.0, 'Value counts of country label in train data')"
 ]
 },
- "execution_count": 32,
+ "execution_count": 6,
 "metadata": {},
 "output_type": "execute_result"
 },
@@ -299,7 +313,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 33,
+ "execution_count": 7,
 "metadata": {},
 "outputs": [
 {
@@ -308,7 +322,7 @@
 "Text(0.5, 1.0, 'Value counts of country label in test data')"
 ]
 },
- "execution_count": 33,
+ "execution_count": 7,
 "metadata": {},
 "output_type": "execute_result"
 },
@@ -343,6 +357,187 @@
 "## 2. Training the Classifier"
 ]
 },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For this classifier, we will convert the tweets into vector embeddings using the AraBART model. We will use the last hidden layer of the model to extract the features."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2.1 Data Preparation\n",
+ "The first step is to prepare our data by tokenizing it so it can be used with the AraBART model."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "First, we load the model and its tokenizer."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.device"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "device = torch.device(\"cpu\")\n",
+ "type(device)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = AutoModel.from_pretrained(\"moussaKam/AraBART\").to(device)\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"moussaKam/AraBART\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Next, we convert the datasets into a DatasetDict object."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mapper = {\"#2_tweet\": \"tweet\", \"#3_country_label\": \"label\"}\n",
+ "columns_to_keep = [\"tweet\", \"label\"]\n",
+ "\n",
+ "df_train = df_train.rename(columns=mapper)[columns_to_keep]\n",
+ "df_test = df_test.rename(columns=mapper)[columns_to_keep]\n",
+ "\n",
+ "train_dataset = Dataset.from_pandas(df_train)\n",
+ "test_dataset = Dataset.from_pandas(df_test)\n",
+ "data = DatasetDict({'train': train_dataset, 'test': test_dataset})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Then, we tokenize the dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " \r"
+ ]
+ }
+ ],
+ "source": [
+ "def tokenize(batch):\n",
+ " return tokenizer(batch[\"tweet\"], padding=True)\n",
+ "\n",
+ "data_encoded = data.map(tokenize, batched=True, batch_size=None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['tweet', 'label', 'input_ids', 'attention_mask']"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data_encoded[\"train\"].column_names"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2.2 Feature Extraction"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def extract_hidden_states(batch):\n",
+ " inputs = {k: v.to(device) for k, v in batch.items()\n",
+ " if k in tokenizer.model_input_names}\n",
+ " with torch.no_grad():\n",
+ " last_hidden_state = model(**inputs).last_hidden_state\n",
+ "\n",
+ " return {\"hidden_state\": last_hidden_state[:,0].cpu().numpy()}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " \r"
+ ]
+ }
+ ],
+ "source": [
+ "data_encoded.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"label\"])\n",
+ "data_hidden = data_encoded.map(extract_hidden_states, batched=True, batch_size=50)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(\"../data/data_hidden.pkl\", \"wb\") as f:\n",
+ " pickle.dump(data_hidden, f)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2.3 Model Training"
+ ]
+ },
 {
 "cell_type": "markdown",
 "metadata": {},