{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NAbhfWJQ90eO", "outputId": "c949cfb8-1f4e-4868-f707-7ad6e94a7c5a" }, "outputs": [], "source": [
"# Install required packages into the running kernel's environment (%pip, not !pip).\n",
"# The Hugging Face libraries are pinned to the versions this notebook was last run with,\n",
"# so a fresh run trains against the same library code instead of whatever is newest.\n",
"# torch, pandas and scikit-learn are left unpinned: they come with the runtime image.\n",
"%pip install -q \"transformers[torch]==4.42.3\" datasets==2.20.0 accelerate==0.31.0 torch pandas scikit-learn"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "I962QgRn975V" }, "outputs": [], "source": [
"from datasets import load_dataset\n",
"from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments\n",
"from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n",
"import torch\n",
"import numpy as np"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 365, "referenced_widgets": [ "3aa2d2c7cab240658ec6554214ee3ad7", "e1792986236446e7a3d848613b1c0599", "f34114418a59483695853e1d4cc6ca67", "f6812bad47a64dc5b582fadbe52c339c", "f65e290b435f4d3a9f0fcb9654e0b392", "65edba10a26041acbe07983ee78dbf3b", "794be1c72b2d480cb7d592cf38b344d4",
"8662e3faa0af4cb6a92b2515ce7a33cb", "cb8b3861b5484a9894de28a8d4819d91", "a1d2150f8ac148c5bf6e7e33489ccf21", "4c8bda921be944aca01c69a9605022f9", "ab8a94a341f84aebb871f98441ae4b19", "fa35af04e659455e8105e419ad97d455", "36f116e0c60c4872b7e29547feb43db6", "087a16ac54c14a27af8773e09b6d0f18", "253ba6195935403ba6b980e276e3873f", "d5753dacf94b43278f08b30551bb4381", "e25e1371a99d4924989b92cacc904ebf", "39e282a041a840ed80cbda67132c7cd1", "1d5600a1aa304a80be5cb42d6b933f01", "b1f8edd5d0a04af1933403096a5daa89", "19b33ffa4ee64ee896d7afdf917bbbbd", "848486d32ca144b4a7f9b02a03dc05f3", "498240abaf19425a9fe1dee16366374a", "d71580c3b54b47b59f8674d55c4f21d8", "6cf4cb08aa8a4697a0b46d5bb77e13cc", "c77e78fe61a44afdbd160fd9c4825327", "7a562a932da145f6937001fdadfd9fcd", "d7e07a73a069466290e25d42b57016a7", "ae148326c55f4e28ac1969214dc74316", "af8d8849b28f4510a52cec74692f3484", "fe9454621d9943148a998d6d31f705a8", "1224146d904c4d07a443b21e289c996c", "ecd5ad2f610b4feebc85026b0071f994", "a60cee623cf6404096d7fb24c2d8ec68", "e6212b06576348128e980c808eb83bc4", "14a84f697d614cb1b053cae316091b1f", "c6269f998e18423abd854fa125876d2c", "014813c1b29a4d6098e01469d981147d", "1a95b6333b684374819b01293d8257ac", "a07dc804c7454960919a5d0387fe7ba9", "ca53db4f5e124962a94d5d47e11b24f3", "55cf8640fab74a7793ce379b5c59f784", "360f370d29a54264bfa073e2b7bfab74", "35b5421e32c745a0a9172818e8fb585d", "c8b0afc2bb164b8a85e4e1b325f7bdc6", "123bcf04d6ed4244a08cbd181ad3642a", "78e1145e20564144878074a9b24cd2fd", "0a641e870ef94e7e96ee6b61c5b52a84", "89aab23fa6d9469db8572ffcc95bdb52", "b803032e400d47f78ff337645565fd1f", "12cff4e76b22453d8242c20f0d5e2a53", "7eae2c05276a4da19bdfc3f3687ab650", "b16c5b9847934377b4f075ff5f8bf5a7", "ed678aca044a40c9b6c8f0c7fe4715ea", "5fecbf304801421fb0c5ec7528c9253a", "50d471f019834ebbb835d7259d0fe42c", "a9d54f99b32e467e808ceef227412d6c", "50648e98735c434eb2a14627ebbf3b48", "db69b45da913417bbf426ace57b9ef4f", "c5f43e108aed4b7e98167330dc26569d", "065aa1bbc3ac4fee9c7b2c4268fd3ee1", 
"ef1d4db90ba141dea124a1120d81be1c", "1627696439aa4dfe82ca80c96a1a4fec", "bb2215e23623455e9da2b9ba42b715ff", "b9ddbfdfa2e34b1a87ac17b9c690c9d3", "a44255796b364f5ba2d6f008140e1b8f", "fcfc3d99a7e04824af6a93049fa3b495", "09fb0a1907284161b9c79bc94a475d59", "5874401d0bdf4bf4bfe63d7aecd25885", "927c09c5dcfd4b38bdc26c848e63f1af", "9fbd3f7c362a4ba1b8fd4ec5982568ab", "3367b670e9bf4e9a90461c3e52ec56c9", "c34c9fc9e9b140759a61c94170269891", "814c916679934bcba4fe25a3f83d38f8", "d4fb0f1d18da4e95b91f79f993366bec", "8b1c1bb5ac4d4997b727d664bd73db04" ] }, "id": "fT-hrc5699v9", "outputId": "6f777e84-7a92-4554-86cf-c47216e9c785" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is recommended but still optional to access public models or datasets.\n", " warnings.warn(\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3aa2d2c7cab240658ec6554214ee3ad7", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading readme: 0%| | 0.00/578 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ab8a94a341f84aebb871f98441ae4b19", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/420k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "848486d32ca144b4a7f9b02a03dc05f3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/37.2k [00:00, ?B/s]" ] }, 
"metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ecd5ad2f610b4feebc85026b0071f994", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/90.3k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "35b5421e32c745a0a9172818e8fb585d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating train split: 0%| | 0/5378 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5fecbf304801421fb0c5ec7528c9253a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating validation split: 0%| | 0/415 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a44255796b364f5ba2d6f008140e1b8f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating test split: 0%| | 0/1038 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"# Load the dataset from the Hugging Face Hub. It ships with predefined\n",
"# train / validation / test splits (5378 / 415 / 1038 examples in the recorded run).\n",
"dataset = load_dataset('CodeHima/TOS_Dataset')"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 145, "referenced_widgets": [ "76fe3f39033c4bd28d8120b455d50edd", "09d7fda593704de6943b3f26aeef8f90", "3118c0993f0d40d783b89142bec63203", "526df5db50c64b9080bbede99697721f", "3db6770b497d48289995a26df471cec7", "ff10e07ea1be4dccbed925af09fc8054", "7a16bb0179144ea69481eb0364db3573", "df6e6279878a443193a0efcaa0ea619f", "24ebad6c9a57464690514d9698ce771a", "047c89ad031b4fdb9a7021b9e832c845", "0c83d57ad7d2483094fab4a49160d863", "9bfa214d40584752a746c8d321f0bef7", "fbdfb9fffebf4989b3b5e47ea9a96388", "4cbf7d9ab4ba43d5bd1a0f0363ab09e9", "e6f29579177f457782a7310bd3d8e17a", "8d06b4b6e9e94b88b3e3aac563042c10", "d7d93eb14b5b4272bff36398558f4c8a", "be715227a48c4aee8abae5650c9d646f", "fdf4d87d035340698073027428f2d1ad", "aea3bc177956479189c0121dfb1dac40", "a669197eb7734694af7912baab1904a2", "608795d2aa234c6da5c9f2d261ac356e", "49bb6af61b5e444abc20d4e9a62d3698", "6efe2829e2a042a3b0ec56d5b16140ca", "b739cc39c5704e6d850e02c4b67a4a0e", "660d38a45fd445dbb017ce21f1fa3df5", "ada30bac9b95456e8922c0ab9d15079b", "cc106834fbed4694a4d78536ba83990d", "dc0c36aba1014e39a9afaab5aa49c040", "5d14319cba6c44b9945b25f919d6d8e0", "cdd516f6634c41f3a2e76ebbe5d0a0b7", "f5ce6c3788e149dd97275845e01ddbeb", "4f2238ed73c54b129cf07c54b2c2a10c", "580ae3ea7f674e2183c9c528c3014785", "a10fd932a94249acbd94a8e8640e8119", "6e57eef2cc334d1cad40fdc8874ab80c", "d20331bde75342398e5653ed346495f7", "6cf3b8f230ba43da8ca233ffde59d79c", "925e0bc9db224c5eafb473c6ee60f872", "369daa8686af4e399ec445bf1c06dd6e", "1814f465bfd4490daaf4a947ae10f720", "c9d2996be3574c52bcb45db67b9390dc", "14d7ae4aa6b1494a95297fa0d98ead00", "710d9a7bd47c4fca84d510d9c77974fc" ] }, "id": "DgwV70zm-DHd", "outputId": "80b5a670-298c-4a40-9d71-75da52dcc25c" }, "outputs": [ { "data": {
"application/vnd.jupyter.widget-view+json": { "model_id": "76fe3f39033c4bd28d8120b455d50edd", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer_config.json: 0%| | 0.00/48.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9bfa214d40584752a746c8d321f0bef7", "version_major": 2, "version_minor": 0 }, "text/plain": [ "vocab.txt: 0%| | 0.00/232k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "49bb6af61b5e444abc20d4e9a62d3698", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer.json: 0%| | 0.00/466k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "580ae3ea7f674e2183c9c528c3014785", "version_major": 2, "version_minor": 0 }, "text/plain": [ "config.json: 0%| | 0.00/570 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"# Tokenizer for the same 'bert-base-uncased' checkpoint the classifier is loaded from\n",
"tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jhkkmHJ--FNJ" }, "outputs": [], "source": [
"# Class names found in the 'unfairness_level' column -> integer ids used as training labels\n",
"label_mapping = {\n",
"    'clearly_fair': 0,\n",
"    'potentially_unfair': 1,\n",
"    'clearly_unfair': 2,\n",
"}"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "UP0lFTYY-Hfr" }, "outputs": [], "source": [
"def tokenize_and_encode(examples):\n",
"    \"\"\"Tokenize a batch of sentences and attach their integer class labels.\n",
"\n",
"    Args:\n",
"        examples: Batch of dataset columns; reads 'sentence' (the texts) and\n",
"            'unfairness_level' (one class name per text).\n",
"\n",
"    Returns:\n",
"        The tokenizer output with an added 'labels' list. Every text is\n",
"        truncated or padded to exactly 512 tokens.\n",
"\n",
"    Raises:\n",
"        KeyError: If a class name is not a key of `label_mapping`.\n",
"    \"\"\"\n",
"    encoded = tokenizer(\n",
"        examples['sentence'],\n",
"        truncation=True,\n",
"        padding='max_length',\n",
"        max_length=512,\n",
"    )\n",
"    label_ids = [label_mapping[level] for level in examples['unfairness_level']]\n",
"    encoded['labels'] = label_ids\n",
"    return encoded"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 113, "referenced_widgets": [ "305a8a7667b6476b9e1df9885b8e8dbd", "15720ee7aeb248d09b3e3b6f177808e4",
"e03e56b3e04148889fbaf4cb7e0930f5", "3b6031f8bf8d416fab5761ecdd241aa3", "33ca580741584630af0083ff35844060", "a4f304c12d8a4111a66ca890417eedb3", "28537715028b4c5dbb39329608aae0dd", "c9b43bf781e44dd1a5c5ac82325d19fb", "7cca2d54709e4492bd3860863bcaf128", "f4a43d7cf8004645a7f461eb1436c98b", "981d4e8c070447cf99fb04f82ed9aa8c", "d1a6b0b2bbad47cb8977616f9e48407d", "aef2303096604a749568f609073b808c", "fbbc9399887a456e80b74072a42a0e6c", "145774afdad2451280d36d8cecfbc9d1", "03f03e51a69740e5b2bddbba466900b1", "80987de577d849d993420937c8f80d33", "d884f609d93a4a74be40fe7095d7c0d9", "21628d5a26f64b8689dc8fddb8fc8adc", "7e2d7f267c4246ef89024a9c254e56ba", "2f7cdfd91e1d4a228a6f7a9af32f965e", "314d45a1519c4054b8f82e2095be1dcb", "1f30b1af28dd4de48544398fe257db30", "e0dd145b6e7043949361596cc12d088f", "90562a4022b1444abc1d08f690ef163a", "cc9aaa66634b4c03a1ade8ae1de3b523", "d2558dc27f9e4d439d64bb7161226ac1", "af26e7b775e84c4b88f1ddd0a0713871", "f8c712d29a254da08f58d09ea03e4fec", "2d0f39be50fc4ecdbebdf59040b26b4e", "beaa733789dc4e4885715951c7f02f94", "1f22acd0d7a749238a3de415e20174e6", "d3961bd43ddc47b6813ffc0af1b138b2" ] }, "id": "rUefZAEd-LGy", "outputId": "410989db-b241-4a24-d337-1102a448f955" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "305a8a7667b6476b9e1df9885b8e8dbd", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/5378 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d1a6b0b2bbad47cb8977616f9e48407d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/415 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1f30b1af28dd4de48544398fe257db30", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/1038 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"# Tokenize and label every split (the function receives batches of rows)\n",
"tokenized_dataset = dataset.map(tokenize_and_encode, batched=True)\n",
"\n",
"# Return PyTorch tensors and expose only the columns listed here\n",
"tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "b1Ob3lCW-NfS" }, "outputs": [], "source": [
"# Pick out the dataset's predefined splits (nothing is re-split here)\n",
"train_data = tokenized_dataset['train']\n",
"validation_data = tokenized_dataset['validation']\n",
"test_data = tokenized_dataset['test']"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 104, "referenced_widgets": [ "78d96f07dce1410aaaed0c8e9d5ed943", "b90d29f65b7c45949d62a9d7180aa2ef", "9d932bda893b45c6a21e468450ed6eca", "e79a2f10e5e2484897a6f61648cd1d34", "485a1e13a9d242989cc0d7dad6a3d4f0", "46c0e19b8056492f90249b31a5d79982", "65f003238fdb49ea9a3618e45916e1af", "5f27cd1305e041cdad3ea94d99a3a8e0", "7ebdd40615e64a27b5e130d0d1f05f8e", "fd4f4ff79a8948c6935fd1d6e5e0950b", "a63ff7cf22d2450491aa8038d99bfd4c" ] }, "id": "YoHKge0w-PT6", "outputId": "385ff3da-e464-4452-8f2c-2d356bb771d7" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "78d96f07dce1410aaaed0c8e9d5ed943", "version_major": 2, "version_minor": 0 }, "text/plain": [ "model.safetensors: 0%| | 0.00/440M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [
"# Load the pretrained checkpoint; its classification head is newly initialised and trained below\n",
"num_labels = len(label_mapping)  # one output per class: clearly_fair, potentially_unfair, clearly_unfair\n",
"model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_ZYZVg1v-REq", "outputId": "bf42d938-59f9-45d2-c39c-45e473c7d27f" }, "outputs": [], "source": [
"# Define training arguments\n",
"training_args = TrainingArguments(\n",
"    output_dir='./results',\n",
"    num_train_epochs=3,\n",
"    per_device_train_batch_size=16,\n",
"    per_device_eval_batch_size=16,\n",
"    # NOTE(review): on a single device this run has ~1011 optimizer steps\n",
"    # (5378 train examples / 16 per batch x 3 epochs), so 500 warmup steps is\n",
"    # roughly half of training - confirm this is intended.\n",
"    warmup_steps=500,\n",
"    weight_decay=0.01,\n",
"    logging_dir='./logs',\n",
"    logging_steps=10,\n",
"    # `eval_strategy` replaces `evaluation_strategy`, which is deprecated and\n",
"    # scheduled for removal in Transformers 4.46.\n",
"    eval_strategy=\"epoch\",\n",
")"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "XOwSYwb8-TqY" }, "outputs": [], "source": [
"def compute_metrics(pred):\n",
"    \"\"\"Compute accuracy and support-weighted precision / recall / F1 for one evaluation pass.\n",
"\n",
"    Args:\n",
"        pred: Evaluation result handed over by `Trainer`; reads `pred.predictions`\n",
"            (per-class scores, argmax taken over the last axis) and\n",
"            `pred.label_ids` (true class ids).\n",
"\n",
"    Returns:\n",
"        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.\n",
"    \"\"\"\n",
"    labels = pred.label_ids\n",
"    preds = np.argmax(pred.predictions, axis=-1)\n",
"    # zero_division=0: a class that is never predicted scores 0 (the same value the\n",
"    # default yields) without raising UndefinedMetricWarning on every evaluation.\n",
"    precision, recall, f1, _ = precision_recall_fscore_support(\n",
"        labels, preds, average='weighted', zero_division=0\n",
"    )\n",
"    return {\n",
"        'accuracy': accuracy_score(labels, preds),\n",
"        'f1': f1,\n",
"        'precision': precision,\n",
"        'recall': recall,\n",
"    }"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5jYNKDh9-Vhf" }, "outputs": [], "source": [
"# Initialize the Trainer: train on the train split, evaluate on the validation split\n",
"trainer = Trainer(\n",
"    model=model,\n",
"    args=training_args,\n",
"    train_dataset=train_data,\n",
"    eval_dataset=validation_data,\n",
"    compute_metrics=compute_metrics,\n",
")"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "72kGtJV2-WcH", "outputId":
"5b9792a3-cbbf-4693-f152-361dd2ae0cdd" }, "outputs": [ { "data": { "text/html": [ "\n", "
Epoch | \n", "Training Loss | \n", "Validation Loss | \n", "Accuracy | \n", "F1 | \n", "Precision | \n", "Recall | \n", "
---|---|---|---|---|---|---|
1 | \n", "0.340800 | \n", "0.438809 | \n", "0.826506 | \n", "0.821947 | \n", "0.859294 | \n", "0.826506 | \n", "
2 | \n", "0.278800 | \n", "0.374012 | \n", "0.853012 | \n", "0.849384 | \n", "0.846826 | \n", "0.853012 | \n", "
3 | \n", "0.069500 | \n", "0.425419 | \n", "0.889157 | \n", "0.885282 | \n", "0.883729 | \n", "0.889157 | \n", "
"
],
"text/plain": [
"