{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "b65299ef-8f7d-4ed6-8b7c-ad6b010e15d4", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting transformers==4.36.2\n", " Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.2/8.2 MB\u001b[0m \u001b[31m28.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hCollecting datasets==2.16.1\n", " Downloading datasets-2.16.1-py3-none-any.whl (507 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m266.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting accelerate==0.26.1\n", " Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m270.9/270.9 kB\u001b[0m \u001b[31m272.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting evaluate==0.4.1\n", " Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m248.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting bitsandbytes==0.42.0\n", " Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m105.0/105.0 MB\u001b[0m \u001b[31m87.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hCollecting trl==0.7.10\n", " Downloading trl-0.7.10-py3-none-any.whl (150 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m150.9/150.9 kB\u001b[0m \u001b[31m186.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting peft==0.7.1\n", " Downloading peft-0.7.1-py3-none-any.whl (168 kB)\n", "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m168.3/168.3 kB\u001b[0m \u001b[31m222.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: filelock in /opt/app-root/lib/python3.9/site-packages (from transformers==4.36.2->-r requirements.txt (line 1)) (3.14.0)\n", "Collecting huggingface-hub<1.0,>=0.19.3\n", " Downloading huggingface_hub-0.24.5-py3-none-any.whl (417 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m417.5/417.5 kB\u001b[0m \u001b[31m317.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: requests in /opt/app-root/lib/python3.9/site-packages (from transformers==4.36.2->-r requirements.txt (line 1)) (2.32.2)\n", "Collecting safetensors>=0.3.1\n", " Downloading safetensors-0.4.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (436 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m436.3/436.3 kB\u001b[0m \u001b[31m127.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting tokenizers<0.19,>=0.14\n", " Downloading tokenizers-0.15.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m92.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /opt/app-root/lib/python3.9/site-packages (from transformers==4.36.2->-r requirements.txt (line 1)) (4.66.4)\n", "Requirement already satisfied: packaging>=20.0 in /opt/app-root/lib/python3.9/site-packages (from transformers==4.36.2->-r requirements.txt (line 1)) (24.0)\n", "Collecting regex!=2019.12.17\n", " Downloading regex-2024.7.24-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (775 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m775.9/775.9 kB\u001b[0m 
\u001b[31m262.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pyyaml>=5.1 in /opt/app-root/lib/python3.9/site-packages (from transformers==4.36.2->-r requirements.txt (line 1)) (6.0.1)\n", "Requirement already satisfied: numpy>=1.17 in /opt/app-root/lib/python3.9/site-packages (from transformers==4.36.2->-r requirements.txt (line 1)) (1.24.4)\n", "Collecting pyarrow-hotfix\n", " Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n", "Collecting fsspec[http]<=2023.10.0,>=2023.1.0\n", " Downloading fsspec-2023.10.0-py3-none-any.whl (166 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m166.4/166.4 kB\u001b[0m \u001b[31m288.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting dill<0.3.8,>=0.3.0\n", " Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m144.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: aiohttp in /opt/app-root/lib/python3.9/site-packages (from datasets==2.16.1->-r requirements.txt (line 2)) (3.9.5)\n", "Requirement already satisfied: pyarrow>=8.0.0 in /opt/app-root/lib/python3.9/site-packages (from datasets==2.16.1->-r requirements.txt (line 2)) (14.0.1)\n", "Requirement already satisfied: pandas in /opt/app-root/lib/python3.9/site-packages (from datasets==2.16.1->-r requirements.txt (line 2)) (1.5.3)\n", "Collecting multiprocess\n", " Downloading multiprocess-0.70.16-py39-none-any.whl (133 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m133.4/133.4 kB\u001b[0m \u001b[31m246.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting xxhash\n", " Downloading xxhash-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (193 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m193.8/193.8 
kB\u001b[0m \u001b[31m282.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: psutil in /opt/app-root/lib/python3.9/site-packages (from accelerate==0.26.1->-r requirements.txt (line 3)) (5.9.8)\n", "Collecting torch>=1.10.0\n", " Downloading torch-2.4.0-cp39-cp39-manylinux1_x86_64.whl (797.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m797.2/797.2 MB\u001b[0m \u001b[31m88.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting responses<0.19\n", " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", "Requirement already satisfied: scipy in /opt/app-root/lib/python3.9/site-packages (from bitsandbytes==0.42.0->-r requirements.txt (line 5)) (1.12.0)\n", "Collecting tyro>=0.5.11\n", " Downloading tyro-0.8.6-py3-none-any.whl (103 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m103.8/103.8 kB\u001b[0m \u001b[31m236.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: frozenlist>=1.1.1 in /opt/app-root/lib/python3.9/site-packages (from aiohttp->datasets==2.16.1->-r requirements.txt (line 2)) (1.4.1)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0 in /opt/app-root/lib/python3.9/site-packages (from aiohttp->datasets==2.16.1->-r requirements.txt (line 2)) (4.0.3)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /opt/app-root/lib/python3.9/site-packages (from aiohttp->datasets==2.16.1->-r requirements.txt (line 2)) (1.3.1)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/app-root/lib/python3.9/site-packages (from aiohttp->datasets==2.16.1->-r requirements.txt (line 2)) (6.0.5)\n", "Requirement already satisfied: attrs>=17.3.0 in /opt/app-root/lib/python3.9/site-packages (from aiohttp->datasets==2.16.1->-r requirements.txt (line 2)) (23.2.0)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in 
/opt/app-root/lib/python3.9/site-packages (from aiohttp->datasets==2.16.1->-r requirements.txt (line 2)) (1.9.4)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/app-root/lib/python3.9/site-packages (from huggingface-hub<1.0,>=0.19.3->transformers==4.36.2->-r requirements.txt (line 1)) (4.11.0)\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers==4.36.2->-r requirements.txt (line 1)) (3.7)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers==4.36.2->-r requirements.txt (line 1)) (3.3.2)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers==4.36.2->-r requirements.txt (line 1)) (2024.2.2)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers==4.36.2->-r requirements.txt (line 1)) (1.26.18)\n", "Collecting nvidia-nvtx-cu12==12.1.105\n", " Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m99.1/99.1 kB\u001b[0m \u001b[31m225.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: jinja2 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.10.0->accelerate==0.26.1->-r requirements.txt (line 3)) (3.1.4)\n", "Collecting nvidia-nccl-cu12==2.20.5\n", " Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m176.2/176.2 MB\u001b[0m \u001b[31m84.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting triton==3.0.0\n", " Downloading triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)\n", "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m209.4/209.4 MB\u001b[0m \u001b[31m87.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting nvidia-cufft-cu12==11.0.2.54\n", " Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.6/121.6 MB\u001b[0m \u001b[31m84.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105\n", " Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.1/14.1 MB\u001b[0m \u001b[31m87.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: networkx in /opt/app-root/lib/python3.9/site-packages (from torch>=1.10.0->accelerate==0.26.1->-r requirements.txt (line 3)) (3.2.1)\n", "Collecting nvidia-cuda-nvrtc-cu12==12.1.105\n", " Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m23.7/23.7 MB\u001b[0m \u001b[31m88.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hCollecting sympy\n", " Downloading sympy-1.13.2-py3-none-any.whl (6.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m98.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hCollecting nvidia-cusparse-cu12==12.1.0.106\n", " Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m196.0/196.0 MB\u001b[0m \u001b[31m65.1 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105\n", " Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m823.6/823.6 kB\u001b[0m \u001b[31m121.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting nvidia-curand-cu12==10.3.2.106\n", " Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.5/56.5 MB\u001b[0m \u001b[31m69.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting nvidia-cudnn-cu12==9.1.0.70\n", " Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m75.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting nvidia-cublas-cu12==12.1.3.1\n", " Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m410.6/410.6 MB\u001b[0m \u001b[31m46.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting nvidia-cusolver-cu12==11.4.5.107\n", " Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m124.2/124.2 MB\u001b[0m \u001b[31m36.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting nvidia-nvjitlink-cu12\n", " Downloading nvidia_nvjitlink_cu12-12.6.20-py3-none-manylinux2014_x86_64.whl (19.7 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m19.7/19.7 MB\u001b[0m 
\u001b[31m37.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: rich>=11.1.0 in /opt/app-root/lib/python3.9/site-packages (from tyro>=0.5.11->trl==0.7.10->-r requirements.txt (line 6)) (12.6.0)\n", "Collecting docstring-parser>=0.16\n", " Downloading docstring_parser-0.16-py3-none-any.whl (36 kB)\n", "Collecting shtab>=1.5.6\n", " Downloading shtab-1.7.1-py3-none-any.whl (14 kB)\n", "Collecting eval-type-backport>=0.1.3\n", " Downloading eval_type_backport-0.2.0-py3-none-any.whl (5.9 kB)\n", "Collecting multiprocess\n", " Downloading multiprocess-0.70.15-py39-none-any.whl (133 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m133.3/133.3 kB\u001b[0m \u001b[31m222.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pytz>=2020.1 in /opt/app-root/lib/python3.9/site-packages (from pandas->datasets==2.16.1->-r requirements.txt (line 2)) (2024.1)\n", "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/app-root/lib/python3.9/site-packages (from pandas->datasets==2.16.1->-r requirements.txt (line 2)) (2.9.0.post0)\n", "Requirement already satisfied: six>=1.5 in /opt/app-root/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas->datasets==2.16.1->-r requirements.txt (line 2)) (1.16.0)\n", "Requirement already satisfied: pygments<3.0.0,>=2.6.0 in /opt/app-root/lib/python3.9/site-packages (from rich>=11.1.0->tyro>=0.5.11->trl==0.7.10->-r requirements.txt (line 6)) (2.18.0)\n", "Requirement already satisfied: commonmark<0.10.0,>=0.9.0 in /opt/app-root/lib/python3.9/site-packages (from rich>=11.1.0->tyro>=0.5.11->trl==0.7.10->-r requirements.txt (line 6)) (0.9.1)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /opt/app-root/lib/python3.9/site-packages (from jinja2->torch>=1.10.0->accelerate==0.26.1->-r requirements.txt (line 3)) (2.1.5)\n", "Collecting mpmath<1.4,>=1.1.0\n", " Downloading 
mpmath-1.3.0-py3-none-any.whl (536 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m536.2/536.2 kB\u001b[0m \u001b[31m68.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: mpmath, xxhash, triton, sympy, shtab, safetensors, regex, pyarrow-hotfix, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, fsspec, eval-type-backport, docstring-parser, dill, tyro, responses, nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, huggingface-hub, bitsandbytes, tokenizers, nvidia-cusolver-cu12, transformers, torch, datasets, evaluate, accelerate, trl, peft\n", " Attempting uninstall: fsspec\n", " Found existing installation: fsspec 2024.5.0\n", " Uninstalling fsspec-2024.5.0:\n", " Successfully uninstalled fsspec-2024.5.0\n", " Attempting uninstall: docstring-parser\n", " Found existing installation: docstring_parser 0.8.1\n", " Uninstalling docstring_parser-0.8.1:\n", " Successfully uninstalled docstring_parser-0.8.1\n", " Attempting uninstall: dill\n", " Found existing installation: dill 0.3.8\n", " Uninstalling dill-0.3.8:\n", " Successfully uninstalled dill-0.3.8\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", "codeflare-torchx 0.6.0.dev2 requires docstring-parser==0.8.1, but you have docstring-parser 0.16 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed accelerate-0.26.1 bitsandbytes-0.42.0 datasets-2.16.1 dill-0.3.7 docstring-parser-0.16 eval-type-backport-0.2.0 evaluate-0.4.1 fsspec-2023.10.0 huggingface-hub-0.24.5 mpmath-1.3.0 multiprocess-0.70.15 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.6.20 nvidia-nvtx-cu12-12.1.105 peft-0.7.1 pyarrow-hotfix-0.6 regex-2024.7.24 responses-0.18.0 safetensors-0.4.4 shtab-1.7.1 sympy-1.13.2 tokenizers-0.15.2 torch-2.4.0 transformers-4.36.2 triton-3.0.0 trl-0.7.10 tyro-0.8.6 xxhash-3.4.1\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.2.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] } ], "source": [ "!pip install -r requirements.txt" ] }, { "cell_type": "code", "execution_count": 8, "id": "dfaf1390-b9ec-4b36-976a-01f6efafa15f", "metadata": { "tags": [] }, "outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'trustyai.detoxify'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[8], line 16\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m 
\u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtrustyai\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdetoxify\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TMaRCo\n", "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'trustyai.detoxify'" ] } ], "source": [ "from transformers import (\n", " AutoTokenizer,\n", " AutoModelForCausalLM,\n", " DataCollatorForLanguageModeling,\n", " BitsAndBytesConfig,\n", " Trainer,\n", " TrainingArguments,\n", " set_seed\n", " )\n", "from datasets import load_dataset, load_from_disk\n", "from peft import LoraConfig\n", "from trl import SFTTrainer\n", "from trl.trainer import ConstantLengthDataset\n", "import numpy as np\n", "import torch\n", "from trustyai.detoxify import TMaRCo" ] }, { "cell_type": "code", "execution_count": 4, "id": "a31ac0ed-fe1f-4128-9c86-d5c834658b67", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[31mERROR: Could not find a version that satisfies the requirement trustyai.detoxify (from versions: none)\u001b[0m\u001b[31m\n", "\u001b[0m\u001b[31mERROR: No matching distribution found for trustyai.detoxify\u001b[0m\u001b[31m\n", "\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.2.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] } ], "source": [ "!pip install trustyai.detoxify" ] }, { "cell_type": "code", "execution_count": 6, "id": "8e892a23-4728-4f91-a939-2fb2eaa02f31", "metadata": { "tags": [] }, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: trustyai in /opt/app-root/lib/python3.9/site-packages (0.6.0)\n", "Requirement already satisfied: pyarrow==14.0.1 in /opt/app-root/lib/python3.9/site-packages (from trustyai) (14.0.1)\n", "Requirement already satisfied: numpy~=1.24.1 in /opt/app-root/lib/python3.9/site-packages (from trustyai) (1.24.4)\n", "Requirement already satisfied: pandas~=1.5.3 in /opt/app-root/lib/python3.9/site-packages (from trustyai) (1.5.3)\n", "Requirement already satisfied: Jpype1==1.4.1 in /opt/app-root/lib/python3.9/site-packages (from trustyai) (1.4.1)\n", "Requirement already satisfied: matplotlib~=3.6.3 in /opt/app-root/lib/python3.9/site-packages (from trustyai) (3.6.3)\n", "Requirement already satisfied: jupyter-bokeh~=3.0.5 in /opt/app-root/lib/python3.9/site-packages (from trustyai) (3.0.7)\n", "Requirement already satisfied: packaging in /opt/app-root/lib/python3.9/site-packages (from Jpype1==1.4.1->trustyai) (24.0)\n", "Requirement already satisfied: ipywidgets==8.* in /opt/app-root/lib/python3.9/site-packages (from jupyter-bokeh~=3.0.5->trustyai) (8.1.2)\n", "Requirement already satisfied: bokeh==3.* in /opt/app-root/lib/python3.9/site-packages (from jupyter-bokeh~=3.0.5->trustyai) (3.4.1)\n", "Requirement already satisfied: pillow>=7.1.0 in /opt/app-root/lib/python3.9/site-packages (from bokeh==3.*->jupyter-bokeh~=3.0.5->trustyai) (10.3.0)\n", "Requirement already satisfied: PyYAML>=3.10 in /opt/app-root/lib/python3.9/site-packages (from bokeh==3.*->jupyter-bokeh~=3.0.5->trustyai) (6.0.1)\n", "Requirement already satisfied: tornado>=6.2 in /opt/app-root/lib/python3.9/site-packages (from bokeh==3.*->jupyter-bokeh~=3.0.5->trustyai) (6.4)\n", "Requirement already satisfied: Jinja2>=2.9 in /opt/app-root/lib/python3.9/site-packages (from bokeh==3.*->jupyter-bokeh~=3.0.5->trustyai) (3.1.4)\n", "Requirement already satisfied: contourpy>=1.2 in /opt/app-root/lib/python3.9/site-packages (from 
bokeh==3.*->jupyter-bokeh~=3.0.5->trustyai) (1.2.1)\n", "Requirement already satisfied: xyzservices>=2021.09.1 in /opt/app-root/lib/python3.9/site-packages (from bokeh==3.*->jupyter-bokeh~=3.0.5->trustyai) (2024.4.0)\n", "Requirement already satisfied: jupyterlab-widgets~=3.0.10 in /opt/app-root/lib/python3.9/site-packages (from ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (3.0.10)\n", "Requirement already satisfied: comm>=0.1.3 in /opt/app-root/lib/python3.9/site-packages (from ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (0.2.2)\n", "Requirement already satisfied: traitlets>=4.3.1 in /opt/app-root/lib/python3.9/site-packages (from ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (5.14.3)\n", "Requirement already satisfied: widgetsnbextension~=4.0.10 in /opt/app-root/lib/python3.9/site-packages (from ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (4.0.10)\n", "Requirement already satisfied: ipython>=6.1.0 in /opt/app-root/lib/python3.9/site-packages (from ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (8.18.1)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/app-root/lib/python3.9/site-packages (from matplotlib~=3.6.3->trustyai) (1.4.5)\n", "Requirement already satisfied: fonttools>=4.22.0 in /opt/app-root/lib/python3.9/site-packages (from matplotlib~=3.6.3->trustyai) (4.51.0)\n", "Requirement already satisfied: python-dateutil>=2.7 in /opt/app-root/lib/python3.9/site-packages (from matplotlib~=3.6.3->trustyai) (2.9.0.post0)\n", "Requirement already satisfied: pyparsing>=2.2.1 in /opt/app-root/lib/python3.9/site-packages (from matplotlib~=3.6.3->trustyai) (3.1.2)\n", "Requirement already satisfied: cycler>=0.10 in /opt/app-root/lib/python3.9/site-packages (from matplotlib~=3.6.3->trustyai) (0.12.1)\n", "Requirement already satisfied: pytz>=2020.1 in /opt/app-root/lib/python3.9/site-packages (from pandas~=1.5.3->trustyai) (2024.1)\n", "Requirement already satisfied: six>=1.5 in /opt/app-root/lib/python3.9/site-packages (from 
python-dateutil>=2.7->matplotlib~=3.6.3->trustyai) (1.16.0)\n", "Requirement already satisfied: pygments>=2.4.0 in /opt/app-root/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (2.18.0)\n", "Requirement already satisfied: typing-extensions in /opt/app-root/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (4.11.0)\n", "Requirement already satisfied: decorator in /opt/app-root/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (5.1.1)\n", "Requirement already satisfied: matplotlib-inline in /opt/app-root/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (0.1.7)\n", "Requirement already satisfied: jedi>=0.16 in /opt/app-root/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (0.19.1)\n", "Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /opt/app-root/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (3.0.43)\n", "Requirement already satisfied: stack-data in /opt/app-root/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (0.6.3)\n", "Requirement already satisfied: exceptiongroup in /opt/app-root/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (1.2.1)\n", "Requirement already satisfied: pexpect>4.3 in /opt/app-root/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (4.9.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /opt/app-root/lib/python3.9/site-packages (from Jinja2>=2.9->bokeh==3.*->jupyter-bokeh~=3.0.5->trustyai) (2.1.5)\n", "Requirement already satisfied: parso<0.9.0,>=0.8.3 in /opt/app-root/lib/python3.9/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) 
(0.8.4)\n", "Requirement already satisfied: ptyprocess>=0.5 in /opt/app-root/lib/python3.9/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (0.7.0)\n", "Requirement already satisfied: wcwidth in /opt/app-root/lib/python3.9/site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (0.2.13)\n", "Requirement already satisfied: pure-eval in /opt/app-root/lib/python3.9/site-packages (from stack-data->ipython>=6.1.0->ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (0.2.2)\n", "Requirement already satisfied: asttokens>=2.1.0 in /opt/app-root/lib/python3.9/site-packages (from stack-data->ipython>=6.1.0->ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (2.4.1)\n", "Requirement already satisfied: executing>=1.2.0 in /opt/app-root/lib/python3.9/site-packages (from stack-data->ipython>=6.1.0->ipywidgets==8.*->jupyter-bokeh~=3.0.5->trustyai) (1.2.0)\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.2.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "pip install trustyai\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "3a11251f-4637-4ecf-9d29-3db29f266795", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting detoxify\n", " Downloading detoxify-0.5.2-py3-none-any.whl (12 kB)\n", "Collecting sentencepiece>=0.1.94\n", " Downloading sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m4.8 
MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: transformers in /opt/app-root/lib/python3.9/site-packages (from detoxify) (4.36.2)\n", "Requirement already satisfied: torch>=1.7.0 in /opt/app-root/lib/python3.9/site-packages (from detoxify) (2.4.0)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (12.1.3.1)\n", "Requirement already satisfied: networkx in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (3.2.1)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (11.4.5.107)\n", "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (9.1.0.70)\n", "Requirement already satisfied: filelock in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (3.14.0)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (2.20.5)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (11.0.2.54)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (12.1.105)\n", "Requirement already satisfied: sympy in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (1.13.2)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (12.1.0.106)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (10.3.2.106)\n", "Requirement already satisfied: triton==3.0.0 in 
/opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (3.0.0)\n", "Requirement already satisfied: fsspec in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (2023.10.0)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (12.1.105)\n", "Requirement already satisfied: jinja2 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (3.1.4)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (12.1.105)\n", "Requirement already satisfied: typing-extensions>=4.8.0 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (4.11.0)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12 in /opt/app-root/lib/python3.9/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.7.0->detoxify) (12.6.20)\n", "Requirement already satisfied: safetensors>=0.3.1 in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (0.4.4)\n", "Requirement already satisfied: pyyaml>=5.1 in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (6.0.1)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (0.24.5)\n", "Requirement already satisfied: numpy>=1.17 in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (1.24.4)\n", "Requirement already satisfied: requests in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (2.32.2)\n", "Requirement already satisfied: tqdm>=4.27 in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (4.66.4)\n", "Requirement already satisfied: packaging>=20.0 in 
/opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (24.0)\n", "Requirement already satisfied: regex!=2019.12.17 in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (2024.7.24)\n", "Requirement already satisfied: tokenizers<0.19,>=0.14 in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (0.15.2)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /opt/app-root/lib/python3.9/site-packages (from jinja2->torch>=1.7.0->detoxify) (2.1.5)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers->detoxify) (3.3.2)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers->detoxify) (1.26.18)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers->detoxify) (2024.2.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers->detoxify) (3.7)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/app-root/lib/python3.9/site-packages (from sympy->torch>=1.7.0->detoxify) (1.3.0)\n", "Installing collected packages: sentencepiece, detoxify\n", "Successfully installed detoxify-0.5.2 sentencepiece-0.2.0\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.2.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "pip install detoxify\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "00c33828-12a4-4983-9a83-d0876d5cf890", "metadata": { "tags": [] }, 
"outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'trustyai.detoxify'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[9], line 16\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtrustyai\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdetoxify\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TMaRCo\n", "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'trustyai.detoxify'" ] } ], "source": [ "from transformers import (\n", " AutoTokenizer,\n", " AutoModelForCausalLM,\n", " DataCollatorForLanguageModeling,\n", " BitsAndBytesConfig,\n", " Trainer,\n", " TrainingArguments,\n", " set_seed\n", " )\n", "from datasets import load_dataset, load_from_disk\n", "from peft import LoraConfig\n", "from trl import SFTTrainer\n", "from trl.trainer import ConstantLengthDataset\n", "import numpy as np\n", "import torch\n", "from trustyai.detoxify import TMaRCo" ] }, { "cell_type": "code", "execution_count": 10, "id": "ec775cea-5977-410c-ab48-3c2daca20f2c", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "130c867226c047c29c24e468a4c03305", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading readme: 0%| | 0.00/4.22k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7bf59f45d0c8478a8a876bf0e68fabfe", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading 
data: 0%| | 0.00/67.7M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6909d2c0c216477b8b5c3e0802cf617e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating train split: 0 examples [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "['filename', 'begin', 'end', 'challenging', 'prompt.text', 'prompt.profanity', 'prompt.sexually_explicit', 'prompt.identity_attack', 'prompt.flirtation', 'prompt.threat', 'prompt.insult', 'prompt.severe_toxicity', 'prompt.toxicity', 'continuation.text', 'continuation.severe_toxicity', 'continuation.toxicity', 'continuation.profanity', 'continuation.sexually_explicit', 'continuation.identity_attack', 'continuation.flirtation', 'continuation.threat', 'continuation.insult']\n" ] } ], "source": [ "dataset_name = \"allenai/real-toxicity-prompts\"\n", "raw_dataset = load_dataset(dataset_name, split=\"train\").flatten()\n", "print(raw_dataset.column_names)" ] }, { "cell_type": "code", "execution_count": 11, "id": "7e5225f7-12c3-46ac-8cf1-0b9c3cedbf4e", "metadata": { "tags": [] }, "outputs": [ { "ename": "NameError", "evalue": "name 'TMaRCo' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[11], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# load TMaRCo expert and non-expert models\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m tmarco \u001b[38;5;241m=\u001b[39m \u001b[43mTMaRCo\u001b[49m()\n\u001b[1;32m 3\u001b[0m tmarco\u001b[38;5;241m.\u001b[39mload_models([\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrustyai/gminus\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrustyai/gplus\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n", "\u001b[0;31mNameError\u001b[0m: name 'TMaRCo' is not defined" ] } ], "source": [ "# load TMaRCo expert and non-expert models\n", "tmarco = TMaRCo()\n", "tmarco.load_models([\"trustyai/gminus\", \"trustyai/gplus\"])" ] }, { "cell_type": "code", "execution_count": 12, "id": "081cbdd4-b8a6-4cf0-b1b8-2d542d123b34", "metadata": { "tags": [] }, "outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'trustyai.detoxify'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[12], line 16\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtrustyai\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdetoxify\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TMaRCo\n", "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'trustyai.detoxify'" ] } ], "source": [ "from transformers import (\n", " AutoTokenizer,\n", " AutoModelForCausalLM,\n", " DataCollatorForLanguageModeling,\n", " BitsAndBytesConfig,\n", " Trainer,\n", " TrainingArguments,\n", " set_seed\n", " )\n", "from datasets import load_dataset, load_from_disk\n", "from peft import LoraConfig\n", "from trl import SFTTrainer\n", "from trl.trainer import ConstantLengthDataset\n", "import numpy as np\n", "import torch\n", "from trustyai.detoxify import TMaRCo" ] }, { "cell_type": "code", "execution_count": 13, "id": "8c69b13b-7151-4e30-bd27-1dfb26c952b3", "metadata": { "tags": [] }, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: detoxify in /opt/app-root/lib/python3.9/site-packages (0.5.2)\n", "Requirement already satisfied: sentencepiece>=0.1.94 in /opt/app-root/lib/python3.9/site-packages (from detoxify) (0.2.0)\n", "Requirement already satisfied: transformers in /opt/app-root/lib/python3.9/site-packages (from detoxify) (4.36.2)\n", "Requirement already satisfied: torch>=1.7.0 in /opt/app-root/lib/python3.9/site-packages (from detoxify) (2.4.0)\n", "Requirement already satisfied: fsspec in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (2023.10.0)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (2.20.5)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (12.1.105)\n", "Requirement already satisfied: triton==3.0.0 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (3.0.0)\n", "Requirement already satisfied: sympy in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (1.13.2)\n", "Requirement already satisfied: networkx in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (3.2.1)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (12.1.105)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (11.0.2.54)\n", "Requirement already satisfied: filelock in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (3.14.0)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (11.4.5.107)\n", "Requirement already satisfied: typing-extensions>=4.8.0 in 
/opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (4.11.0)\n", "Requirement already satisfied: jinja2 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (3.1.4)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (10.3.2.106)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (12.1.105)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (12.1.0.106)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (12.1.3.1)\n", "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /opt/app-root/lib/python3.9/site-packages (from torch>=1.7.0->detoxify) (9.1.0.70)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12 in /opt/app-root/lib/python3.9/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.7.0->detoxify) (12.6.20)\n", "Requirement already satisfied: numpy>=1.17 in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (1.24.4)\n", "Requirement already satisfied: requests in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (2.32.2)\n", "Requirement already satisfied: packaging>=20.0 in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (24.0)\n", "Requirement already satisfied: regex!=2019.12.17 in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (2024.7.24)\n", "Requirement already satisfied: safetensors>=0.3.1 in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (0.4.4)\n", "Requirement 
already satisfied: tqdm>=4.27 in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (4.66.4)\n", "Requirement already satisfied: tokenizers<0.19,>=0.14 in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (0.15.2)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (0.24.5)\n", "Requirement already satisfied: pyyaml>=5.1 in /opt/app-root/lib/python3.9/site-packages (from transformers->detoxify) (6.0.1)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /opt/app-root/lib/python3.9/site-packages (from jinja2->torch>=1.7.0->detoxify) (2.1.5)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers->detoxify) (3.3.2)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers->detoxify) (2024.2.2)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers->detoxify) (1.26.18)\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers->detoxify) (3.7)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/app-root/lib/python3.9/site-packages (from sympy->torch>=1.7.0->detoxify) (1.3.0)\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.2.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "pip install detoxify\n" ] }, { "cell_type": "code", "execution_count": 14, "id": 
"2597373d-496a-4681-9edf-0a62da82e6b5", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting trustai\n", " Downloading trustai-0.1.12-py3-none-any.whl (89 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m89.9/89.9 kB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: numpy in /opt/app-root/lib/python3.9/site-packages (from trustai) (1.24.4)\n", "Requirement already satisfied: scikit-learn in /opt/app-root/lib/python3.9/site-packages (from trustai) (1.4.2)\n", "Requirement already satisfied: matplotlib in /opt/app-root/lib/python3.9/site-packages (from trustai) (3.6.3)\n", "Requirement already satisfied: IPython in /opt/app-root/lib/python3.9/site-packages (from trustai) (8.18.1)\n", "Requirement already satisfied: tqdm in /opt/app-root/lib/python3.9/site-packages (from trustai) (4.66.4)\n", "Requirement already satisfied: pygments>=2.4.0 in /opt/app-root/lib/python3.9/site-packages (from IPython->trustai) (2.18.0)\n", "Requirement already satisfied: traitlets>=5 in /opt/app-root/lib/python3.9/site-packages (from IPython->trustai) (5.14.3)\n", "Requirement already satisfied: pexpect>4.3 in /opt/app-root/lib/python3.9/site-packages (from IPython->trustai) (4.9.0)\n", "Requirement already satisfied: matplotlib-inline in /opt/app-root/lib/python3.9/site-packages (from IPython->trustai) (0.1.7)\n", "Requirement already satisfied: jedi>=0.16 in /opt/app-root/lib/python3.9/site-packages (from IPython->trustai) (0.19.1)\n", "Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /opt/app-root/lib/python3.9/site-packages (from IPython->trustai) (3.0.43)\n", "Requirement already satisfied: typing-extensions in /opt/app-root/lib/python3.9/site-packages (from IPython->trustai) (4.11.0)\n", "Requirement already satisfied: exceptiongroup in /opt/app-root/lib/python3.9/site-packages 
(from IPython->trustai) (1.2.1)\n", "Requirement already satisfied: stack-data in /opt/app-root/lib/python3.9/site-packages (from IPython->trustai) (0.6.3)\n", "Requirement already satisfied: decorator in /opt/app-root/lib/python3.9/site-packages (from IPython->trustai) (5.1.1)\n", "Requirement already satisfied: pyparsing>=2.2.1 in /opt/app-root/lib/python3.9/site-packages (from matplotlib->trustai) (3.1.2)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/app-root/lib/python3.9/site-packages (from matplotlib->trustai) (1.4.5)\n", "Requirement already satisfied: python-dateutil>=2.7 in /opt/app-root/lib/python3.9/site-packages (from matplotlib->trustai) (2.9.0.post0)\n", "Requirement already satisfied: contourpy>=1.0.1 in /opt/app-root/lib/python3.9/site-packages (from matplotlib->trustai) (1.2.1)\n", "Requirement already satisfied: fonttools>=4.22.0 in /opt/app-root/lib/python3.9/site-packages (from matplotlib->trustai) (4.51.0)\n", "Requirement already satisfied: packaging>=20.0 in /opt/app-root/lib/python3.9/site-packages (from matplotlib->trustai) (24.0)\n", "Requirement already satisfied: pillow>=6.2.0 in /opt/app-root/lib/python3.9/site-packages (from matplotlib->trustai) (10.3.0)\n", "Requirement already satisfied: cycler>=0.10 in /opt/app-root/lib/python3.9/site-packages (from matplotlib->trustai) (0.12.1)\n", "Requirement already satisfied: joblib>=1.2.0 in /opt/app-root/lib/python3.9/site-packages (from scikit-learn->trustai) (1.4.2)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/app-root/lib/python3.9/site-packages (from scikit-learn->trustai) (3.5.0)\n", "Requirement already satisfied: scipy>=1.6.0 in /opt/app-root/lib/python3.9/site-packages (from scikit-learn->trustai) (1.12.0)\n", "Requirement already satisfied: parso<0.9.0,>=0.8.3 in /opt/app-root/lib/python3.9/site-packages (from jedi>=0.16->IPython->trustai) (0.8.4)\n", "Requirement already satisfied: ptyprocess>=0.5 in /opt/app-root/lib/python3.9/site-packages (from 
pexpect>4.3->IPython->trustai) (0.7.0)\n", "Requirement already satisfied: wcwidth in /opt/app-root/lib/python3.9/site-packages (from prompt-toolkit<3.1.0,>=3.0.41->IPython->trustai) (0.2.13)\n", "Requirement already satisfied: six>=1.5 in /opt/app-root/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib->trustai) (1.16.0)\n", "Requirement already satisfied: asttokens>=2.1.0 in /opt/app-root/lib/python3.9/site-packages (from stack-data->IPython->trustai) (2.4.1)\n", "Requirement already satisfied: executing>=1.2.0 in /opt/app-root/lib/python3.9/site-packages (from stack-data->IPython->trustai) (1.2.0)\n", "Requirement already satisfied: pure-eval in /opt/app-root/lib/python3.9/site-packages (from stack-data->IPython->trustai) (0.2.2)\n", "Installing collected packages: trustai\n", "Successfully installed trustai-0.1.12\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.2.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "pip install trustai" ] }, { "cell_type": "code", "execution_count": 15, "id": "1e881be6-2868-41a4-91d0-9c4152671ce0", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on package trustyai:\n", "\n", "NAME\n", " trustyai - Main TrustyAI Python bindings\n", "\n", "PACKAGE CONTENTS\n", " _default_initializer\n", " explainers (package)\n", " initializer\n", " language (package)\n", " local (package)\n", " metrics (package)\n", " model (package)\n", " utils (package)\n", " version\n", " visualizations (package)\n", "\n", "FUNCTIONS\n", " init()\n", " Deprecated manual initializer for the JVM. 
This function has been replaced by\n", " automatic initialization when importing the components of the module that require\n", " JVM access, or by manual user initialization via :func:`trustyai`initializer.init`.\n", "\n", "DATA\n", " TRUSTYAI_IS_INITIALIZED = False\n", "\n", "VERSION\n", " 0.6.0\n", "\n", "FILE\n", " /opt/app-root/lib64/python3.9/site-packages/trustyai/__init__.py\n", "\n", "\n" ] } ], "source": [ "import trustyai\n", "help(trustyai)\n" ] },
{ "cell_type": "code", "execution_count": 16, "id": "7e1a95c3-ec09-4cc8-9f44-da66f34bb16f", "metadata": { "tags": [] }, "outputs": [], "source": [ "from trustyai.language import detoxify\n" ] },
{ "cell_type": "code", "execution_count": 18, "id": "3aaf3742-22dd-4580-b89b-ec17d30d3d2a", "metadata": { "tags": [] }, "outputs": [], "source": [ "%pip install -q detoxify==0.5.2\n" ] },
{ "cell_type": "code", "execution_count": 19, "id": "ba40bcb3-1fe7-442e-985c-80a53342c137", "metadata": { "tags": [] }, "outputs": [], "source": [ "from detoxify import Detoxify\n" ] },
{ "cell_type": "code", "execution_count":
20, "id": "38781751-9c16-46f3-be5a-77520c025d24", "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Downloading: \"https://github.com/unitaryai/detoxify/releases/download/v0.1-alpha/toxic_original-c1212f89.ckpt\" to /opt/app-root/src/.cache/torch/hub/checkpoints/toxic_original-c1212f89.ckpt\n", "100%|██████████| 418M/418M [00:10<00:00, 40.5MB/s] \n", "/opt/app-root/lib64/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.\n", " _torch_pytree._register_pytree_node(\n", "/opt/app-root/lib64/python3.9/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "86e6a3ce7f4740019c944ffffd0cbf5e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "config.json: 0%| | 0.00/570 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6b821b580d954919aa8349c4b99f5fdd", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer_config.json: 0%| | 0.00/48.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "983baa6dbc134f11b5144e3ee7831f4a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "vocab.txt: 0%| | 0.00/232k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1b6b7ac83241448d9e22bb3204b32de4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer.json: 0%| | 0.00/466k [00:00, ?B/s]" ] }, "metadata": {}, 
"output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "{'toxicity': [0.00074575818143785], 'severe_toxicity': [0.0001111913297791034], 'obscene': [0.0001741646119626239], 'threat': [0.00011403383541619405], 'insult': [0.00017978664254769683], 'identity_attack': [0.00013735856919083744]}\n" ] } ], "source": [ "model = Detoxify('original')\n", "results = model.predict([\"Your text here to analyze for toxicity.\"])\n", "print(results)\n" ] }, { "cell_type": "code", "execution_count": 21, "id": "ad30c7d9-3986-49d4-aa52-1d385ec7f938", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: transformers in /opt/app-root/lib/python3.9/site-packages (4.36.2)\n", "Requirement already satisfied: torch in /opt/app-root/lib/python3.9/site-packages (2.4.0)\n", "Requirement already satisfied: detoxify in /opt/app-root/lib/python3.9/site-packages (0.5.2)\n", "Requirement already satisfied: requests in /opt/app-root/lib/python3.9/site-packages (from transformers) (2.32.2)\n", "Requirement already satisfied: numpy>=1.17 in /opt/app-root/lib/python3.9/site-packages (from transformers) (1.24.4)\n", "Requirement already satisfied: regex!=2019.12.17 in /opt/app-root/lib/python3.9/site-packages (from transformers) (2024.7.24)\n", "Requirement already satisfied: safetensors>=0.3.1 in /opt/app-root/lib/python3.9/site-packages (from transformers) (0.4.4)\n", "Requirement already satisfied: filelock in /opt/app-root/lib/python3.9/site-packages (from transformers) (3.14.0)\n", "Requirement already satisfied: tokenizers<0.19,>=0.14 in /opt/app-root/lib/python3.9/site-packages (from transformers) (0.15.2)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /opt/app-root/lib/python3.9/site-packages (from transformers) (0.24.5)\n", "Requirement already satisfied: packaging>=20.0 in /opt/app-root/lib/python3.9/site-packages (from transformers) (24.0)\n", "Requirement already 
satisfied: pyyaml>=5.1 in /opt/app-root/lib/python3.9/site-packages (from transformers) (6.0.1)\n", "Requirement already satisfied: tqdm>=4.27 in /opt/app-root/lib/python3.9/site-packages (from transformers) (4.66.4)\n", "Requirement already satisfied: typing-extensions>=4.8.0 in /opt/app-root/lib/python3.9/site-packages (from torch) (4.11.0)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /opt/app-root/lib/python3.9/site-packages (from torch) (10.3.2.106)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /opt/app-root/lib/python3.9/site-packages (from torch) (12.1.3.1)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /opt/app-root/lib/python3.9/site-packages (from torch) (11.4.5.107)\n", "Requirement already satisfied: triton==3.0.0 in /opt/app-root/lib/python3.9/site-packages (from torch) (3.0.0)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /opt/app-root/lib/python3.9/site-packages (from torch) (2.20.5)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /opt/app-root/lib/python3.9/site-packages (from torch) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /opt/app-root/lib/python3.9/site-packages (from torch) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /opt/app-root/lib/python3.9/site-packages (from torch) (12.1.105)\n", "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /opt/app-root/lib/python3.9/site-packages (from torch) (9.1.0.70)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /opt/app-root/lib/python3.9/site-packages (from torch) (12.1.0.106)\n", "Requirement already satisfied: jinja2 in /opt/app-root/lib/python3.9/site-packages (from torch) (3.1.4)\n", "Requirement already satisfied: networkx in /opt/app-root/lib/python3.9/site-packages (from torch) (3.2.1)\n", "Requirement already satisfied: fsspec in /opt/app-root/lib/python3.9/site-packages 
(from torch) (2023.10.0)\n", "Requirement already satisfied: sympy in /opt/app-root/lib/python3.9/site-packages (from torch) (1.13.2)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /opt/app-root/lib/python3.9/site-packages (from torch) (11.0.2.54)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /opt/app-root/lib/python3.9/site-packages (from torch) (12.1.105)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12 in /opt/app-root/lib/python3.9/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch) (12.6.20)\n", "Requirement already satisfied: sentencepiece>=0.1.94 in /opt/app-root/lib/python3.9/site-packages (from detoxify) (0.2.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /opt/app-root/lib/python3.9/site-packages (from jinja2->torch) (2.1.5)\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers) (1.26.18)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers) (3.3.2)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/app-root/lib/python3.9/site-packages (from requests->transformers) (2024.2.2)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/app-root/lib/python3.9/site-packages (from sympy->torch) (1.3.0)\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.2.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "pip install transformers 
torch detoxify\n" ] }, { "cell_type": "code", "execution_count": 22, "id": "273d649d-2610-4072-b3ce-662ec6a83451", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: pip in /opt/app-root/lib/python3.9/site-packages (22.2.2)\n", "Collecting pip\n", " Downloading pip-24.2-py3-none-any.whl (1.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hInstalling collected packages: pip\n", " Attempting uninstall: pip\n", " Found existing installation: pip 22.2.2\n", " Uninstalling pip-22.2.2:\n", " Successfully uninstalled pip-22.2.2\n", "Successfully installed pip-24.2\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "pip install --upgrade pip" ] }, { "cell_type": "code", "execution_count": 23, "id": "fa48aa69-71a0-4890-9f2d-d11810d1b973", "metadata": { "tags": [] }, "outputs": [], "source": [ "from detoxify import Detoxify\n" ] }, { "cell_type": "code", "execution_count": 24, "id": "915a0a6b-5d2e-41cc-acee-6d78aa6548be", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'toxicity': 0.0007457582, 'severe_toxicity': 0.00011119133, 'obscene': 0.00017416461, 'threat': 0.000114033835, 'insult': 0.00017978664, 'identity_attack': 0.00013735857}\n" ] } ], "source": [ "model = Detoxify('original')\n", "results = model.predict(\"Your text here to analyze for toxicity.\")\n", "print(results)\n" ] }, { "cell_type": "code", "execution_count": 25, "id": "37690b1a-dadb-48ac-8c01-4fbab9337b1f", "metadata": { "tags": [] }, "outputs": [], "source": [ "from detoxify import Detoxify\n" ] }, { "cell_type": "code", "execution_count": 26, "id": "001cf409-66d3-46a6-8212-142a0d3fb8f0", "metadata": { "tags": [] }, "outputs": [], "source": [ "model = Detoxify('original')\n" ] 
}, { "cell_type": "code", "execution_count": 27, "id": "e67c1be0-1f87-4770-ae14-51cbdf225526", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'toxicity': 0.0006598092, 'severe_toxicity': 0.0001192591, 'obscene': 0.00017243883, 'threat': 0.0001247127, 'insult': 0.00018202125, 'identity_attack': 0.0001411103}\n" ] } ], "source": [ "results = model.predict(\"Your text to analyze for toxicity.\")\n", "print(results)\n", "\n" ] }, { "cell_type": "code", "execution_count": 28, "id": "0cdffd0d-2e38-4422-b092-949c435faa15", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['filename', 'begin', 'end', 'challenging', 'prompt.text', 'prompt.profanity', 'prompt.sexually_explicit', 'prompt.identity_attack', 'prompt.flirtation', 'prompt.threat', 'prompt.insult', 'prompt.severe_toxicity', 'prompt.toxicity', 'continuation.text', 'continuation.severe_toxicity', 'continuation.toxicity', 'continuation.profanity', 'continuation.sexually_explicit', 'continuation.identity_attack', 'continuation.flirtation', 'continuation.threat', 'continuation.insult']\n" ] } ], "source": [ "from datasets import load_dataset\n", "\n", "dataset_name = \"allenai/real-toxicity-prompts\"\n", "raw_dataset = load_dataset(dataset_name, split=\"train\").flatten()\n", "print(raw_dataset.column_names)\n" ] }, { "cell_type": "code", "execution_count": 29, "id": "37d26d71-e393-4fec-bd44-8a2865b6ca43", "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Downloading: \"https://github.com/unitaryai/detoxify/releases/download/v0.3-alpha/toxic_debiased-c7548aa0.ckpt\" to /opt/app-root/src/.cache/torch/hub/checkpoints/toxic_debiased-c7548aa0.ckpt\n", "100%|██████████| 476M/476M [00:09<00:00, 51.1MB/s] \n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d4ad2a7870234bc8a4cb5bea3e720b1e", "version_major": 2, "version_minor": 0 }, "text/plain": [ 
"config.json: 0%| | 0.00/481 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8d261f4c9f8747758bf985b31f663066", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer_config.json: 0%| | 0.00/25.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c5ed66a88fd64d1f84507ccb0d98b7e7", "version_major": 2, "version_minor": 0 }, "text/plain": [ "vocab.json: 0%| | 0.00/899k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d85b2f33baa64a0da233beecc2f08aca", "version_major": 2, "version_minor": 0 }, "text/plain": [ "merges.txt: 0%| | 0.00/456k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c11ba0f1c6e845e88087df80cfbe6bf5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer.json: 0%| | 0.00/1.36M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Expert Model Results: {'toxicity': 0.0007457582, 'severe_toxicity': 0.00011119133, 'obscene': 0.00017416461, 'threat': 0.000114033835, 'insult': 0.00017978664, 'identity_attack': 0.00013735857}\n", "Non-Expert Model Results: {'toxicity': 0.0005239008, 'severe_toxicity': 1.3534418e-06, 'obscene': 2.615597e-05, 'identity_attack': 7.681745e-05, 'insult': 0.00011812797, 'threat': 2.7330221e-05, 'sexual_explicit': 1.4419186e-05}\n" ] } ], "source": [ "from detoxify import Detoxify\n", "\n", "# Load Detoxify models for expert and non-expert analysis\n", "model_expert = Detoxify('original')\n", "model_non_expert = Detoxify('unbiased') # Using a different variant of Detoxify for comparison\n", "\n", "# Example usage: Analyze a text for toxicity with both models\n", "text = 
\"Your text here to analyze for toxicity.\"\n", "results_expert = model_expert.predict(text)\n", "results_non_expert = model_non_expert.predict(text)\n", "\n", "print(\"Expert Model Results:\", results_expert)\n", "print(\"Non-Expert Model Results:\", results_non_expert)\n" ] }, { "cell_type": "code", "execution_count": 30, "id": "0a7afc7b-e9c8-43b8-aefa-6d20521525d3", "metadata": { "tags": [] }, "outputs": [], "source": [ "def preprocess_func(sample):\n", " # Concatenate prompt and continuation text\n", " sample['text'] = f\"Prompt: {sample['prompt.text']}\\nContinuation: {sample['continuation.text']}\"\n", " return sample\n" ] }, { "cell_type": "code", "execution_count": 31, "id": "2631fc24-eed4-4344-9912-ad9054c2968e", "metadata": { "tags": [] }, "outputs": [], "source": [ "def rephrase_func(sample):\n", " # Analyze text for toxicity using the expert model\n", " results_expert = model_expert.predict(sample['text'])\n", " \n", " # Determine whether the text should be masked/rephrased based on a threshold\n", " if results_expert['toxicity'] > 0.6: # Use your desired threshold\n", " # For simplicity, you might choose to replace the entire text or sensitive parts\n", " sample['text'] = \"This content has been flagged for toxicity and has been rephrased for safety.\"\n", " \n", " return sample\n" ] }, { "cell_type": "code", "execution_count": 32, "id": "4f3e2b99-0e06-429f-a3eb-f64adb35ea83", "metadata": { "tags": [] }, "outputs": [], "source": [ "block_size = 128\n", "\n", "def group_texts(examples):\n", " # Concatenate all texts.\n", " concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}\n", " total_length = len(concatenated_examples[list(examples.keys())[0]])\n", " \n", " # Drop the small remainder, or add padding if the model supported it.\n", " if total_length >= block_size:\n", " total_length = (total_length // block_size) * block_size\n", " \n", " # Split by chunks of block_size.\n", " result = {\n", " k: [t[i : i + block_size] for i in 
range(0, total_length, block_size)]\n", " for k, t in concatenated_examples.items()\n", " }\n", " \n", " # Set the labels to be the same as the input_ids.\n", " result[\"labels\"] = result[\"input_ids\"].copy()\n", " \n", " return result\n" ] }, { "cell_type": "code", "execution_count": 33, "id": "3b88d48c-ca4b-472a-bd24-8f509398f5d5", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Split the dataset into training and testing sets with an 80/20 split\n", "dataset = raw_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)\n", "\n", "# Take the first 1000 samples of the training split (already shuffled above with seed=42)\n", "train_data = dataset[\"train\"].select(indices=range(0, 1000))\n", "\n", "# Take the first 400 samples of the test split (already shuffled above) for evaluation\n", "eval_data = dataset[\"test\"].select(indices=range(0, 400))\n" ] }, { "cell_type": "code", "execution_count": 34, "id": "cf46f64b-f63a-4b08-ae18-af87c568fdc9", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6f3a40a74067470d90f102dd6a84e5b1", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer_config.json: 0%| | 0.00/685 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ab3ea0505cd3400d982a375fb0616448", "version_major": 2, "version_minor": 0 }, "text/plain": [ "config.json: 0%| | 0.00/644 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e0c91e31a5b3465b88a4503381534804", "version_major": 2, "version_minor": 0 }, "text/plain": [ "vocab.json: 0%| | 0.00/899k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ef4b3fd3027c4d719c7c99cce7edef22", "version_major": 2, "version_minor": 0 }, "text/plain": [ "merges.txt: 0%| | 0.00/456k [00:00, ?B/s]" ] }, "metadata": {}, "output_type":
"display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e177d350ef0247fea5b21f262c80081a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "special_tokens_map.json: 0%| | 0.00/441 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "model_id = \"facebook/opt-350m\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "\n", "# Setting the pad token and padding side\n", "tokenizer.pad_token = tokenizer.eos_token\n", "tokenizer.padding_side = \"right\"\n" ] }, { "cell_type": "code", "execution_count": 35, "id": "19920a2b-edc5-474c-b899-2644c16c2ec3", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f6b9dbcb58174bd5aaa2b8d2f996890d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/1000 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "581340fcb04a40aeab36dfb15e544bd5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/400 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "train_ds = train_data.map(preprocess_func, remove_columns=train_data.column_names)\n", "eval_ds = eval_data.map(preprocess_func, remove_columns=eval_data.column_names)\n" ] }, { "cell_type": "code", "execution_count": 36, "id": "ca15d394-8893-42fc-8d14-53cb3f2d25ec", "metadata": { "tags": [] }, "outputs": [ { "ename": "IndentationError", "evalue": "unexpected indent (1823206556.py, line 2)", "output_type": "error", "traceback": [ "\u001b[0;36m Cell \u001b[0;32mIn[36], line 2\u001b[0;36m\u001b[0m\n\u001b[0;31m mean_length = np.mean([len(text) for text in train_ds['text']])\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unexpected indent\n" ] } ], "source": [ "# select samples whose length are less than equal to the mean length of the training set\n", " mean_length = np.mean([len(text) for text in train_ds['text']])\n", " train_ds = train_ds.filter(lambda x: len(x['text']) <= mean_length)\n", " tokenized_train_ds = train_ds.map(tokenize_func, batched=True, remove_columns=train_ds.column_names)\n", " tokenized_eval_ds = eval_ds.map(tokenize_func, batched=True, remove_columns=eval_ds.column_names)\n", " print(f\"Size of training set: {len(tokenized_train_ds)}\\nSize of evaluation set: {len(tokenized_eval_ds)}\")\n", " rephrased_train_ds = train_ds.map(rephrase_func)" ] }, { "cell_type": "code", "execution_count": 37, "id": "ebca75e9-3ef7-4998-830c-5f5b53be91f0", "metadata": { "tags": [] }, "outputs": [ { "ename": "SyntaxError", "evalue": "EOL while scanning string literal (3574162024.py, line 58)", "output_type": "error", "traceback": [ "\u001b[0;36m Cell \u001b[0;32mIn[37], line 58\u001b[0;36m\u001b[0m\n\u001b[0;31m train_ds = train_ds.filter(lambda x: len(x['text\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m EOL while scanning string literal\n" ] } ], "source": [ "from detoxify import Detoxify\n", 
"import numpy as np\n", "from datasets import load_dataset\n", "from transformers import AutoTokenizer\n", "\n", "# Load Detoxify models for expert and non-expert analysis\n", "model_expert = Detoxify('original')\n", "model_non_expert = Detoxify('unbiased') # Using a different variant for comparison\n", "\n", "# Example usage: Analyze a text for toxicity with both models\n", "text = \"Your text here to analyze for toxicity.\"\n", "results_expert = model_expert.predict(text)\n", "results_non_expert = model_non_expert.predict(text)\n", "\n", "print(\"Expert Model Results:\", results_expert)\n", "print(\"Non-Expert Model Results:\", results_non_expert)\n", "\n", "# Dataset processing\n", "dataset_name = \"allenai/real-toxicity-prompts\"\n", "raw_dataset = load_dataset(dataset_name, split=\"train\").flatten()\n", "\n", "def preprocess_func(sample):\n", " # Concatenate prompt and continuation text\n", " sample['text'] = f\"Prompt: {sample['prompt.text']}\\nContinuation: {sample['continuation.text']}\"\n", " return sample\n", "\n", "def tokenize_func(examples):\n", " return tokenizer(examples['text'], padding='max_length', truncation=True)\n", "\n", "# Prepare dataset\n", "block_size = 128\n", "\n", "def group_texts(examples):\n", " concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}\n", " total_length = len(concatenated_examples[list(examples.keys())[0]])\n", " if total_length >= block_size:\n", " total_length = (total_length // block_size) * block_size\n", " result = {\n", " k: [t[i: i + block_size] for i in range(0, total_length, block_size)]\n", " for k, t in concatenated_examples.items()\n", " }\n", " result[\"labels\"] = result[\"input_ids\"].copy()\n", " return result\n", "\n", "dataset = raw_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)\n", "train_data = dataset[\"train\"].select(indices=range(0, 1000))\n", "eval_data = dataset[\"test\"].select(indices=range(0, 400))\n", "\n", "model_id = \"facebook/opt-350m\"\n", 
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "tokenizer.pad_token = tokenizer.eos_token\n", "tokenizer.padding_side = \"right\"\n", "\n", "train_ds = train_data.map(preprocess_func, remove_columns=train_data.column_names)\n", "eval_ds = eval_data.map(preprocess_func, remove_columns=eval_data.column_names)\n", "\n", "mean_length = np.mean([len(text) for text in train_ds['text']])\n", "train_ds = train_ds.filter(lambda x: len(x['text\n" ] }, { "cell_type": "code", "execution_count": 38, "id": "d0a5a9d7-db99-4010-886d-5e23994a03a6", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f1b496dc39324fcfa7ef0d7942ad0bea", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Filter: 0%| | 0/1000 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "ename": "NameError", "evalue": "name 'tokenize_func' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[38], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m train_ds \u001b[38;5;241m=\u001b[39m train_ds\u001b[38;5;241m.\u001b[39mfilter(\u001b[38;5;28;01mlambda\u001b[39;00m x: \u001b[38;5;28mlen\u001b[39m(x[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m'\u001b[39m]) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m mean_length)\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m# Tokenize the filtered training and evaluation datasets\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m tokenized_train_ds \u001b[38;5;241m=\u001b[39m train_ds\u001b[38;5;241m.\u001b[39mmap(\u001b[43mtokenize_func\u001b[49m, batched\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, remove_columns\u001b[38;5;241m=\u001b[39mtrain_ds\u001b[38;5;241m.\u001b[39mcolumn_names)\n\u001b[1;32m 9\u001b[0m tokenized_eval_ds 
\u001b[38;5;241m=\u001b[39m eval_ds\u001b[38;5;241m.\u001b[39mmap(tokenize_func, batched\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, remove_columns\u001b[38;5;241m=\u001b[39meval_ds\u001b[38;5;241m.\u001b[39mcolumn_names)\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# Print the sizes of the tokenized datasets\u001b[39;00m\n", "\u001b[0;31mNameError\u001b[0m: name 'tokenize_func' is not defined" ] } ], "source": [ "# Select samples whose length is less than or equal to the mean length of the training set\n", "mean_length = np.mean([len(text) for text in train_ds['text']])\n", "\n", "# Filter the training dataset based on the mean length\n", "train_ds = train_ds.filter(lambda x: len(x['text']) <= mean_length)\n", "\n", "# Tokenize the filtered training and evaluation datasets\n", "tokenized_train_ds = train_ds.map(tokenize_func, batched=True, remove_columns=train_ds.column_names)\n", "tokenized_eval_ds = eval_ds.map(tokenize_func, batched=True, remove_columns=eval_ds.column_names)\n", "\n", "# Print the sizes of the tokenized datasets\n", "print(f\"Size of training set: {len(tokenized_train_ds)}\\nSize of evaluation set: {len(tokenized_eval_ds)}\")\n", "\n", "# Apply the rephrasing function to the training dataset\n", "rephrased_train_ds = train_ds.map(rephrase_func)\n" ] }, { "cell_type": "code", "execution_count": 39, "id": "bd115649-543d-4fbe-9039-f0023c32a57e", "metadata": { "tags": [] }, "outputs": [], "source": [ "def tokenize_func(examples):\n", " return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)\n" ] }, { "cell_type": "code", "execution_count": 40, "id": "c267d0e7-7ad8-4fda-835a-ee0b797791fb", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4dc5df94dcfd4555ab36f8d17d063990", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Filter: 0%| | 0/557 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4a6de693979b4d41a7345ba76fdd78f0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/280 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5ed9accbdb6b40dc8f6eaf58c057084f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/400 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Size of training set: 280\n", "Size of evaluation set: 400\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7e63e093c1e9458e8427e40db5a3829a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/280 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Define the tokenization function\n", "def tokenize_func(examples):\n", " return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)\n", "\n", "# Select samples whose length is less than or equal to the mean length of the training set\n", "mean_length = np.mean([len(text) for text in train_ds['text']])\n", "train_ds = train_ds.filter(lambda x: len(x['text']) <= mean_length)\n", "\n", "# Tokenize the filtered training and evaluation datasets\n", "tokenized_train_ds = train_ds.map(tokenize_func, batched=True, remove_columns=train_ds.column_names)\n", "tokenized_eval_ds = eval_ds.map(tokenize_func, batched=True, remove_columns=eval_ds.column_names)\n", "\n", "# Print the sizes of the tokenized datasets\n", "print(f\"Size of training set: {len(tokenized_train_ds)}\\nSize of evaluation set: {len(tokenized_eval_ds)}\")\n", "\n", "# Apply the rephrasing function to the training dataset\n", "rephrased_train_ds = train_ds.map(rephrase_func)\n" ] }, { "cell_type": "code",
"execution_count": 41, "id": "5fd9c8d2-c3bc-4bb2-b836-3513ac224746", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "379da08ce039455f8236caa296a800ee", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/280 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "21fea9a4d9f147c9aa9eac968db12a65", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/400 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "tokenized_train_ds = tokenized_train_ds.map(group_texts, batched=True)\n", "tokenized_eval_ds = tokenized_eval_ds.map(group_texts, batched=True)\n" ] }, { "cell_type": "code", "execution_count": 42, "id": "196df7fa-11a0-4413-b76d-0bb601566fc2", "metadata": { "tags": [] }, "outputs": [], "source": [
"# SFTTrainer is provided by trl (not peft); peft only supplies the LoRA configuration\n", "from peft import LoraConfig\n", "from transformers import TrainingArguments, AutoTokenizer\n", "from trl import SFTTrainer\n", "\n", "# Define PEFT configuration for LoRA\n", "peft_config = LoraConfig(\n", "    r=64,\n", "    lora_alpha=16,\n", "    lora_dropout=0.1,\n", "    bias=\"none\",\n", "    task_type=\"CAUSAL_LM\",\n", "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\"],\n", ")\n", "\n", "# Training arguments\n", "training_args = TrainingArguments(\n", "    output_dir=\"../models/opt-350m_CASUAL_LM\",\n", "    evaluation_strategy=\"epoch\",\n", "    per_device_train_batch_size=1,\n", "    per_device_eval_batch_size=1,\n", "    num_train_epochs=5,\n", "    learning_rate=1e-04,\n", "    max_grad_norm=0.3,\n", "    warmup_ratio=0.03,\n", "    lr_scheduler_type=\"cosine\"\n", ")\n", "\n", "# Define the model and tokenizer\n", "model_id = \"facebook/opt-350m\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "tokenizer.pad_token = tokenizer.eos_token\n", "\n", "# Prepare the trainer with model initialization arguments\n", "trainer = SFTTrainer(\n", "    model=model_id, # Pass model ID directly\n", "    tokenizer=tokenizer,\n", "    args=training_args,\n", "    train_dataset=rephrased_train_ds, # Rephrased training dataset\n", "    eval_dataset=tokenized_eval_ds, # Evaluation dataset\n", "    dataset_text_field=\"text\",\n", "    peft_config=peft_config,\n", "    max_seq_length=min(tokenizer.model_max_length, 512)\n", ")\n", "\n", "# Start the training process\n", "trainer.train()\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "34e14448-7ba2-4a0d-a662-e41bf92b123b", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "80ed60292a0d480bbcdca6ba424878c2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "pytorch_model.bin: 0%| | 0.00/663M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/app-root/lib64/python3.9/site-packages/transformers/modeling_utils.py:519: FutureWarning: You are using `torch.load` with `weights_only=False` (the
current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", " return torch.load(checkpoint_file, map_location=map_location)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "228fb93bff1b4b50ba679c50a3d63635", "version_major": 2, "version_minor": 0 }, "text/plain": [ "generation_config.json: 0%| | 0.00/137 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/app-root/lib64/python3.9/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n", " warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/opt/app-root/lib64/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "You're using a GPT2TokenizerFast tokenizer. 
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "data": { "text/html": [ "\n", "
Epoch | \n", "Training Loss | \n", "Validation Loss | \n", "
---|---|---|
1 | \n", "No log | \n", "0.952133 | \n", "
"
],
"text/plain": [
" "
],
"text/plain": [
"\n",
" \n",
"
\n",
" \n",
" \n",
" \n",
" Epoch \n",
" Training Loss \n",
" Validation Loss \n",
" \n",
" \n",
" 1 \n",
" 0.704300 \n",
" 0.959849 \n",
" \n",
" \n",
" \n",
"2 \n",
" 0.618400 \n",
" 0.948537 \n",
"