{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"machine_shape":"hm","gpuType":"A100","authorship_tag":"ABX9TyOcWa/R2MQZHg1iTqbsixCh"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","source":["# JGLUE"],"metadata":{"id":"vW5pSUvHPrgi"}},{"cell_type":"markdown","source":["https://github.com/Stability-AI/lm-evaluation-harness/tree/jp-stable"],"metadata":{"id":"8jqvft-4Pt3Q"}},{"cell_type":"markdown","source":["## HuggingFace ログイン(事前学習モデルのllama-2-13b-hfのダウンロードに必要)"],"metadata":{"id":"xNAr_zTpS2n1"}},{"cell_type":"code","source":["!pip install huggingface_hub"],"metadata":{"id":"85vaoDDrS708"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["!huggingface-cli login"],"metadata":{"id":"qdMlCfQqS9yO"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## JGLUEスクリプトの実行"],"metadata":{"id":"xzgw7KFSS30w"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"8dNUdnbwPRpm"},"outputs":[],"source":["!git clone -b jp-stable https://github.com/Stability-AI/lm-evaluation-harness.git\n","%cd lm-evaluation-harness\n","!pip install -e \".[ja]\""]},{"cell_type":"code","source":["!python main.py \\\n","    --model hf-causal-experimental \\\n","    --model_args \"pretrained=meta-llama/Llama-2-13b-hf,peft=HachiML/Llama-2-13b-hf-qlora-dolly-ja-2ep\" \\\n","    --tasks \"jsquad-1.1-0.3\" \\\n","    --num_fewshot \"2\" \\\n","    --batch_size 1 \\\n","    --device \"cuda\" \\\n","    --output_path 
\"/content/lm-evaluation-harness/result/result_jsquad.json\""],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"JVyyviF9ZiQU","executionInfo":{"status":"ok","timestamp":1691319117294,"user_tz":-540,"elapsed":512615,"user":{"displayName":"八木原統","userId":"03559086887314454384"}},"outputId":"0319169d-c5f4-42a7-da8b-275f53c085de"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["2023-08-06 10:43:29.368525: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n","Selected Tasks: ['jsquad-1.1-0.3']\n","Loading checkpoint shards: 100% 3/3 [00:02<00:00,  1.37it/s]\n","/content/lm-evaluation-harness/lm_eval/tasks/ja/jsquad.py:75: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n","  self.jasquad_metric = datasets.load_metric(jasquad.__file__)\n","Running greedy_until requests\n","0it [00:00, ?it/s]\n","{\n","  \"results\": {\n","    \"jsquad-1.1-0.3\": {\n","      \"exact_match\": 62.83205763169743,\n","      \"f1\": 77.09913819742155\n","    }\n","  },\n","  \"versions\": {\n","    \"jsquad-1.1-0.3\": 1.1\n","  },\n","  \"config\": {\n","    \"model\": \"hf-causal-experimental\",\n","    \"model_args\": \"pretrained=meta-llama/Llama-2-13b-hf,peft=HachiML/Llama-2-13b-hf-qlora-dolly-ja-2ep\",\n","    \"num_fewshot\": 2,\n","    \"batch_size\": 1,\n","    \"device\": \"cuda\",\n","    \"no_cache\": false,\n","    \"limit\": null,\n","    \"bootstrap_iters\": 100000,\n","    \"description_dict\": {}\n","  }\n","}\n","hf-causal-experimental (pretrained=meta-llama/Llama-2-13b-hf,peft=HachiML/Llama-2-13b-hf-qlora-dolly-ja-2ep), limit: None, provide_description: False, num_fewshot: 2, batch_size: 1\n","|     Task     |Version|  Metric   | Value |   |Stderr|\n","|--------------|------:|-----------|------:|---|------|\n","|jsquad-1.1-0.3|    
1.1|exact_match|62.8321|   |      |\n","|              |       |f1         |77.0991|   |      |\n","\n"]}]},{"cell_type":"code","source":["!python main.py \\\n","    --model hf-causal-experimental \\\n","    --model_args \"pretrained=meta-llama/Llama-2-13b-hf,peft=HachiML/Llama-2-13b-hf-qlora-dolly-ja-2ep\" \\\n","    --tasks \"jcommonsenseqa-1.1-0.3\" \\\n","    --num_fewshot \"3\" \\\n","    --batch_size 1 \\\n","    --device \"cuda\" \\\n","    --output_path \"/content/lm-evaluation-harness/result/result_jcommonsenseqa.json\""],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"i1gV8hDmmRRh","executionInfo":{"status":"ok","timestamp":1691322994804,"user_tz":-540,"elapsed":1773681,"user":{"displayName":"八木原統","userId":"03559086887314454384"}},"outputId":"262839c7-b9c1-4d75-dbff-06450456f42a"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["2023-08-06 11:27:05.800305: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n","Selected Tasks: ['jcommonsenseqa-1.1-0.3']\n","Loading checkpoint shards: 100% 3/3 [00:02<00:00,  1.39it/s]\n","Running loglikelihood requests\n","100% 5595/5595 [21:13<00:00,  4.39it/s]\n","{\n","  \"results\": {\n","    \"jcommonsenseqa-1.1-0.3\": {\n","      \"acc\": 0.7578194816800715,\n","      \"acc_stderr\": 0.012812432289317893,\n","      \"acc_norm\": 0.4280607685433423,\n","      \"acc_norm_stderr\": 0.014798127177394432\n","    }\n","  },\n","  \"versions\": {\n","    \"jcommonsenseqa-1.1-0.3\": 1.1\n","  },\n","  \"config\": {\n","    \"model\": \"hf-causal-experimental\",\n","    \"model_args\": \"pretrained=meta-llama/Llama-2-13b-hf,peft=HachiML/Llama-2-13b-hf-qlora-dolly-ja-2ep\",\n","    \"num_fewshot\": 3,\n","    \"batch_size\": 1,\n","    \"device\": \"cuda\",\n","    \"no_cache\": false,\n","    \"limit\": null,\n","    \"bootstrap_iters\": 100000,\n","    \"description_dict\": {}\n","  }\n","}\n","hf-causal-experimental 
(pretrained=meta-llama/Llama-2-13b-hf,peft=HachiML/Llama-2-13b-hf-qlora-dolly-ja-2ep), limit: None, provide_description: False, num_fewshot: 3, batch_size: 1\n","|         Task         |Version| Metric |Value |   |Stderr|\n","|----------------------|------:|--------|-----:|---|-----:|\n","|jcommonsenseqa-1.1-0.3|    1.1|acc     |0.7578|±  |0.0128|\n","|                      |       |acc_norm|0.4281|±  |0.0148|\n","\n"]}]},{"cell_type":"code","source":["!python main.py \\\n","    --model hf-causal-experimental \\\n","    --model_args \"pretrained=meta-llama/Llama-2-13b-hf,peft=HachiML/Llama-2-13b-hf-qlora-dolly-ja-2ep\" \\\n","    --tasks \"jnli-1.1-0.3\" \\\n","    --num_fewshot \"3\" \\\n","    --batch_size 8 \\\n","    --device \"cuda\" \\\n","    --output_path \"/content/lm-evaluation-harness/result/result_jnli.json\""],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"VM1EUhLTsfi_","executionInfo":{"status":"ok","timestamp":1691326111411,"user_tz":-540,"elapsed":2017998,"user":{"displayName":"八木原統","userId":"03559086887314454384"}},"outputId":"5963f031-6056-4389-a891-cc3ae1cf6d25"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["2023-08-06 12:14:57.989717: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n","Selected Tasks: ['jnli-1.1-0.3']\n","Loading checkpoint shards: 100% 3/3 [00:02<00:00,  1.38it/s]\n","Running loglikelihood requests\n","100% 7006/7006 [25:13<00:00,  4.63it/s]\n","{\n","  \"results\": {\n","    \"jnli-1.1-0.3\": {\n","      \"acc\": 0.5069843878389483,\n","      \"acc_stderr\": 0.010135765974065071,\n","      \"acc_norm\": 0.3056696795398521,\n","      \"acc_norm_stderr\": 0.009339813231542836\n","    }\n","  },\n","  \"versions\": {\n","    \"jnli-1.1-0.3\": 1.1\n","  },\n","  \"config\": {\n","    \"model\": \"hf-causal-experimental\",\n","    \"model_args\": 
\"pretrained=meta-llama/Llama-2-13b-hf,peft=HachiML/Llama-2-13b-hf-qlora-dolly-ja-2ep\",\n","    \"num_fewshot\": 3,\n","    \"batch_size\": 8,\n","    \"device\": \"cuda\",\n","    \"no_cache\": false,\n","    \"limit\": null,\n","    \"bootstrap_iters\": 100000,\n","    \"description_dict\": {}\n","  }\n","}\n","hf-causal-experimental (pretrained=meta-llama/Llama-2-13b-hf,peft=HachiML/Llama-2-13b-hf-qlora-dolly-ja-2ep), limit: None, provide_description: False, num_fewshot: 3, batch_size: 8\n","|    Task    |Version| Metric |Value |   |Stderr|\n","|------------|------:|--------|-----:|---|-----:|\n","|jnli-1.1-0.3|    1.1|acc     |0.5070|±  |0.0101|\n","|            |       |acc_norm|0.3057|±  |0.0093|\n","\n"]}]},{"cell_type":"code","source":["!python main.py \\\n","    --model hf-causal-experimental \\\n","    --model_args \"pretrained=meta-llama/Llama-2-13b-hf,peft=HachiML/Llama-2-13b-hf-qlora-dolly-ja-2ep\" \\\n","    --tasks \"marc_ja-1.1-0.3\" \\\n","    --num_fewshot \"3\" \\\n","    --batch_size 2 \\\n","    --device \"cuda\" \\\n","    --output_path \"/content/lm-evaluation-harness/result/result_marc_ja.json\""],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Z1h0sHgoz3hb","executionInfo":{"status":"ok","timestamp":1691330876590,"user_tz":-540,"elapsed":4750329,"user":{"displayName":"八木原統","userId":"03559086887314454384"}},"outputId":"0811254a-fa5c-457a-da8e-7137e3065d63"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["2023-08-06 12:48:50.938519: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n","Selected Tasks: ['marc_ja-1.1-0.3']\n","Loading checkpoint shards: 100% 3/3 [00:02<00:00,  1.39it/s]\n","Running loglikelihood requests\n","100% 10006/10006 [1:10:23<00:00,  2.37it/s]\n","{\n","  \"results\": {\n","    \"marc_ja-1.1-0.3\": {\n","      \"acc\": 0.7964273081004598,\n","      \"acc_stderr\": 0.005355417561710155,\n","      \"acc_norm\": 
0.7964273081004598,\n","      \"acc_norm_stderr\": 0.005355417561710155\n","    }\n","  },\n","  \"versions\": {\n","    \"marc_ja-1.1-0.3\": 1.1\n","  },\n","  \"config\": {\n","    \"model\": \"hf-causal-experimental\",\n","    \"model_args\": \"pretrained=meta-llama/Llama-2-13b-hf,peft=HachiML/Llama-2-13b-hf-qlora-dolly-ja-2ep\",\n","    \"num_fewshot\": 3,\n","    \"batch_size\": 2,\n","    \"device\": \"cuda\",\n","    \"no_cache\": false,\n","    \"limit\": null,\n","    \"bootstrap_iters\": 100000,\n","    \"description_dict\": {}\n","  }\n","}\n","hf-causal-experimental (pretrained=meta-llama/Llama-2-13b-hf,peft=HachiML/Llama-2-13b-hf-qlora-dolly-ja-2ep), limit: None, provide_description: False, num_fewshot: 3, batch_size: 2\n","|     Task      |Version| Metric |Value |   |Stderr|\n","|---------------|------:|--------|-----:|---|-----:|\n","|marc_ja-1.1-0.3|    1.1|acc     |0.7964|±  |0.0054|\n","|               |       |acc_norm|0.7964|±  |0.0054|\n","\n"]}]},{"cell_type":"markdown","source":["JGLUE実行後、結果ファイルはローカルに取得したが、Google driveをマウントしていればそこに保存しても良いと思う。なぜか、Colabを数時間実行しているとdriveのマウントが外れるエラーがたまに出るためローカルに落とす様に今回はした。"],"metadata":{"id":"_QTxh2qDVuwI"}},{"cell_type":"code","source":["from google.colab import files\n","\n","files.download('/content/lm-evaluation-harness/result/result_jsquad.json')\n","files.download('/content/lm-evaluation-harness/result/result_jcommonsenseqa.json')\n","files.download('/content/lm-evaluation-harness/result/result_jnli.json')\n","files.download('/content/lm-evaluation-harness/result/result_marc_ja.json')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":34},"id":"q2IV2w8423zg","executionInfo":{"status":"ok","timestamp":1691330876591,"user_tz":-540,"elapsed":9,"user":{"displayName":"八木原統","userId":"03559086887314454384"}},"outputId":"fe4907d0-2ccf-4c2f-d6ee-cc5f89b355ba"},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":["<IPython.core.display.Javascript 
object>"],"application/javascript":["\n","    async function download(id, filename, size) {\n","      if (!google.colab.kernel.accessAllowed) {\n","        return;\n","      }\n","      const div = document.createElement('div');\n","      const label = document.createElement('label');\n","      label.textContent = `Downloading \"${filename}\": `;\n","      div.appendChild(label);\n","      const progress = document.createElement('progress');\n","      progress.max = size;\n","      div.appendChild(progress);\n","      document.body.appendChild(div);\n","\n","      const buffers = [];\n","      let downloaded = 0;\n","\n","      const channel = await google.colab.kernel.comms.open(id);\n","      // Send a message to notify the kernel that we're ready.\n","      channel.send({})\n","\n","      for await (const message of channel.messages) {\n","        // Send a message to notify the kernel that we're ready.\n","        channel.send({})\n","        if (message.buffers) {\n","          for (const buffer of message.buffers) {\n","            buffers.push(buffer);\n","            downloaded += buffer.byteLength;\n","            progress.value = downloaded;\n","          }\n","        }\n","      }\n","      const blob = new Blob(buffers, {type: 'application/binary'});\n","      const a = document.createElement('a');\n","      a.href = window.URL.createObjectURL(blob);\n","      a.download = filename;\n","      div.appendChild(a);\n","      a.click();\n","      div.remove();\n","    }\n","  "]},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["<IPython.core.display.Javascript object>"],"application/javascript":["download(\"download_500cc0c5-8473-4de0-ac89-39543dc96377\", \"result_marc_ja.json\", 588)"]},"metadata":{}}]},{"cell_type":"markdown","source":["ちなみに、ベンチマークタスクは以下の様に\",\"区切りで指定すれば複数一気に行うことができる。なぜか単発毎よりも実行時間がかかりそうだったため今回は個別に行った。"],"metadata":{"id":"ce4HppNNVC7B"}},{"cell_type":"code","source":["!python main.py \\\n","    --model 
hf-causal-experimental \\\n","    --model_args \"pretrained=meta-llama/Llama-2-13b-hf,peft=HachiML/Llama-2-13b-hf-qlora-dolly-ja-2ep\" \\\n","    --tasks \"jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3\" \\\n","    --num_fewshot \"2,3,3,3\" \\\n","    --batch_size 1 \\\n","    --device \"cuda\" \\\n","    --output_path \"result.json\""],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Rho49R3iPulh","executionInfo":{"status":"ok","timestamp":1691316334804,"user_tz":-540,"elapsed":9624749,"user":{"displayName":"八木原統","userId":"03559086887314454384"}},"outputId":"9dc02616-0d85-4ac8-da43-80cc0b7ba760"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["2023-08-06 07:25:14.257072: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n","Selected Tasks: ['jsquad-1.1-0.3', 'jcommonsenseqa-1.1-0.3', 'jnli-1.1-0.3', 'marc_ja-1.1-0.3']\n","Loading checkpoint shards: 100% 3/3 [00:02<00:00,  1.38it/s]\n","/content/lm-evaluation-harness/lm_eval/tasks/ja/jsquad.py:75: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. 
Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n","  self.jasquad_metric = datasets.load_metric(jasquad.__file__)\n","Running greedy_until requests\n","100% 4442/4442 [2:16:25<00:00,  1.84s/it]\n","Running loglikelihood requests\n","  5% 1302/24205 [14:46<4:19:50,  1.47it/s]\n","Traceback (most recent call last):\n","  File \"/content/lm-evaluation-harness/main.py\", line 122, in <module>\n","    main()\n","  File \"/content/lm-evaluation-harness/main.py\", line 91, in main\n","    results = evaluator.simple_evaluate(\n","  File \"/content/lm-evaluation-harness/lm_eval/utils.py\", line 185, in _wrapper\n","    return fn(*args, **kwargs)\n","  File \"/content/lm-evaluation-harness/lm_eval/evaluator.py\", line 87, in simple_evaluate\n","    results = evaluate(\n","  File \"/content/lm-evaluation-harness/lm_eval/utils.py\", line 185, in _wrapper\n","    return fn(*args, **kwargs)\n","  File \"/content/lm-evaluation-harness/lm_eval/evaluator.py\", line 273, in evaluate\n","    resps = getattr(lm, reqtype)([req.args for req in reqs])\n","  File \"/content/lm-evaluation-harness/lm_eval/base.py\", line 852, in fn\n","    rem_res = getattr(self.lm, attr)(remaining_reqs)\n","  File \"/content/lm-evaluation-harness/lm_eval/base.py\", line 191, in loglikelihood\n","    return self._loglikelihood_tokens(new_reqs)\n","  File \"/content/lm-evaluation-harness/lm_eval/base.py\", line 302, in _loglikelihood_tokens\n","    ).cpu()  # [batch, padding_length, vocab]\n","KeyboardInterrupt\n","^C\n"]}]}]}