mesolitica
/

malaysian-tinyllama-1.1b-16k-instructions-v3-AWQ

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "19fe0df6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One-time setup: prebuilt AutoAWQ 0.1.8 wheel (cu118 / cp310 / linux_x86_64 build).\n",
+    "# Left commented out; uncomment to install. %pip is used instead of !pip3 so the wheel is\n",
+    "# installed into the environment of the running kernel.\n",
+    "# %pip install https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.8/autoawq-0.1.8+cu118-cp310-cp310-linux_x86_64.whl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "20861f3e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from awq import AutoAWQForCausalLM\n",
+    "from transformers import (\n",
+    "    AutoConfig,\n",
+    "    AwqConfig,\n",
+    "    AutoTokenizer,\n",
+    "    AutoModelForCausalLM,\n",
+    ")\n",
+    "import torch\n",
+    "\n",
+    "# Hub id of the source checkpoint that the rest of the notebook loads and quantizes.\n",
+    "model_path = 'mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9939ad4e",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Load the source checkpoint with bfloat16 weights.\n",
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "    model_path,\n",
+    "    torch_dtype=torch.bfloat16,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "fdb86f50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import shutil\n",
+    "\n",
+    "# Drop any export left over from a previous run before the checkpoint is written to ./test.\n",
+    "# In-process replacement for the shell `rm -rf test`; ignore_errors=True keeps a missing\n",
+    "# directory from raising, matching the -f behaviour.\n",
+    "shutil.rmtree('test', ignore_errors=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "72e76288",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Export the loaded checkpoint to ./test with safetensors serialization turned off.\n",
+    "model.save_pretrained(\n",
+    "    './test',\n",
+    "    safe_serialization=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "aa245150",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reload the local export through AutoAWQ; from here on `model` is the AutoAWQ object,\n",
+    "# replacing the transformers model bound to the same name earlier.\n",
+    "model = AutoAWQForCausalLM.from_pretrained(\n",
+    "    './test',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d3949cf4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "AWQ: 100%|██████████| 22/22 [02:25<00:00,  6.59s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Local directory that will receive the quantized export.\n",
+    "quant_path = 'malaysian-tinyllama-1.1b-16k-instructions-v3-awq'\n",
+    "\n",
+    "# AutoAWQ settings: w_bit=4, q_group_size=128, zero_point enabled, version 'GEMM'.\n",
+    "quant_config = dict(zero_point=True, q_group_size=128, w_bit=4, version='GEMM')\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n",
+    "\n",
+    "# Run AWQ quantization on `model`, calibrating on the mesolitica/malaysian-calibration data.\n",
+    "model.quantize(\n",
+    "    tokenizer,\n",
+    "    quant_config=quant_config,\n",
+    "    calib_data='mesolitica/malaysian-calibration',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ee290c1e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:root:`quant_config.json` is being deprecated in the future in favor of quantization_config in config.json.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "('malaysian-tinyllama-1.1b-16k-instructions-v3-awq/tokenizer_config.json',\n",
+       " 'malaysian-tinyllama-1.1b-16k-instructions-v3-awq/special_tokens_map.json',\n",
+       " 'malaysian-tinyllama-1.1b-16k-instructions-v3-awq/tokenizer.json')"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Write the quantized weights (safetensors disabled) into quant_path ...\n",
+    "model.save_quantized(\n",
+    "    quant_path,\n",
+    "    safetensors=False,\n",
+    ")\n",
+    "# ... and store the tokenizer files next to them; the returned paths are the cell output.\n",
+    "tokenizer.save_pretrained(quant_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "737f2403",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3-AWQ/commit/909193caec2d94a495535d9033f5dcd975686356', commit_message='Upload tokenizer', commit_description='', oid='909193caec2d94a495535d9033f5dcd975686356', pr_url=None, pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Publish the tokenizer files to the AWQ model repository on the Hub.\n",
+    "tokenizer.push_to_hub(\n",
+    "    'mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3-AWQ',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "ed92c8ee",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3-AWQ/commit/a259a50f290eb1e648396698d9a0dcac7d33d5a2', commit_message='Upload config', commit_description='', oid='a259a50f290eb1e648396698d9a0dcac7d33d5a2', pr_url=None, pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Express the AutoAWQ quant_config as a transformers AwqConfig (same values, transformers\n",
+    "# argument names; the version string is lower-cased).\n",
+    "quantization_config = AwqConfig(\n",
+    "    backend='autoawq',\n",
+    "    version=quant_config['version'].lower(),\n",
+    "    zero_point=quant_config['zero_point'],\n",
+    "    group_size=quant_config['q_group_size'],\n",
+    "    bits=quant_config['w_bit'],\n",
+    ")\n",
+    "\n",
+    "# Take the source model's config, attach the quantization settings, and push it to the\n",
+    "# AWQ repository.\n",
+    "config = AutoConfig.from_pretrained(model_path)\n",
+    "config.quantization_config = quantization_config\n",
+    "\n",
+    "config.push_to_hub(\n",
+    "    'mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3-AWQ',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "c74b2f45",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "config.json\t\tquant_config.json\t tokenizer_config.json\r\n",
+      "generation_config.json\tspecial_tokens_map.json\r\n",
+      "pytorch_model.bin\ttokenizer.json\r\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "# List the exported files in-process. A `!ls` shell escape forks the kernel after the\n",
+    "# tokenizer has already used parallelism, which makes huggingface/tokenizers print its\n",
+    "# fork warning and disable parallelism. Reusing quant_path also avoids repeating the\n",
+    "# directory name as a literal.\n",
+    "sorted(os.listdir(quant_path))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "2e0fb591",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import HfApi\n",
+    "\n",
+    "# Hub client used for the single-file uploads that follow.\n",
+    "api = HfApi()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "dd06cfa2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2c8f617cd86c41e9ae97e6ed0e9cca0a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "pytorch_model.bin:   0%|          | 0.00/766M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3-AWQ/commit/704674a60b13d46f7aa6bc4c7f08fd1fbf52aa01', commit_message='Upload pytorch_model.bin with huggingface_hub', commit_description='', oid='704674a60b13d46f7aa6bc4c7f08fd1fbf52aa01', pr_url=None, pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Upload the quantized weight file to the root of the AWQ model repository.\n",
+    "api.upload_file(\n",
+    "    repo_id='mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3-AWQ',\n",
+    "    repo_type='model',\n",
+    "    path_in_repo='pytorch_model.bin',\n",
+    "    path_or_fileobj='malaysian-tinyllama-1.1b-16k-instructions-v3-awq/pytorch_model.bin',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "1383ff2c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3-AWQ/commit/ff8dcd9a8d0fce1c30a19c0e2c9cea3f4efbcb28', commit_message='Upload quant_config.json with huggingface_hub', commit_description='', oid='ff8dcd9a8d0fce1c30a19c0e2c9cea3f4efbcb28', pr_url=None, pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Upload AutoAWQ's quant_config.json to the root of the AWQ model repository.\n",
+    "api.upload_file(\n",
+    "    repo_id='mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3-AWQ',\n",
+    "    repo_type='model',\n",
+    "    path_in_repo='quant_config.json',\n",
+    "    path_or_fileobj='malaysian-tinyllama-1.1b-16k-instructions-v3-awq/quant_config.json',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "5852ec02",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The history saving thread hit an unexpected error (OperationalError('database is locked')).History will not be written to the database.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5429442d416848c99e511592225aae50",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json:   0%|          | 0.00/965 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4d153ddd40174e7d8c6e6c26e6593c0a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "pytorch_model.bin:   0%|          | 0.00/766M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Smoke test: pull the published AWQ checkpoint back from the Hub and place it directly on\n",
+    "# the GPU. Loading on CPU first and moving it with .cuda() afterwards makes transformers\n",
+    "# warn that an AWQ model was loaded on CPU while a CUDA device is available.\n",
+    "quantized_model = AutoModelForCausalLM.from_pretrained(\n",
+    "    'mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3-AWQ',\n",
+    "    device_map='cuda:0',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "66895e20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Single user turn, rendered to a plain prompt string by the tokenizer's chat template.\n",
+    "messages = [{'role': 'user', 'content': 'KWSP tu apa'}]\n",
+    "prompt = tokenizer.apply_chat_template(messages, tokenize=False)\n",
+    "\n",
+    "# Tokenize without adding special tokens again and move the tensors to the GPU.\n",
+    "inputs = tokenizer(\n",
+    "    [prompt],\n",
+    "    return_tensors='pt',\n",
+    "    add_special_tokens=False,\n",
+    ").to('cuda')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "4b320f33",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 6.36 s, sys: 6.36 ms, total: 6.37 s\n",
+      "Wall time: 6.47 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'<s> [INST] KWSP tu apa [/INST]KWSP, singkatan untuk Kumpulan Wang Simpanan Pekerja, ialah Kumpulan Wang Simpanan Pekerja di Malaysia, yang merupakan dana simpanan pekerja. KWSP bertujuan untuk menyediakan simpanan persaraan dan faedah keselamatan sosial untuk ahli KWSP (dana caruman dan majikan) dengan menyediakan pelaburan boleh beli. Dana ini diuruskan oleh KWSP, sebuah syarikat milik kerajaan di Malaysia. KWSP terdiri daripada simpanan caruman pekerja dan simpanan majikan, dengan peruntukan yang berkaitan dengan skim simpanan yang berbeza di Malaysia. Dana ini menggunakan pelaburan yang dibuat oleh pencarum dan pembayar caruman untuk menjana dividen dan faedah, yang seterusnya menyokong matlamat kerajaan untuk menyediakan simpanan persaraan yang selamat dan mencukupi untuk pekerja.</s>'"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "# Tokenized prompt tensors first, then the sampling settings.\n",
+    "generate_kwargs = {\n",
+    "    **inputs,\n",
+    "    'max_new_tokens': 1024,\n",
+    "    'top_p': 0.95,\n",
+    "    'top_k': 50,\n",
+    "    'temperature': 0.9,\n",
+    "    'do_sample': True,\n",
+    "    'num_beams': 1,\n",
+    "}\n",
+    "r = quantized_model.generate(**generate_kwargs)\n",
+    "\n",
+    "# Decode the first (only) sequence, special tokens included, as the cell output.\n",
+    "tokenizer.decode(r[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a9a93555",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}