File size: 9,997 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6221e83-9d8f-4716-aeda-b40847931f56",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%%bash\n",
    "git clone https://github.com/philschmid/llmperf.git\n",
    "cd llmperf\n",
    "pip install -e . -q"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "602a8c54-b434-4d8e-bc72-824c642fbdb5",
   "metadata": {},
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73b1aa22-a1e3-4a1e-9dd2-042ab0f5939a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "import json\n",
    "from getpass import getpass\n",
    "import subprocess\n",
    "import os\n",
    "from datetime import datetime\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from huggingface_hub import notebook_login, create_inference_endpoint, list_inference_endpoints, whoami, get_inference_endpoint, get_token\n",
    "from pathlib import Path\n",
    "from tqdm.notebook import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "772897cb-c2b1-4f9a-8143-ad64aed40b5b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f951213-46a1-4db9-be2c-51c2291ecdc2",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "proj_dir = Path.cwd()\n",
    "print(proj_dir)\n",
    "LLMPerf_path = proj_dir/'llmperf'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "267ea96b-b756-4e16-b41a-fee2119edf76",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d3341f2-217e-42a5-89fb-1653fd418c48",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Endpoint\n",
    "ENDPOINT_NAME=\"mixtral-exp\"\n",
    "NAMESPACE = 'HF-test-lab'\n",
    "MODEL = 'TheBloke/mixtral-8x7b-v0.1-GPTQ'\n",
    "INSTANCE_TYPE = 'nvidia-l4_AWQ'\n",
    "\n",
    "# Simulation\n",
    "RESULTS_DIR = proj_dir/'tgi_benchmark_results'/INSTANCE_TYPE\n",
    "tgi_bss = [1]\n",
    "INPUT_TOKENS = 800\n",
    "OUTPUT_TOKENS = 1600"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6bbb792-b168-42b8-bff1-c6ea9f6daf79",
   "metadata": {},
   "source": [
    "# Endpoint setup"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8610e033-8586-495a-943e-539b7c8304d0",
   "metadata": {},
   "source": [
    "Be sure to configure your endpoint how you desire, I made some guesses on what you might want in the `env`. You can see some settings in the [pricing section](https://huggingface.co/docs/inference-endpoints/en/pricing#gpu-instances) of the docs. I would also recommend manually deploying once and using  `get_inference_endpoint().__dict__` to double check your settings just to double check."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae923833-8ca1-4d16-85be-a78ffb386c43",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def create_endpoint(MAX_BATCH_SIZE, name, instance_type):\n",
    "    try:\n",
    "        endpoint = get_inference_endpoint(name=name, namespace=NAMESPACE)\n",
    "        endpoint.wait()\n",
    "        return endpoint\n",
    "    except:\n",
    "        pass\n",
    "    try:\n",
    "        endpoint = create_inference_endpoint(\n",
    "            name,\n",
    "            repository=MODEL,\n",
    "            task=\"text-generation\",\n",
    "            framework=\"pytorch\",\n",
    "            region=\"us-east-1\",\n",
    "            vendor=\"aws\",\n",
    "            accelerator=\"gpu\",\n",
    "            instance_size=\"x4\",\n",
    "            instance_type='nvidia-l4',\n",
    "            min_replica=0,\n",
    "            max_replica=1,\n",
    "            namespace=NAMESPACE,\n",
    "            custom_image={\n",
    "                \"health_route\": \"/health\",\n",
    "                \"env\": {\n",
    "                    \"MAX_INPUT_LENGTH\": f\"{INPUT_TOKENS+50}\",\n",
    "                    \"MAX_TOTAL_TOKENS\": f\"{INPUT_TOKENS + OUTPUT_TOKENS}\",\n",
    "                    \"MAX_BATCH_SIZE\": f\"{MAX_BATCH_SIZE}\",\n",
    "                    \"HF_TOKEN\": get_token(),\n",
    "                    \"QUANTIZE\":\"awq\",\n",
    "                    \"MODEL_ID\": \"/repository\",\n",
    "                },\n",
    "                \"url\": \"ghcr.io/huggingface/text-generation-inference:2.2.0\",\n",
    "            },\n",
    "            type=\"protected\",\n",
    "        )\n",
    "        endpoint.wait()\n",
    "    except Exception as create_error:\n",
    "        print(f\"Failed to create inference endpoint: {str(create_error)}\")\n",
    "        return None\n",
    "\n",
    "    return endpoint"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5e55710d-fa77-41b7-ae9c-a4826140f6b6",
   "metadata": {},
   "source": [
    "Make sure to check the command to make sure it matches what you expect. Also check the summary stats json to see what actually happened."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "491b82b3-4db8-4409-85ce-7c003a6c2f6f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def run_command(batch_size, endpoint, tgi_bs):\n",
    "    prefix = f'tgibs_{tgi_bs}__bs_{batch_size}'\n",
    "    vu = batch_size\n",
    "\n",
    "    # Set environment variables\n",
    "    env = os.environ.copy()\n",
    "    env['HUGGINGFACE_API_BASE'] = endpoint.url\n",
    "    env['HUGGINGFACE_API_TOKEN'] = get_token()\n",
    "    env['MODEL_ID'] = MODEL\n",
    "    # Convert pathlib.Path to string and append to PYTHONPATH\n",
    "    env['PYTHONPATH'] = str(LLMPerf_path) + (os.pathsep + env.get('PYTHONPATH', ''))\n",
    "\n",
    "    # Define the benchmark script path\n",
    "    benchmark_script = str(LLMPerf_path / \"token_benchmark_ray.py\")\n",
    "\n",
    "    if not os.path.isfile(benchmark_script):\n",
    "        print(f\"LLMPerf script not found at {benchmark_script}, please ensure the path is correct.\")\n",
    "        return \"Script not found\", False\n",
    "\n",
    "    # Calculate the max number of completed requests\n",
    "    max_requests = vu * 8\n",
    "\n",
    "    # Generate the results directory name\n",
    "    date_str = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')\n",
    "    results_dir = RESULTS_DIR / f\"{date_str}_{prefix}\"\n",
    "\n",
    "    # Construct the command to run the benchmark script\n",
    "    command = [\n",
    "        \"python\", benchmark_script,\n",
    "        \"--model\", f\"{MODEL}\",\n",
    "        \"--mean-input-tokens\", f\"{INPUT_TOKENS}\",\n",
    "        \"--stddev-input-tokens\", \"10\",\n",
    "        \"--mean-output-tokens\", f\"{OUTPUT_TOKENS}\",\n",
    "        \"--stddev-output-tokens\", \"5\",\n",
    "        \"--max-num-completed-requests\", str(min(max_requests, 1500)),\n",
    "        \"--timeout\", \"7200\",\n",
    "        \"--num-concurrent-requests\", str(vu),\n",
    "        \"--results-dir\", str(results_dir),\n",
    "        \"--llm-api\", \"huggingface\",\n",
    "        \"--additional-sampling-params\", '{}'\n",
    "    ]\n",
    "\n",
    "    # Run the command with the modified environment\n",
    "    try:\n",
    "        result = subprocess.check_output(command, stderr=subprocess.STDOUT, env=env).decode('utf-8')\n",
    "        return result, True\n",
    "    except subprocess.CalledProcessError as e:\n",
    "        print(f\"Error with batch size {batch_size}: {e.output.decode()}\")\n",
    "        return e.output.decode(), False\n",
    "\n",
    "def find_max_working_batch_size(endpoint, tgi_bs):\n",
    "    batch_sizes = [8, 16, 32]\n",
    "    max_working = None\n",
    "    for size in tqdm(batch_sizes):\n",
    "        tqdm.write(f\"Running: TGIBS {tgi_bs} Client Requests {size}\")\n",
    "        output, success = run_command(size, endpoint, tgi_bs)\n",
    "        if success:\n",
    "            max_working = size\n",
    "        else:\n",
    "            break\n",
    "    if max_working is None:\n",
    "        return \"No working batch size found in the provided list\"\n",
    "    return max_working"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d32b71a7-371f-4f80-a9f2-2cfc65e04afd",
   "metadata": {},
   "source": [
    "Here Im creating the endpoint and then running the simulation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70a11c08-0bea-43d6-85eb-ef014473c9f1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "for tgi_bs in tqdm(tgi_bss):\n",
    "    name = f\"{ENDPOINT_NAME}--tgibs-{tgi_bs}\"\n",
    "    try:\n",
    "        endpoint = get_inference_endpoint(name, namespace=NAMESPACE)\n",
    "    except:\n",
    "        endpoint = create_endpoint(MAX_BATCH_SIZE=tgi_bs, name=name, instance_type=INSTANCE_TYPE) \n",
    "        pass\n",
    "    endpoint.wait()\n",
    "    tqdm.write(f\"Endpoint Created: {name}\")\n",
    "    max_batch_size = find_max_working_batch_size(endpoint=endpoint, tgi_bs=tgi_bs)\n",
    "    endpoint.delete()\n",
    "    tqdm.write(f\"Endpoint Deleted: {name}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70a5f441-3da7-4888-9943-112750681067",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}