TGI/vLLM benchmarking (#34)
- scripts/inference-server/.gitignore +2 -0
- scripts/inference-server/Dockerfile +16 -0
- scripts/inference-server/README.md +5 -0
- scripts/inference-server/benchmark.py +323 -0
- scripts/inference-server/local-tokenizers/README.md +79 -0
- scripts/inference-server/local-tokenizers/meta-llama/Llama-2-70b-chat-hf/tokenizer_config.json +36 -0
- scripts/inference-server/local-tokenizers/meta-llama/Llama-2-7b-chat-hf/tokenizer_config.json +22 -0
- scripts/inference-server/local-tokenizers/mistralai/Mistral-7B-Instruct-v0.2/chat_template.jinja +1 -0
- scripts/inference-server/local-tokenizers/mistralai/Mistral-7B-Instruct-v0.2/tokenizer_config.json +5 -0
- scripts/inference-server/requirements.txt +7 -0
- sharegpt/README.md +31 -6
- sharegpt/ShareGPT_V3_filtered_500.json +0 -0
- sharegpt/compare_distributions.py +62 -0
- sharegpt/filter_dataset.py +107 -0
scripts/inference-server/.gitignore
ADDED
@@ -0,0 +1,2 @@
test*
temp*
scripts/inference-server/Dockerfile
ADDED
@@ -0,0 +1,16 @@
# docker build -t benchmark:latest .

# Use an official Python runtime as a parent image
FROM python:3.9

# Set the working directory in the container
WORKDIR /benchmark

# Copy the current directory contents into the container at /benchmark
COPY . .

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Run benchmark.py when the container launches
ENTRYPOINT ["python", "benchmark.py"]
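Because of the `ENTRYPOINT`, any arguments given to `docker run` are forwarded straight to `benchmark.py`. A minimal sketch of building and running the image (the model name, dataset path, and the GPU/host-networking flags are assumptions about a typical setup, not part of this commit):
```
docker build -t benchmark:latest .
# --gpus all lets ZeusMonitor read GPU energy counters inside the container;
# --network host lets the script reach a server listening on localhost.
docker run --rm --gpus all --network host benchmark:latest \
  --backend vllm --model meta-llama/Llama-2-7b-chat-hf \
  --dataset ShareGPT_V3_filtered_500.json --out-name vllm-llama2-7b
```
Since the Dockerfile copies the whole build context, the dataset file would need to be present in this directory at build time, and the result JSON files are written inside the container unless a volume is mounted.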
scripts/inference-server/README.md
ADDED
@@ -0,0 +1,5 @@
# About

This directory contains a script for running benchmarks (including energy consumption) against models hosted on a dedicated inference server. The script is taken and modified from [vllm](https://github.com/vllm-project/vllm/blob/93b38bea5dd03e1b140ca997dfaadef86f8f1855/benchmarks/benchmark_serving.py).

The script currently supports TGI and vLLM. Before running the benchmark script, the inference server hosting the relevant model must already be running.
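As a hedged end-to-end sketch (the model name, port, dataset path, and request rate below are illustrative only, not prescribed by this repo): start the server first, then point `benchmark.py` at it.
```
# 1. Start the inference server (vLLM's OpenAI-compatible server shown here)
python -m vllm.entrypoints.openai.api_server \
  --model meta-llama/Llama-2-7b-chat-hf --port 8000

# 2. In another shell, run the benchmark against it
python benchmark.py \
  --backend vllm \
  --model meta-llama/Llama-2-7b-chat-hf \
  --host localhost --port 8000 \
  --dataset ../../sharegpt/ShareGPT_V3_filtered_500.json \
  --request-rate 2.0 --num-runs 3 \
  --out-name vllm-llama2-7b
```
All `benchmark.py` flags shown here come from its argument parser; see the script below for their defaults.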
scripts/inference-server/benchmark.py
ADDED
@@ -0,0 +1,323 @@
"""Taken and modified from vllm: https://github.com/vllm-project/vllm/blob/93b38bea5dd03e1b140ca997dfaadef86f8f1855/benchmarks/benchmark_serving.py
"""

import argparse
import asyncio
import json
import random
import time
import torch
from typing import AsyncGenerator, List, Tuple

import aiohttp
import numpy as np
from dataclasses import asdict, dataclass, field
from tqdm.asyncio import tqdm
from zeus.monitor import ZeusMonitor


SYSTEM_PROMPT = "A chat between a human user (prompter) and an artificial intelligence (AI) assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. "


@dataclass
class Results:
    model: str
    backend: str
    request_rate: float
    num_failures: int = 0
    system_prompt: str = SYSTEM_PROMPT
    total_time: float = 0.0
    throughput: float = 0.0
    total_prompt_tokens: int = 0
    total_completion_tokens: int = 0
    avg_latency: float = 0.0
    avg_latency_per_token: float = 0.0
    avg_latency_per_output_token: float = 0.0
    server_total_energy: float = 0.0
    server_energy_per_request: float = 0.0
    server_energy_per_output_token: float = 0.0
    local_zeus_total_energy: float = 0.0
    local_zeus_energy_per_request: float = 0.0
    local_zeus_energy_per_output_token: float = 0.0
    results: list["Result"] = field(default_factory=list)


@dataclass
class Result:
    success: bool = True
    latency: float = 0.0
    prompt: str = ""
    response: str = ""
    num_prompt_tokens: int = 0
    num_completion_tokens: int = 0
    energy: float = 0.0


def get_requests(
    dataset_path: str,
) -> List[str]:
    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)
    # Only keep the first turn of each conversation.
    dataset = [data["conversations"][0]["value"] for data in dataset]

    return dataset


async def get_request(
    input_requests: List[str],
    request_rate: float,
) -> AsyncGenerator[Tuple[str, int, int], None]:
    input_requests = iter(input_requests)
    for i, request in enumerate(input_requests):
        yield i, request

        if request_rate == float("inf"):
            # If the request rate is infinity, then we don't need to wait.
            continue
        # Sample the request interval from the exponential distribution.
        interval = np.random.exponential(1.0 / request_rate)
        # The next request will be sent after the interval.
        await asyncio.sleep(interval)


async def send_request(
    result: Result,
    backend: str,
    model: str,
    api_url: str,
    prompt: str,
    pbar: tqdm,
) -> None:
    request_start_time = time.perf_counter()

    headers = {"Content-Type": "application/json"}
    # OpenAI Chat Completions API request format
    pload = {
        "model": model,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "stream": False,
        "max_tokens": 1000,
    }

    timeout = aiohttp.ClientTimeout(total=3 * 3600)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.post(api_url, headers=headers, json=pload) as response:
            # Request failed
            if response.status // 100 != 2:
                print("request failed")
                print(f"response.status {response.status}")
                result.prompt = prompt
                result.success = False
                return
            chunks = []
            async for chunk, _ in response.content.iter_chunks():
                chunks.append(chunk)
    request_end_time = time.perf_counter()
    output = b"".join(chunks).decode("utf-8")
    output = json.loads(output)

    result.latency = request_end_time - request_start_time
    result.prompt = prompt
    result.response = output["choices"][0]["message"]["content"]
    result.num_prompt_tokens = output["usage"]["prompt_tokens"]
    result.num_completion_tokens = output["usage"]["completion_tokens"]
    result.energy = output["usage"]["energy"]

    pbar.update(1)


async def benchmark(
    results: Results,
    backend: str,
    model: str,
    api_url: str,
    input_requests: List[str],
    request_rate: float,
) -> None:
    tasks: List[asyncio.Task] = []
    pbar = tqdm(total=len(input_requests))
    async for i, request in get_request(input_requests, request_rate):
        prompt = request
        task = asyncio.create_task(
            # Ensures results has same ordering as the input dataset
            send_request(
                results.results[i],
                backend,
                model,
                api_url,
                prompt,
                pbar,
            )
        )
        tasks.append(task)
    await asyncio.gather(*tasks)
    pbar.close()


def run_benchmark(
    args: argparse.Namespace, api_url: str, input_requests: List[str], out_filename: str
):
    results = Results(
        model=args.model,
        backend=args.backend,
        request_rate=args.request_rate,
        results=[Result() for _ in input_requests],
    )

    zeus_monitor = ZeusMonitor()
    zeus_monitor.begin_window(out_filename)
    benchmark_start_time = time.perf_counter()
    asyncio.run(
        benchmark(
            results,
            args.backend,
            args.model,
            api_url,
            input_requests,
            args.request_rate,
        )
    )
    benchmark_end_time = time.perf_counter()
    measurements = zeus_monitor.end_window(out_filename)
    zeus_total_energy = measurements.total_energy

    # Store aggregated results
    total_prompt_tokens = 0
    total_completion_tokens = 0
    total_latency = 0
    total_latency_per_token = 0
    total_latency_per_output_token = 0
    server_total_energy = 0
    for result in results.results:
        if not result.success:
            results.num_failures += 1
            continue
        total_prompt_tokens += result.num_prompt_tokens
        total_completion_tokens += result.num_completion_tokens
        total_latency += result.latency
        total_latency_per_token += result.latency / (
            result.num_prompt_tokens + result.num_completion_tokens
        )
        total_latency_per_output_token += result.latency / result.num_completion_tokens
        server_total_energy += result.energy

    num_results = len(results.results) - results.num_failures
    if num_results == 0:
        print(f"{out_filename} not generated. All requests in this run failed.")
        return

    results.total_time = benchmark_end_time - benchmark_start_time
    results.throughput = num_results / results.total_time
    results.total_prompt_tokens = total_prompt_tokens
    results.total_completion_tokens = total_completion_tokens
    results.avg_latency = total_latency / num_results
    results.avg_latency_per_token = total_latency_per_token / num_results
    results.avg_latency_per_output_token = total_latency_per_output_token / num_results
    results.server_total_energy = server_total_energy
    results.server_energy_per_request = results.server_total_energy / num_results
    results.server_energy_per_output_token = (
        results.server_total_energy / results.total_completion_tokens
    )
    results.local_zeus_total_energy = zeus_total_energy
    results.local_zeus_energy_per_request = zeus_total_energy / num_results
    results.local_zeus_energy_per_output_token = (
        zeus_total_energy / results.total_completion_tokens
    )

    with open(out_filename, "w") as f:
        f.write(json.dumps(asdict(results), indent=2))

    if args.verbose:
        print("Benchmark results:")
        print(f"Model: {results.model}")
        print(f"Backend: {results.backend}")
        print(f"Request rate: {results.request_rate} requests/s")
        print()
        print(f"Total time: {results.total_time:.2f} s")
        print(f"Throughput: {results.throughput:.2f} requests/s")
        print(f"Average latency: {results.avg_latency:.2f} s")
        print(f"Average latency per token: {results.avg_latency_per_token:.2f} s")
        print(f"Average latency per output token: {results.avg_latency_per_output_token:.2f} s")
        print(f"(Zeus) Total energy: {results.local_zeus_total_energy:.2f} J")
        print(f"(Zeus) Energy per request: {results.local_zeus_energy_per_request:.2f} J")
        print(f"(Zeus) Energy per token: {results.local_zeus_energy_per_output_token:.2f} J")
        print(f"(Server) Total energy: {results.server_total_energy:.2f} J")
        print(f"(Server) Energy per request: {results.server_energy_per_request:.2f} J")
        print(f"(Server) Energy per token: {results.server_energy_per_output_token:.2f} J")

    print("Benchmark results written to", out_filename)


def main(args: argparse.Namespace):
    if args.backend not in ["tgi", "vllm"]:
        raise ValueError(f"Unknown backend: {args.backend}")

    arg_out_filename = f"{args.out_name}-args.json"
    with open(arg_out_filename, "w") as f:
        f.write(json.dumps(vars(args), indent=2))
    if args.verbose:
        print(args)
        print("Benchmark args written to", arg_out_filename)

    random.seed(args.seed)
    np.random.seed(args.seed)

    out_name = args.out_name
    api_url = f"{args.protocol}://{args.host}:{args.port}{args.endpoint}"
    input_requests = get_requests(args.dataset)

    # Note: output filenames are 1-indexed
    for i in range(1, args.num_runs + 1):
        run_benchmark(args, api_url, input_requests, out_name + f"-run{i}.json")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Benchmark the online serving throughput."
    )
    parser.add_argument("--backend", type=str, default="vllm", choices=["vllm", "tgi"])
    parser.add_argument(
        "--protocol", type=str, default="http", choices=["http", "https"]
    )
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--endpoint", type=str, default="/v1/chat/completions")
    parser.add_argument("--model", type=str, default=None)
    parser.add_argument(
        "--dataset", type=str, required=True, help="Path to the dataset."
    )
    parser.add_argument(
        "--num-runs",
        type=int,
        default=3,
        help="Runs the benchmark num-runs times, writing results to num-runs separate files.",
    )
    parser.add_argument(
        "--request-rate",
        type=float,
        default=float("inf"),
        help="Number of requests per second. If this is inf, "
        "then all the requests are sent at time 0. "
        "Otherwise, we use a Poisson process to synthesize "
        "the request arrival times.",
    )
    parser.add_argument(
        "--out-name",
        type=str,
        default="benchmark_result",
        help="Name of file to write benchmark results. Note: '-run{i}.json' will be appended for actual outputted files.",
    )
    parser.add_argument(
        "--verbose",
        type=bool,
        default=True,
        help="Set to true to print out benchmark results. Otherwise, only write to file.",
    )
    parser.add_argument("--seed", type=int, default=0)
    args = parser.parse_args()
    main(args)
scripts/inference-server/local-tokenizers/README.md
ADDED
@@ -0,0 +1,79 @@
# TGI
The local tokenizer config can be supplied to TGI through the flag `--tokenizer-config-path`, documented [here](https://huggingface.co/docs/text-generation-inference/basic_tutorials/launcher#tokenizerconfigpath).

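For illustration, a TGI launch might look like the following; the image tag, port mapping, volume mount, and model are assumed placeholders, and only `--tokenizer-config-path` is the flag discussed here:
```
docker run --gpus all --shm-size 1g -p 8080:80 \
  -v $PWD/local-tokenizers:/local-tokenizers \
  ghcr.io/huggingface/text-generation-inference:1.4 \
  --model-id meta-llama/Llama-2-7b-chat-hf \
  --tokenizer-config-path /local-tokenizers/meta-llama/Llama-2-7b-chat-hf/tokenizer_config.json
```
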
# vLLM
A local chat template can be supplied to vLLM through the flag `--chat-template`. It is not explicitly documented, but it is mentioned in GitHub issues on the topic.

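Similarly, a sketch of passing the bundled Mistral template to vLLM's OpenAI-compatible server (model and port are placeholders):
```
python -m vllm.entrypoints.openai.api_server \
  --model mistralai/Mistral-7B-Instruct-v0.2 \
  --chat-template local-tokenizers/mistralai/Mistral-7B-Instruct-v0.2/chat_template.jinja \
  --port 8000
```
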
# Llama-2 models on TGI
There is a [known bug with TGI](https://github.com/huggingface/text-generation-inference/issues/1534) in which the default `tokenizer_config.json` is not handled properly by TGI when applying chat templating. Until this is resolved, we are using a modified `tokenizer_config.json` that is compatible with TGI. Note that the chat template Jinja itself is the same, except for the removal of two calls to `.strip()`, which TGI reports errors on.

For reference, here is the original unmodified chat template:
```
{% if messages[0]['role'] == 'system' %}
{% set loop_messages = messages[1:] %}
{% set system_message = messages[0]['content'] %}
{% else %}
{% set loop_messages = messages %}
{% set system_message = false %}
{% endif %}
{% for message in loop_messages %}
{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
{% endif %}
{% if loop.index0 == 0 and system_message != false %}
{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}
{% else %}
{% set content = message['content'] %}
{% endif %}
{% if message['role'] == 'user' %}
{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}
{% elif message['role'] == 'assistant' %}
{{ ' ' + content.strip() + ' ' + eos_token }}
{% endif %}
{% endfor %}
```

We also note that the `eos_token` and `bos_token` are originally provided as maps, but the TGI implementation only accepts a string, so we also modify them to contain only the `content` string.

For reference, here is the original unmodified `tokenizer_config.json`:
```
{
  "add_bos_token": true,
  "add_eos_token": false,
  "bos_token": {
    "__type": "AddedToken",
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content + ' ' + eos_token }}{% endif %}{% endfor %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": {
    "__type": "AddedToken",
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "legacy": false,
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": null,
  "padding_side": "right",
  "sp_model_kwargs": {},
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": {
    "__type": "AddedToken",
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
```

# Mistral with chat templating
Mistral's chat models have not been explicitly trained with a distinct system prompt. Therefore, the default Mistral `tokenizer_config.json` explicitly assumes that the system role does not exist. To keep our benchmarks consistent across models, we re-engineered the original Mistral chat template to account for a system prompt: we simply prepend the system prompt to the first user prompt in a given conversation.
scripts/inference-server/local-tokenizers/meta-llama/Llama-2-70b-chat-hf/tokenizer_config.json
ADDED
@@ -0,0 +1,36 @@
{
  "add_bos_token": true,
  "add_eos_token": false,
  "bos_token": {
    "__type": "AddedToken",
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content + ' ' + eos_token }}{% endif %}{% endfor %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": {
    "__type": "AddedToken",
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "legacy": false,
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": null,
  "padding_side": "right",
  "sp_model_kwargs": {},
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": {
    "__type": "AddedToken",
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
scripts/inference-server/local-tokenizers/meta-llama/Llama-2-7b-chat-hf/tokenizer_config.json
ADDED
@@ -0,0 +1,22 @@
{
  "add_bos_token": true,
  "add_eos_token": false,
  "bos_token": "</s>",
  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content + ' ' + eos_token }}{% endif %}{% endfor %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "legacy": false,
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": null,
  "padding_side": "right",
  "sp_model_kwargs": {},
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": {
    "__type": "AddedToken",
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
scripts/inference-server/local-tokenizers/mistralai/Mistral-7B-Instruct-v0.2/chat_template.jinja
ADDED
@@ -0,0 +1 @@
{% if (messages[0]['role'] != 'system') %}{{ raise_exception('First role should be system!') }}{% elif (messages[1]['role'] != 'user') %}{{ raise_exception('Second role should be user!') }}{% endif %}{{ bos_token }}{{ '[INST] ' + messages[0]['content'] + ' ' + messages[1]['content'] + ' [/INST]' }}{% for message in messages[2:] %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 1) %}{{ raise_exception('Conversation roles must alternate system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only system, user and assistant roles are supported!') }}{% endif %}{% endfor %}
scripts/inference-server/local-tokenizers/mistralai/Mistral-7B-Instruct-v0.2/tokenizer_config.json
ADDED
@@ -0,0 +1,5 @@
{
  "bos_token": "<s>",
  "chat_template": "{% if (messages[0]['role'] != 'system') %}{{ raise_exception('First role should be system!') }}{% elif (messages[1]['role'] != 'user') %}{{ raise_exception('Second role should be user!') }}{% endif %}{{ bos_token }}{{ '[INST] ' + messages[0]['content'] + ' ' + messages[1]['content'] + ' [/INST]' }}{% for message in messages[2:] %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 1) %}{{ raise_exception('Conversation roles must alternate system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only system, user and assistant roles are supported!') }}{% endif %}{% endfor %}",
  "eos_token": "</s>"
}
scripts/inference-server/requirements.txt
ADDED
@@ -0,0 +1,7 @@
argparse
asyncio
aiohttp
numpy
torch==2.0.1 --index-url https://download.pytorch.org/whl/cu118
tqdm
zeus-ml
sharegpt/README.md
CHANGED
@@ -1,33 +1,58 @@
# How we used ShareGPT to create our benchmark dataset

-##
+## sg_90k_part1_html_cleaned.json
+
+### Download ShareGPT dataset
```
https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json
```

-
+### Install Fastchat
```
pip install fschat
```

-
+### Clean data:
```
pip install polyglot pyicu pycld2
python -m fastchat.data.optional_clean --in sg_90k_part1_html_cleaned.json --out sg_90k_part1_html_cleaned_lang.json --keep-lang en
```

-
+### Extract first prompt
```
python extract_first.py --in-file sg_90k_part1_html_cleaned_lang.json --out-file sg_90k_part1_html_cleaned_lang_first.json
```

-
+### Sample data
```
python -m fastchat.data.sample --in sg_90k_part1_html_cleaned_lang_first.json --out sg_90k_part1_html_cleaned_lang_first_sampled.json --end 10000 --max-length 10000
```

-
+### Sorted data
We sort the requests by sequence length, placing the longest sequences first. This approach minimizes the amount of padding required and allows for early detection of out-of-memory.
```
python sort.py --data-dir sg_90k_part1_html_cleaned_lang_first_sampled.json --out-file sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json
```
+
+## ShareGPT_V3_filtered.json
+
+### Download ShareGPT dataset
+```
+https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+```
+
+### Install Transformers
+```
+pip install transformers
+```
+
+### Filter conversations with too long prompts/responses, extract first turn, and randomly sample 500 prompts
+```
+python filter_dataset.py
+```
+
+### Compare the response length distribution of sampled dataset with respect to initial dataset
+```
+pip install matplotlib numpy
+python compare_distributions.py
+```
sharegpt/ShareGPT_V3_filtered_500.json
ADDED
The diff for this file is too large to render.
sharegpt/compare_distributions.py
ADDED
@@ -0,0 +1,62 @@
import json
import matplotlib.pyplot as plt
import numpy as np
from transformers import (
    AutoTokenizer,
    PreTrainedTokenizer,
    PreTrainedTokenizerBase,
    PreTrainedTokenizerFast,
)

# Open datasets
file_paths = ["ShareGPT_V3_filtered.json", "ShareGPT_V3_filtered_500.json"]

names = [file_path[:-5] for file_path in file_paths]

data_lists = []
for file_path in file_paths:
    with open(file_path, "r", encoding="utf-8") as file:
        data_list = json.load(file)
    data_lists.append(data_list)

for name, data_list in zip(names, data_lists):
    print(f"{name}: {len(data_list)}")

# Get prompt lengths using tokenizer
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
all_prompts = [
    [data["conversations"][0]["value"] for data in data_list]
    for data_list in data_lists
]
all_token_ids_per_prompts = [tokenizer(prompts).input_ids for prompts in all_prompts]
all_prompt_lens = [
    [len(token_ids) for token_ids in token_ids_per_prompt]
    for token_ids_per_prompt in all_token_ids_per_prompts
]

# Plotting the histograms
for name, prompt_lens in zip(names, all_prompt_lens):
    plt.hist(
        prompt_lens,
        bins=range(min(prompt_lens), max(prompt_lens) + 1),
        edgecolor="black",
    )
    plt.xlabel("Prompt Length (number of tokens)")
    plt.ylabel("Frequency")
    plt.title(f"Histogram of {name}")
    plt.savefig(f"{name}_distribution.png")
    plt.close()

# Plotting the CDF
for name, prompt_lens in zip(names, all_prompt_lens):
    values, counts = np.unique(prompt_lens, return_counts=True)
    relative_frequencies = counts / len(prompt_lens)
    sorted_data = np.sort(values)
    cumulative_frequencies = np.cumsum(relative_frequencies)
    plt.step(sorted_data, cumulative_frequencies, where="post", label=name)

plt.title("Cumulative Distribution Function (CDF) Overlaid")
plt.xlabel("Prompt Length (number of tokens)")
plt.ylabel("Cumulative Probability")
plt.savefig(f"{name}_cdf.png")
plt.close()
sharegpt/filter_dataset.py
ADDED
@@ -0,0 +1,107 @@
"""Taken and modified from vllm: https://github.com/vllm-project/vllm/blob/93b38bea5dd03e1b140ca997dfaadef86f8f1855/benchmarks/benchmark_serving.py
Filter dataset to:
1. Remove entries that have too long prompts or completions
2. Only keep first human prompt for each conversation
"""

import json
import random
from typing import AsyncGenerator, List, Tuple

from transformers import (
    AutoTokenizer,
    PreTrainedTokenizer,
    PreTrainedTokenizerBase,
    PreTrainedTokenizerFast,
)


def filter_dataset_to_size(
    dataset_path: str,
    size: int,
) -> List[Tuple[str, int, int]]:
    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)

    # randomly sample dataset
    return random.sample(dataset, size)


def filter_dataset(
    dataset_path: str,
    tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, int, int]]:
    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Only keep the first two turns of each conversation.
    dataset = [
        (
            data["id"],
            data["conversations"][0]["value"],
            data["conversations"][1]["value"],
        )
        for data in dataset
    ]

    # Tokenize the prompts and completions.
    conversation_ids = [conv_id for conv_id, _, _ in dataset]
    prompts = [prompt for _, prompt, _ in dataset]
    prompt_token_ids = tokenizer(prompts).input_ids
    completions = [completion for _, _, completion in dataset]
    completion_token_ids = tokenizer(completions).input_ids
    tokenized_dataset = []
    for i in range(len(dataset)):
        output_len = len(completion_token_ids[i])
        tokenized_dataset.append(
            (conversation_ids[i], prompts[i], prompt_token_ids[i], output_len)
        )

    # Filter out too long sequences.
    filtered_dataset_json = []
    for conv_id, prompt, prompt_token_ids, output_len in tokenized_dataset:
        prompt_len = len(prompt_token_ids)
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            # This is because TGI causes errors when the input or output length
            # is too short.
            continue
        # making even shorter than 1024 to account for additional tokens introduced by chat completion wrapper
        if prompt_len > 800 or output_len > 800:
            # if prompt_len > 1024 or output_len > 1024:
            # Prune too long sequences.
            continue
        filtered_dataset_json.append(
            {
                "id": conv_id,
                "conversations": [
                    {
                        "from": "human",
                        "value": prompt,
                    }
                ],
            }
        )

    return filtered_dataset_json


def main():
    tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
    # download: https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
    filtered_dataset = filter_dataset(
        "ShareGPT_V3_unfiltered_cleaned_split.json", tokenizer
    )
    with open("ShareGPT_V3_filtered.json", "w") as f:
        json.dump(filtered_dataset, f)

    sampled_dataset = filter_dataset_to_size("ShareGPT_V3_filtered.json", 500)
    with open("ShareGPT_V3_filtered_500.json", "w") as f:
        json.dump(sampled_dataset, f)


if __name__ == "__main__":
    main()