# Modified from:
# vLLM: https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
import argparse
from typing import List, Optional, Union

import torch
from tqdm import tqdm
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from vllm.engine.arg_utils import EngineArgs
# from vllm.engine.llm_engine import LLMEngine
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.sequence import MultiModalData
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter

from serve.llm_engine import LLMEngine

class LLM:
    """An LLM for generating texts from given prompts and sampling parameters.

    This class includes a tokenizer, a language model (possibly distributed
    across multiple GPUs), and GPU memory space allocated for intermediate
    states (aka KV cache). Given a batch of prompts and sampling parameters,
    this class generates texts from the model, using an intelligent batching
    mechanism and efficient memory management.

    NOTE: This class is intended to be used for offline inference. For online
    serving, use the `AsyncLLMEngine` class instead.

    NOTE: For the comprehensive list of arguments, see `EngineArgs`.

    Args:
        model: The name or path of a HuggingFace Transformers model.
        tokenizer: The name or path of a HuggingFace Transformers tokenizer.
        tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
            if available, and "slow" will always use the slow tokenizer.
        skip_tokenizer_init: If True, skip initialization of tokenizer and
            detokenizer. Expects valid prompt_token_ids and None for prompt
            from the input.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        tensor_parallel_size: The number of GPUs to use for distributed
            execution with tensor parallelism.
        dtype: The data type for the model weights and activations. Currently,
            we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
            the `torch_dtype` attribute specified in the model config file.
            However, if the `torch_dtype` in the config is `float32`, we will
            use `float16` instead.
        quantization: The method used to quantize the model weights. Currently,
            we support "awq", "gptq", "squeezellm", and "fp8" (experimental).
            If None, we first check the `quantization_config` attribute in the
            model config file. If that is None, we assume the model weights are
            not quantized and use `dtype` to determine the data type of
            the weights.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id.
        seed: The seed to initialize the random number generator for sampling.
        gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
            reserve for the model weights, activations, and KV cache. Higher
            values will increase the KV cache size and thus improve the model's
            throughput. However, if the value is too high, it may cause out-of-
            memory (OOM) errors.
        swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
            This can be used for temporarily storing the states of the requests
            when their `best_of` sampling parameters are larger than 1. If all
            requests will have `best_of=1`, you can safely set this to 0.
            Otherwise, too small values may cause out-of-memory (OOM) errors.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
        max_context_len_to_capture: Maximum context length covered by CUDA
            graphs. When a sequence has a context length larger than this, we
            fall back to eager mode.
        disable_custom_all_reduce: See ParallelConfig.
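
    Example:
        Illustrative sketch only: the model name is a placeholder, and
        `cli_args` stands for whatever parsed command-line options the
        custom `serve.llm_engine.LLMEngine` expects through the `args`
        parameter.

        >>> llm = LLM(args=cli_args,
        ...           model="facebook/opt-125m",
        ...           dtype="float16",
        ...           gpu_memory_utilization=0.9)
        >>> outputs = llm.generate(["Hello, my name is"])
        >>> print(outputs[0].outputs[0].text)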
""" | |

    def __init__(
        self,
        args: argparse.ArgumentParser,
        model: str,
        tokenizer: Optional[str] = None,
        tokenizer_mode: str = "auto",
        skip_tokenizer_init: bool = False,
        trust_remote_code: bool = False,
        tensor_parallel_size: int = 1,
        dtype: str = "auto",
        quantization: Optional[str] = None,
        revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        seed: int = 0,
        gpu_memory_utilization: float = 0.9,
        swap_space: int = 4,
        enforce_eager: bool = False,
        max_context_len_to_capture: int = 8192,
        disable_custom_all_reduce: bool = False,
        **kwargs,
    ) -> None:
        if "disable_log_stats" not in kwargs:
            kwargs["disable_log_stats"] = True
        engine_args = EngineArgs(
            model=model,
            tokenizer=tokenizer,
            tokenizer_mode=tokenizer_mode,
            skip_tokenizer_init=skip_tokenizer_init,
            trust_remote_code=trust_remote_code,
            tensor_parallel_size=tensor_parallel_size,
            dtype=dtype,
            quantization=quantization,
            revision=revision,
            tokenizer_revision=tokenizer_revision,
            seed=seed,
            gpu_memory_utilization=gpu_memory_utilization,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            max_context_len_to_capture=max_context_len_to_capture,
            disable_custom_all_reduce=disable_custom_all_reduce,
            **kwargs,
        )
        self.llm_engine = LLMEngine.from_engine_args(
            engine_args, usage_context=UsageContext.LLM_CLASS, args=args)
        self.request_counter = Counter()

    def get_tokenizer(
            self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
        return self.llm_engine.tokenizer.tokenizer

    def set_tokenizer(
        self,
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    ) -> None:
        self.llm_engine.tokenizer.tokenizer = tokenizer

    def generate(
        self,
        prompts: Optional[Union[str, List[str]]] = None,
        sampling_params: Optional[Union[SamplingParams,
                                        List[SamplingParams]]] = None,
        prompt_token_ids: Optional[List[List[int]]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[RequestOutput]:
        """Generates the completions for the input prompts.

        NOTE: This method automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your
        prompts into a single list and pass it to this method.

        Args:
            prompts: A list of prompts to generate completions for.
            sampling_params: The sampling parameters for text generation. If
                None, we use the default sampling parameters.
                When it is a single value, it is applied to every prompt.
                When it is a list, the list must have the same length as the
                prompts and it is paired one by one with the prompt.
            prompt_token_ids: A list of token IDs for the prompts. If None, we
                use the tokenizer to convert the prompts to token IDs.
            use_tqdm: Whether to use tqdm to display the progress bar.
            lora_request: LoRA request to use for generation, if any.
            multi_modal_data: Multi-modal data.

        Returns:
            A list of `RequestOutput` objects containing the generated
            completions in the same order as the input prompts.
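
        Example:
            Illustrative sketch of the per-prompt pairing described above,
            assuming `llm` is an instance of this class; the prompts and
            parameter values are placeholders:

            >>> prompts = ["Hello, my name is", "The capital of France is"]
            >>> per_prompt_params = [
            ...     SamplingParams(temperature=0.0, max_tokens=16),
            ...     SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32),
            ... ]
            >>> outputs = llm.generate(prompts, per_prompt_params)
            >>> [o.outputs[0].text for o in outputs]  # one result per prompt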
""" | |
        if prompts is None and prompt_token_ids is None:
            raise ValueError("Either prompts or prompt_token_ids must be "
                             "provided.")
        if self.llm_engine.model_config.skip_tokenizer_init \
           and prompts is not None:
            raise ValueError("prompts must be None if skip_tokenizer_init "
                             "is True")
        if isinstance(prompts, str):
            # Convert a single prompt to a list.
            prompts = [prompts]
        if (prompts is not None and prompt_token_ids is not None
                and len(prompts) != len(prompt_token_ids)):
            raise ValueError("The lengths of prompts and prompt_token_ids "
                             "must be the same.")

        if prompts is not None:
            num_requests = len(prompts)
        else:
            assert prompt_token_ids is not None
            num_requests = len(prompt_token_ids)

        if sampling_params is None:
            # Use default sampling params.
            sampling_params = SamplingParams()
        elif isinstance(sampling_params,
                        list) and len(sampling_params) != num_requests:
            raise ValueError("The lengths of prompts and sampling_params "
                             "must be the same.")
        if multi_modal_data:
            multi_modal_data.data = multi_modal_data.data.to(torch.float16)

        # Add requests to the engine.
        for i in range(num_requests):
            prompt = prompts[i] if prompts is not None else None
            token_ids = None if prompt_token_ids is None else prompt_token_ids[i]
            self._add_request(
                prompt,
                sampling_params[i]
                if isinstance(sampling_params, list) else sampling_params,
                token_ids,
                lora_request=lora_request,
                # Get the i-th image while maintaining the batch dim.
                multi_modal_data=MultiModalData(
                    type=multi_modal_data.type,
                    data=multi_modal_data.data[i].unsqueeze(0))
                if multi_modal_data else None,
            )
        return self._run_engine(use_tqdm)

    def _add_request(
        self,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]],
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> None:
        request_id = str(next(self.request_counter))
        self.llm_engine.add_request(request_id,
                                    prompt,
                                    sampling_params,
                                    prompt_token_ids,
                                    lora_request=lora_request,
                                    multi_modal_data=multi_modal_data)

    def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
        # Initialize tqdm.
        if use_tqdm:
            num_requests = self.llm_engine.get_num_unfinished_requests()
            pbar = tqdm(
                total=num_requests,
                desc="Processed prompts",
                dynamic_ncols=True,
                postfix=f"Generation Speed: {0:.2f} toks/s",
            )
        # Run the engine.
        outputs: List[RequestOutput] = []
        total_toks = 0
        while self.llm_engine.has_unfinished_requests():
            step_outputs = self.llm_engine.step()
            for output in step_outputs:
                if output.finished:
                    outputs.append(output)
                    if use_tqdm:
                        total_toks += sum(
                            len(stp.token_ids) for stp in output.outputs)
                        spd = total_toks / pbar.format_dict["elapsed"]
                        pbar.postfix = f"Generation Speed: {spd:.2f} toks/s"
                        pbar.update(1)
        if use_tqdm:
            pbar.close()
        # Sort the outputs by request ID.
        # This is necessary because some requests may finish earlier than
        # requests that were submitted before them.
        outputs = sorted(outputs, key=lambda x: int(x.request_id))
        return outputs