"""
A model worker with transformers libs executes the model.
Run BF16 inference with:
python model_server.py --host localhost --model-path THUDM/glm-4-voice-9b --port 10000 --dtype bfloat16 --device cuda:0
Run Int4 inference with:
python model_server.py --host localhost --model-path THUDM/glm-4-voice-9b --port 10000 --dtype int4 --device cuda:0
"""
import argparse
import json
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
from transformers.generation.streamers import BaseStreamer
import torch
import uvicorn
from threading import Thread
from queue import Queue
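

# A minimal, illustrative client for the streaming endpoint; it is not used by
# the server itself. It assumes the optional third-party `requests` package and
# a server started as in the docstring above. Each streamed line is a JSON
# object carrying a single "token_id".
def _example_client(prompt: str, url: str = "http://localhost:10000/generate_stream"):
    import requests  # assumed dependency, needed only for this sketch

    payload = {"prompt": prompt, "temperature": 0.8, "top_p": 0.9, "max_new_tokens": 64}
    with requests.post(url, json=payload, stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if line:
                print(json.loads(line))
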
class TokenStreamer(BaseStreamer):
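    """Bridge ``model.generate`` to an iterator over generated token ids.

    ``generate`` runs in a background thread and calls ``put``/``end``;
    consumers iterate over this object to receive ids as they arrive.
    """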
def __init__(self, skip_prompt: bool = False, timeout=None):
self.skip_prompt = skip_prompt
# variables used in the streaming process
self.token_queue = Queue()
self.stop_signal = None
self.next_tokens_are_prompt = True
self.timeout = timeout

    def put(self, value):
if len(value.shape) > 1 and value.shape[0] > 1:
raise ValueError("TextStreamer only supports batch size 1")
elif len(value.shape) > 1:
value = value[0]
if self.skip_prompt and self.next_tokens_are_prompt:
self.next_tokens_are_prompt = False
return
for token in value.tolist():
self.token_queue.put(token)

    def end(self):
self.token_queue.put(self.stop_signal)

    def __iter__(self):
return self

    def __next__(self):
value = self.token_queue.get(timeout=self.timeout)
if value == self.stop_signal:
raise StopIteration()
else:
return value


class ModelWorker:
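    """Load the GLM model and tokenizer once and stream generations from them.

    ``dtype="int4"`` loads the weights through bitsandbytes NF4 quantization;
    ``dtype="bfloat16"`` loads them in bfloat16.
    """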
    def __init__(self, model_path, dtype="bfloat16", device="cuda:0"):
self.device = device
self.bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
) if dtype == "int4" else None
        self.glm_model = AutoModel.from_pretrained(
            model_path,
            trust_remote_code=True,
            # int4 goes through bitsandbytes; otherwise load the weights in bf16
            quantization_config=self.bnb_config,
            torch_dtype=torch.bfloat16 if dtype == "bfloat16" else None,
            # place the whole model on the requested device
            device_map={"": device},
        ).eval()
self.glm_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    @torch.inference_mode()
def generate_stream(self, params):
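        """Yield newline-delimited JSON bytes, one generated token id per line.

        ``params`` must supply "prompt"; "temperature", "top_p", and
        "max_new_tokens" are optional sampling controls.
        """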
tokenizer, model = self.glm_tokenizer, self.glm_model
prompt = params["prompt"]
temperature = float(params.get("temperature", 1.0))
top_p = float(params.get("top_p", 1.0))
max_new_tokens = int(params.get("max_new_tokens", 256))
inputs = tokenizer([prompt], return_tensors="pt")
inputs = inputs.to(self.device)
streamer = TokenStreamer(skip_prompt=True)
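        # Run generate() in a background thread; its put()/end() calls feed the
        # streamer's queue, and the loop below yields ids as they are produced.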
thread = Thread(
target=model.generate,
kwargs=dict(
**inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
streamer=streamer
)
)
thread.start()
for token_id in streamer:
yield (json.dumps({"token_id": token_id, "error_code": 0}) + "\n").encode()

    def generate_stream_gate(self, params):
try:
for x in self.generate_stream(params):
yield x
        except Exception as e:
            print("Caught unknown error:", e)
ret = {
"text": "Server Error",
"error_code": 1,
}
yield (json.dumps(ret) + "\n").encode()


app = FastAPI()
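
# POST /generate_stream expects a JSON body with a required "prompt" plus the
# optional sampling parameters read in ModelWorker.generate_stream:
# "temperature", "top_p", and "max_new_tokens". The response streams
# newline-delimited JSON objects, one per generated token id.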
@app.post("/generate_stream")
async def generate_stream(request: Request):
params = await request.json()
generator = worker.generate_stream_gate(params)
return StreamingResponse(generator)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--dtype", type=str, default="bfloat16")
parser.add_argument("--device", type=str, default="cuda:0")
parser.add_argument("--port", type=int, default=10000)
parser.add_argument("--model-path", type=str, default="THUDM/glm-4-voice-9b")
args = parser.parse_args()
worker = ModelWorker(args.model_path, args.dtype, args.device)
uvicorn.run(app, host=args.host, port=args.port, log_level="info")