from typing import Any, Dict, List

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


class EndpointHandler:
    def __init__(self, path: str = ""):
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            device_map="auto"
        )
        self.model.eval()

        # Default generation parameters. Note: do_sample=True requires a
        # strictly positive temperature, so 0.7 is used here instead of 0.0.
        self.default_params = {
            "max_new_tokens": 100,
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 50,
            "repetition_penalty": 1.1,
            "do_sample": True,
        }

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
        """Handle chat completion requests.

        Args:
            data: Dictionary containing:
                - messages: List of message dictionaries with 'role' and 'content'
                - generation_params: Optional dictionary of generation parameters

        Returns:
            List containing the generated response message
        """
        messages = data.get("messages", [])
        if not messages:
            return [{"role": "assistant", "content": "No input messages provided"}]

        # Merge caller-supplied generation parameters over the defaults
        gen_params = {**self.default_params, **data.get("generation_params", {})}

        # Apply the chat template to build the prompt string
        prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Tokenize the prompt. add_special_tokens=False avoids inserting a
        # second BOS token, since the chat template already includes any
        # required special tokens.
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            add_special_tokens=False
        ).to(self.model.device)

        # Generate the response
        with torch.no_grad():
            output_tokens = self.model.generate(
                **inputs,
                **gen_params
            )

        # Decode only the newly generated tokens, so neither the prompt nor
        # its special tokens are echoed back in the response. This is more
        # robust than slicing the decoded string by len(prompt).
        new_tokens = output_tokens[0][inputs["input_ids"].shape[-1]:]
        response = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        return [{"role": "assistant", "content": response}]
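

if __name__ == "__main__":
    # Minimal local smoke test: a sketch of how the handler is invoked, not
    # part of the Inference Endpoints contract. The model path below is a
    # placeholder assumption; point it at any chat model whose tokenizer
    # defines a chat template.
    handler = EndpointHandler(path="path/to/your-chat-model")
    result = handler({
        "messages": [
            {"role": "user", "content": "Hello, who are you?"}
        ],
        "generation_params": {"max_new_tokens": 50},
    })
    print(result[0]["content"])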