from typing import Any, Dict

import torch
import transformers
from langchain.llms import HuggingFacePipeline  # moved to langchain_community.llms in newer LangChain releases
from transformers import AutoModelForCausalLM, AutoTokenizer

# Use bfloat16 on GPUs with compute capability >= 8 (Ampere and newer); fall back to float16.
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16


class EndpointHandler:
    def __init__(self, model_path=""):
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            return_dict=True,
            device_map="auto",  # requires the `accelerate` package
            torch_dtype=dtype,
            trust_remote_code=True,
        )

        # Tune the model's default generation settings for this endpoint.
        generation_config = model.generation_config
        generation_config.max_new_tokens = 1700
        generation_config.min_length = 20
        generation_config.temperature = 1.0
        generation_config.top_p = 0.7
        generation_config.num_return_sequences = 1
        generation_config.pad_token_id = tokenizer.eos_token_id
        generation_config.eos_token_id = tokenizer.eos_token_id
        generation_config.repetition_penalty = 1.1

        gpipeline = transformers.pipeline(
            task="text-generation",
            model=model,
            tokenizer=tokenizer,
            return_full_text=True,
            generation_config=generation_config,
        )
        self.llm = HuggingFacePipeline(pipeline=gpipeline)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        # Inference Endpoints delivers the request body as a dict; the prompt
        # is expected under the "inputs" key.
        prompt = data.pop("inputs", data)
        result = self.llm(prompt)
        # Wrap the generated string so the response matches the declared return type.
        return {"generated_text": result}
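
# A minimal local smoke test, as a sketch only: in production, Inference
# Endpoints constructs EndpointHandler with the deployed model directory and
# calls it once per request, so this block is never executed there. The model
# id below is a placeholder assumption; substitute any causal LM you can load.
if __name__ == "__main__":
    handler = EndpointHandler(model_path="tiiuae/falcon-7b-instruct")  # assumed example model
    response = handler({"inputs": "Summarize what this endpoint does in one sentence."})
    print(response["generated_text"])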