# try1_deploy_falcon/handler.py

from typing import Any, Dict, List

import torch
import transformers
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer

# bfloat16 requires compute capability 8.0 or newer (Ampere onwards);
# fall back to float16 on older GPUs.
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16


class EndpointHandler:
    def __init__(self, model_path: str = ""):
        # Load the tokenizer and model from the endpoint's repository path.
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            return_dict=True,
            device_map="auto",
            torch_dtype=dtype,
            trust_remote_code=True,
        )

        # Override the model's default generation settings for this endpoint.
        generation_config = model.generation_config
        generation_config.max_new_tokens = 1700
        generation_config.min_length = 20
        generation_config.temperature = 1
        generation_config.top_p = 0.7
        generation_config.num_return_sequences = 1
        generation_config.pad_token_id = tokenizer.eos_token_id
        generation_config.eos_token_id = tokenizer.eos_token_id
        generation_config.repetition_penalty = 1.1

        gpipeline = transformers.pipeline(
            task="text-generation",
            model=model,
            tokenizer=tokenizer,
            return_full_text=True,
            generation_config=generation_config,
        )
        # Wrap the pipeline so it can be driven through LangChain's LLM interface.
        self.llm = HuggingFacePipeline(pipeline=gpipeline)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Inference Endpoints pass the parsed request body; the prompt arrives
        # under the "inputs" key.
        prompt = data.pop("inputs", data)
        result = self.llm(prompt)
        # HuggingFacePipeline returns the generated text as a string; wrap it
        # in the pipeline-style [{"generated_text": ...}] response shape.
        return [{"generated_text": result}]
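

# A minimal local smoke test (a sketch, not part of the endpoint contract).
# Hugging Face Inference Endpoints construct EndpointHandler with the model
# repository path and call it with the request JSON; the "." path and the
# prompt below are placeholder assumptions for local experimentation.
if __name__ == "__main__":
    handler = EndpointHandler(model_path=".")
    response = handler({"inputs": "Summarize what this endpoint does in one sentence."})
    print(response)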