multi-meeting-QnA

Sleeping

multi-meeting-QnA / connections / model_unsloth.py

Rename connections/models.py to connections/model_unsloth.py

4d90423 verified 5 months ago

1.16 kB

	import spaces

	@spaces.GPU
	def get_unsloth():
	    """Import Unsloth on demand and return its ``FastLanguageModel`` class.

	    The import statement lives inside the function body, so ``unsloth`` is
	    loaded when this function is called rather than when the module's
	    top-level imports run.
	    NOTE(review): presumably deferred so the import executes under the
	    ``spaces.GPU`` decorator — confirm against the Spaces documentation.
	    """
	    from unsloth import FastLanguageModel as fast_language_model

	    return fast_language_model

	# Resolve the Unsloth class once at import time and bind it to a module-level
	# name; InferencePipeline looks up ``FastLanguageModel`` from module scope.
	FastLanguageModel = get_unsloth()


	class InferencePipeline:
	    """Load a model through ``FastLanguageModel`` and run text generation.

	    Configuration keys read from ``conf["model"]``:
	    ``model_name``, ``max_seq_length``, ``dtype``, ``load_in_4bit`` (used
	    when loading) and ``max_new_tokens`` (used when generating).
	    """

	    def __init__(self, conf, api_key):
	        """Store the configuration and token, then load model and tokenizer.

	        Args:
	            conf: Mapping with a ``"model"`` sub-mapping holding the keys
	                listed in the class docstring.
	            api_key: Value forwarded as ``token`` to
	                ``FastLanguageModel.from_pretrained``.
	        """
	        self.conf = conf
	        self.token = api_key
	        # Loading happens eagerly, so constructing the pipeline loads the model.
	        self.model, self.tokenizer = self.get_model()

	    def get_model(self):
	        """Load the configured model and switch it to inference mode.

	        Returns:
	            The ``(model, tokenizer)`` pair returned by
	            ``FastLanguageModel.from_pretrained``.
	        """
	        model_conf = self.conf["model"]
	        model, tokenizer = FastLanguageModel.from_pretrained(
	            model_name=model_conf["model_name"],
	            max_seq_length=model_conf["max_seq_length"],
	            dtype=model_conf["dtype"],
	            load_in_4bit=model_conf["load_in_4bit"],
	            token=self.token,
	        )

	        # Original author's note: enables native 2x faster inference.
	        FastLanguageModel.for_inference(model)
	        return model, tokenizer

	    def infer(self, prompt):
	        """Generate a completion for a single prompt.

	        Args:
	            prompt: Prompt text; it is wrapped in a one-element batch before
	                being passed to the tokenizer.

	        Returns:
	            The result of ``self.tokenizer.batch_decode`` applied to the
	            generated output.
	        """
	        # Inputs are moved to "cuda" unconditionally, so a CUDA device is
	        # required at call time.
	        inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda")
	        # Use the instance's model/tokenizer: the previous bare ``model`` and
	        # ``tokenizer`` names were not defined in this scope and raised
	        # NameError on every call.
	        outputs = self.model.generate(
	            **inputs,
	            max_new_tokens=self.conf["model"]["max_new_tokens"],
	            use_cache=True,
	        )
	        return self.tokenizer.batch_decode(outputs)