from typing import Optional
import gradio as gr
import torch
from peft import PeftModel
from transformers import GenerationConfig
from transformers import LlamaForCausalLM
from transformers import LlamaTokenizer
print("starting server ...")
BASE_MODEL = "decapoda-research/llama-13b-hf"
LORA_WEIGHTS = "izumi-lab/llama-13b-japanese-lora-v0-1ep"
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

# Pick the best available device: CUDA if present, then Apple Silicon MPS, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
try:
    if torch.backends.mps.is_available():
        device = "mps"
except Exception:
    pass

# Load the base LLaMA model and attach the Japanese LoRA adapter.
# fp16 is used on CUDA/MPS; the CPU path loads in full precision with low_cpu_mem_usage.
if device == "cuda":
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=False,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(model, LORA_WEIGHTS, torch_dtype=torch.float16)
elif device == "mps":
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
    model = PeftModel.from_pretrained(
        model,
        LORA_WEIGHTS,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
else:
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL, device_map={"": device}, low_cpu_mem_usage=True
    )
    model = PeftModel.from_pretrained(
        model,
        LORA_WEIGHTS,
        device_map={"": device},
    )


def generate_prompt(instruction: str, input: Optional[str] = None):
    """Build an Alpaca-style prompt, with or without an additional input section."""
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""
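
# Illustrative usage (not executed): generate_prompt("東京から大阪に行くには?") returns the
# instruction-only template, while passing an input string adds the "### Input:" section;
# evaluate() below relies on the trailing "### Response:" marker to split out the answer.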

# Cast to fp16 on accelerators, switch to inference mode, and compile on PyTorch 2.x.
if device != "cpu":
    model.half()
model.eval()
if torch.__version__ >= "2":
    model = torch.compile(model)


def evaluate(
    instruction: str,
    input: Optional[str] = None,
    temperature: float = 0.7,
    top_p: float = 1.0,
    top_k: int = 40,
    num_beams: int = 4,
    max_new_tokens: int = 256,
    **kwargs,
):
    """Generate a response for the given instruction (and optional input)."""
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    # Return only the text that follows the "### Response:" marker.
    return output.split("### Response:")[1].strip()
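
# Optional smoke test (illustrative sketch): uncomment to run one generation directly
# before the Gradio interface starts, reusing the example instruction from the UI placeholder.
# print(evaluate("東京から大阪に行くには?", max_new_tokens=64))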

# Gradio UI: the sliders map directly to the sampling parameters of evaluate().
g = gr.Interface(
    fn=evaluate,
    inputs=[
        gr.components.Textbox(lines=2, label="Instruction", placeholder="東京から大阪に行くには?"),
        gr.components.Textbox(lines=2, label="Input", placeholder="none"),
        gr.components.Slider(minimum=0, maximum=1, value=0.7, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=1.0, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
        gr.components.Slider(
            minimum=1, maximum=512, step=1, value=128, label="Max tokens"
        ),
    ],
    outputs=[
        gr.components.Textbox(
            lines=5,
            label="Output",
        )
    ],
    title="izumi-lab/llama-13b-japanese-lora-v0-1ep",
    description="izumi-lab/llama-13b-japanese-lora-v0-1ep is a LLaMA-13B model finetuned to follow Japanese instructions. It is trained on the [izumi-lab/llm-japanese-dataset](https://huggingface.co/datasets/izumi-lab/llm-japanese-dataset) dataset and makes use of the Hugging Face LLaMA implementation. For more information, please visit [the project's website](https://llm.msuzuki.me).",
)
g.queue(concurrency_count=1)
print("loading completed")
g.launch()
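
# Optional variation (assumption, not required by this setup): g.launch(server_name="0.0.0.0",
# server_port=7860) exposes the app beyond localhost, e.g. when running inside a container.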