LoneStriker
/

Vistral-7B-ChatML-5.0bpw-h6-exl2

Text Generation

text-generation-inference

Inference Endpoints

Model card Files Files and versions Community

Vistral-7B-ChatML-5.0bpw-h6-exl2 / run.py

LoneStriker's picture

Upload folder using huggingface_hub

818f8ab verified 10 months ago

history blame contribute delete

2.31 kB

	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging, TextStreamer
	from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
	import os, torch, wandb, platform, warnings
	from datasets import load_dataset
	from trl import SFTTrainer

	# --- Model / tokenizer setup -------------------------------------------------
	# Hugging Face access token. Read from the environment rather than hard-coding
	# a secret in the script; when unset, None is passed and the hub client falls
	# back to whatever credentials it already has.
	hf_token = os.environ.get("HF_TOKEN")

	# Location of the fine-tuned PEFT adapter that is applied on top of the base
	# model below. It must be provided explicitly via the environment.
	CHECKPOINT_PATH = os.environ.get("CHECKPOINT_PATH")
	if not CHECKPOINT_PATH:
	    raise SystemExit(
	        "Set the CHECKPOINT_PATH environment variable to the PEFT adapter "
	        "checkpoint directory."
	    )

	tokenizer = AutoTokenizer.from_pretrained('./vistral-tokenizer')

	# Load the base model in 4-bit NF4 with double quantization, computing in
	# bfloat16.
	bnb_config = BitsAndBytesConfig(
	    load_in_4bit=True,
	    bnb_4bit_quant_type="nf4",
	    bnb_4bit_compute_dtype=torch.bfloat16,
	    bnb_4bit_use_double_quant=True,
	)
	model = AutoModelForCausalLM.from_pretrained(
	    'Viet-Mistral/Vistral-7B-Chat',
	    device_map="auto",
	    token=hf_token,
	    quantization_config=bnb_config,
	)
	# Wrap the quantized base model with the fine-tuned adapter weights.
	ft_model = PeftModel.from_pretrained(model, CHECKPOINT_PATH)

	# Optional toggles, left disabled: uncomment to switch off these scaled-dot-
	# product attention backends.
	#torch.backends.cuda.enable_mem_efficient_sdp(False)
	#torch.backends.cuda.enable_flash_sdp(False)

	system_prompt = "Bạn là một trợ lí Tiếng Việt nhiệt tình và trung thực. Hãy luôn trả lời một cách hữu ích nhất có thể, đồng thời giữ an toàn."

	# Generation stops on either the tokenizer's EOS token or the ChatML
	# end-of-turn marker. The marker must be spelled exactly "<|im_end|>" (no
	# backslashes); the last id of its encoding is taken so that any token the
	# tokenizer prepends is skipped.
	# NOTE(review): this assumes "<|im_end|>" encodes to a single token in
	# ./vistral-tokenizer -- confirm.
	stop_tokens = [tokenizer.eos_token_id, tokenizer('<|im_end|>')['input_ids'][-1]]

	def _new_conversation():
	    """Return a fresh chat history containing only the system prompt."""
	    return [{"role": "system", "content": system_prompt}]


	def chat_test():
	    """Run an interactive console chat against the fine-tuned model.

	    Reads user turns from stdin, formats the running conversation with the
	    tokenizer's chat template, samples a reply from ``ft_model`` and prints
	    it. The reply is appended to the history so later turns see it.

	    Console commands (case-insensitive, surrounding whitespace ignored):
	      * ``reset`` -- discard the history and start over.
	      * ``exit``  -- leave the loop.

	    End-of-input (Ctrl-D) or Ctrl-C at the prompt also ends the session.
	    """
	    conversation = _new_conversation()
	    while True:
	        try:
	            human = input("Human: ")
	        except (EOFError, KeyboardInterrupt):
	            # Finish the prompt line so the shell prompt starts cleanly.
	            print()
	            break

	        command = human.strip().lower()
	        if command == "reset":
	            conversation = _new_conversation()
	            print("The chat history has been cleared!")
	            continue
	        if command == "exit":
	            break

	        conversation.append({"role": "user", "content": human})

	        # Render the history as text and open the assistant turn by hand.
	        # The marker must be the exact ChatML string "<|im_start|>" (no
	        # backslashes).
	        formatted = tokenizer.apply_chat_template(conversation, tokenize=False) + "<|im_start|>assistant"
	        tok = tokenizer(formatted, return_tensors="pt").to(ft_model.device)
	        input_ids = tok['input_ids']

	        out_ids = ft_model.generate(
	            input_ids=input_ids,
	            attention_mask=tok['attention_mask'],
	            eos_token_id=stop_tokens,
	            max_new_tokens=50,
	            do_sample=True,
	            top_p=0.95,
	            top_k=40,
	            temperature=0.1,
	            repetition_penalty=1.05,
	        )

	        # Drop the prompt tokens and decode only the newly generated ones.
	        new_tokens = out_ids[:, input_ids.size(1):]
	        assistant = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
	        print("Assistant: ", assistant)
	        conversation.append({"role": "assistant", "content": assistant})

	# Script entry point: start the interactive chat loop.
	chat_test()