Converted LLaMA from InternLM2.5-7B-Chat
Description
This is a converted model from InternLM2.5-7B-Chat to LLaMA format. This conversion allows you to use InternLM2.5-7B-Chat as if it were a LLaMA model, which is convenient for some inference use cases. The precision is exactly the same as the original model.
Usage
You can load the model using the LlamaForCausalLM
class as shown below:
from typing import List, Optional, Tuple, Union

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
from transformers.generation.streamers import BaseStreamer
# Device to run inference on (the author notes that cpu is exactly the same).
device = "cpu"
# Attention implementation handed to `from_pretrained`.
attn_impl = "eager"

# Default system prompt placed at the start of every conversation.
meta_instruction = "\n".join(
    [
        "You are an AI assistant whose name is InternLM (书生·浦语).",
        (
            "- InternLM (书生·浦语) is a conversational language model that is developed by "
            "Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, "
            "and harmless."
        ),
        (
            "- InternLM (书生·浦语) can understand and communicate fluently in the language "
            "chosen by the user such as English and 中文."
        ),
    ]
)

# Two example user queries; the second is asked as a follow-up to the first.
prompt1 = "介绍下你自己"
prompt2 = "介绍下上海人工智能实验室"
def build_inputs(tokenizer, query: str, history: Optional[List[Tuple[str, str]]] = None, meta_instruction=meta_instruction):
    """Render a conversation in the ``<|im_start|>``/``<|im_end|>`` chat format and tokenize it.

    Args:
        tokenizer: Callable tokenizer exposing ``add_bos_token`` and ``bos_token``.
        query: The new user message to be answered.
        history: Earlier ``(user, assistant)`` turns, oldest first. ``None`` is
            treated as an empty history.
        meta_instruction: System prompt; the system turn is omitted when this
            is empty.

    Returns:
        The result of ``tokenizer([prompt], return_tensors="pt")``.
    """
    if history is None:
        history = []

    # The BOS token is written into the text only when the tokenizer is not
    # configured to add it by itself.
    segments = ["" if tokenizer.add_bos_token else tokenizer.bos_token]

    if meta_instruction:
        segments.append(f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n""")

    for record in history:
        segments.append(
            f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
        )

    # Leave the assistant turn open so that generation continues from here.
    segments.append(f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n""")

    prompt = "".join(segments)
    return tokenizer([prompt], return_tensors="pt")
@torch.inference_mode()
def chat(
    model: Union[AutoModelForCausalLM, LlamaForCausalLM],
    tokenizer,
    query: str,
    history: Optional[List[Tuple[str, str]]] = None,
    streamer: Optional[BaseStreamer] = None,
    max_new_tokens: int = 1024,
    do_sample: bool = True,
    temperature: float = 0.8,
    top_p: float = 0.8,
    meta_instruction: str = meta_instruction,
    **kwargs,
):
    """Generate one assistant reply for ``query`` and return it with the extended history.

    Args:
        model: Model whose ``generate`` method and ``device`` attribute are used.
        tokenizer: Tokenizer used to build the prompt and decode the output.
        query: The new user message.
        history: Earlier ``(user, assistant)`` turns; ``None`` means none.
        streamer: Passed through to ``model.generate``.
        max_new_tokens: Passed through to ``model.generate``.
        do_sample: Passed through to ``model.generate``.
        temperature: Passed through to ``model.generate``.
        top_p: Passed through to ``model.generate``.
        meta_instruction: System prompt forwarded to ``build_inputs``.
        **kwargs: Extra keyword arguments forwarded to ``model.generate``.

    Returns:
        A ``(response, history)`` tuple, where ``history`` is a new list made of
        the given turns followed by ``(query, response)``.
    """
    past_turns = [] if history is None else history

    encoded = build_inputs(tokenizer, query, past_turns, meta_instruction)
    # Keep only tensor entries and move them to the device the model lives on.
    model_inputs = {}
    for name, value in encoded.items():
        if torch.is_tensor(value):
            model_inputs[name] = value.to(model.device)

    # Treat the end-of-assistant marker as an extra stop token so generation
    # does not run on past the end of the reply.
    stop_ids = [tokenizer.eos_token_id]
    stop_ids.append(tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0])

    generated = model.generate(
        **model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        eos_token_id=stop_ids,
        **kwargs,
    )

    # Drop the prompt tokens; only the newly generated tail is decoded.
    prompt_length = len(model_inputs["input_ids"][0])
    new_token_ids = generated[0].cpu().tolist()[prompt_length:]
    decoded = tokenizer.decode(new_token_ids, skip_special_tokens=True)
    response, _, _ = decoded.partition("<|im_end|>")

    return response, past_turns + [(query, response)]
# The tokenizer comes from the official InternLM repository.
tokenizer = AutoTokenizer.from_pretrained(
    "internlm/internlm2_5-7b-chat",
    trust_remote_code=True,
)

# The weights come from the converted LLaMA-format checkpoint.
llama_model = LlamaForCausalLM.from_pretrained(
    "silence09/InternLM2.5-7B-Chat-Converted-LlaMA",
    torch_dtype="auto",
    attn_implementation=attn_impl,
)
llama_model = llama_model.to(device)
llama_model.eval()

# First turn: start from an empty history, with sampling disabled.
response_llama_and_splitfunc_1, history = chat(
    llama_model,
    tokenizer,
    prompt1,
    history=[],
    do_sample=False,
)
print(f"User Input: {prompt1}\nConverted LlaMA Response: {response_llama_and_splitfunc_1}")

# Second turn: reuse the history returned by the first turn.
response_llama_and_splitfunc_2, history = chat(
    llama_model,
    tokenizer,
    prompt2,
    history=history,
    do_sample=False,
)
print(f"User Input: {prompt2}\nConverted LlaMA Response: {response_llama_and_splitfunc_2}")
Precision Guarantee
To compare results with the original model, you can use this code
More Info
It was converted using the python script available at this repository
- Downloads last month
- 26
Model tree for silence09/InternLM2.5-7B-Chat-Converted-LlaMA
Base model
internlm/internlm2_5-7b-chat