Output nonsense compared to llama.cpp #5
opened by umiuni
import torch
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer
# Load the AutoGPTQForCausalLM model and tokenizer
model_name_or_path = "/scratch/yerong/.cache/pyllama/Llama-2-7B-GPTQ"
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename="model",
    inject_fused_attention=False,
    use_safetensors=True,
    trust_remote_code=False,
    device="cuda:0",
    use_triton=False,
    quantize_config=None,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
# Set the padding token to be equal to the end of sequence token (eos_token)
tokenizer.pad_token = tokenizer.eos_token
# Set the device (CPU or GPU)
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)
# Define the prompt (a batch of prompts is commented out below)
sentence = "Elon Musk is a South African-born Canadian-American business magnate, investor and inventor. He is the founder, CEO, and chief engineer/designer of SpaceX; co-founder, CEO, and product architect"
# sentences = [
# "Elon Musk is a South African-born Canadian-American business magnate, investor and inventor. He is the founder, CEO, and chief engineer/designer of SpaceX; co-founder, CEO, and product architect",
# # "Prompt 2: Bill Gates dropped out of Harvard",
# # "Prompt 3: Another prompt to generate text.",
# # "Prompt 4: Yet another prompt for AutoGPT.",
# # "Prompt 5: A fifth prompt to see what it generates.",
# # "Prompt 6: AutoGPT can generate text creatively.",
# # "Prompt 7: Let's test another prompt.",
# # "Prompt 8: The final prompt for this batch.",
# ]
input_ids = tokenizer(sentence, return_tensors='pt', truncation=True, padding="max_length", max_length=512).input_ids.cuda()
# input_ids = tokenizer(sentences, return_tensors='pt', truncation=True, padding="max_length", max_length=512).input_ids.cuda()
with torch.no_grad():
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        top_p=0.9,
        temperature=0.01,
        top_k=40,
    )
# print(tokenizer.batch_decode(outputs[:, input_ids.shape[1]:], skip_special_tokens=True))
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
Loading the model prints this warning:
skip module injection for FusedLlamaMLPForQuantizedModel not support integrate without triton yet.
The decoded output looks like this (first the new tokens only, then with the prompt included):
['nobody nobody nobody nobody\n everybody nobody nobody nobody nobody nobody nobody nobody\n nobody\n nobody\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
\n\n\n\n\n._\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n nobody\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n nobody\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']
['Elon Musk is a South African-born Canadian-American business magnate, investor and inventor. He is the founder, CEO, and chief engineer/designer of SpaceX; co-founder, CEO, and product architect nobody nobody nobody nobody\n everybody nobody nobody nobody nobody nobody nobody nobody\n nobody\n nobody\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n._\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n nobody\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n nobody\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']
I saw something similar and solved the issue by setting the temperature to 0.
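For reference, here is a minimal sketch of that suggestion applied to the snippet above (it reuses the same model, tokenizer and input_ids). As far as I know, transformers' generate() rejects a literal temperature of 0 when do_sample=True, so the usual equivalent is greedy decoding with do_sample=False:

with torch.no_grad():
    # Greedy decoding is the practical equivalent of "temperature 0";
    # model, tokenizer and input_ids come from the code posted above.
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=False,
    )
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))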