Why does my model never generate the "eos_token"?

#159
by Shamy28

Generation always runs all the way to max_tokens and never emits an EOS token. It took me a whole day and I still couldn't figure it out; I tried many methods, but none of them worked. And this message keeps showing up no matter what I do: `Setting pad_token_id to eos_token_id:None for open-end generation.`
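From what I can tell, the `:None` part means `eos_token_id` itself is `None` in the generation config, which would explain why generation never stops. Printing these might confirm it (a minimal check using standard transformers attributes):

```python
# Sketch: check which EOS token the tokenizer and generation config actually carry.
# If these print None, generate() has no stop token to look for.
print(tokenizer.eos_token, tokenizer.eos_token_id)
print(model.generation_config.eos_token_id)
```

Here is my full script: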
```python
import torch
import numpy as np
import h5py
from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    AutoTokenizer,
)

model = LlamaForCausalLM.from_pretrained(
    "/root/autodl-tmp/llama3.1_8B",
    device_map="auto",
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(
    "/root/autodl-tmp/llama3.1_8B",
    use_fast=False,
)

def prompt_int():
    instru = '[9, 8, 8, 3, 0, 2, 1, 3, 6, 5, 0, 3, 3, 7, 3, 0, 7, 2, 6, 0, 0, 0, 0, 5, 0, 5, 1, 1, 5, 0, 6, 2, 8, 7, 9, 4, 7, 3, 4, 3, 8, 0, 9, 5, 1, 4, 5, 6, 5, 6, 9, 6, 9, 2, 3, 5, 3, 5, 9, 9, 3, 8, 4, 6, 2, 6, 0, 6, 7, 8, 8, 1, 0, 7, 2, 1, 5, 0, 7, 5, 8, 2, 7, 6, 2, 0, 8, 8, 6, 8, 1, 7, 5, 4, 1, 0, 3, 9, 0, 6, 6, 1, 4, 9, 5, 0, 3, 5, 5, 3, 7, 8, 1, 2, 4, 0, 5, 7, 2, 7, 8, 6, 2, 3, 5, 3, 2, 2, 3]How many numbers are there in this list? '
    # instru = "Good morning! Who are you? "
    return instru

device_id = 0
batch = tokenizer(
    prompt_int(),
    return_tensors="pt",
    add_special_tokens=True,
)
device = torch.device(f"cuda:{device_id}")
batch = {k: v.to(device) for k, v in batch.items()}

for frame in range(0, 20):
    with torch.no_grad():
        predicted_sentence = model.generate(
            **batch,
            max_length=2048,
        )
    # print(input_tokens)
    generated_text = tokenizer.decode(predicted_sentence[0], skip_special_tokens=True)
    with open("/root/autodl-tmp/test.txt", 'a', encoding='utf-8') as f:
        f.write(generated_text)
    with open("/root/autodl-tmp/test.txt", 'a', encoding='utf-8') as f:
        f.write(f'\n----------------{frame}----------------------\n')
    print(generated_text)
```
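One more thing I'm unsure about: I believe the Llama 3.1 repos only ship a fast tokenizer (tokenizer.json), so `use_fast=False` may be loading something without a proper EOS. Also, the instruct checkpoints end a turn with `<|eot_id|>`, which as far as I understand is only emitted reliably when the prompt goes through the chat template. Here is a sketch of that variant, assuming the checkpoint in `/root/autodl-tmp/llama3.1_8B` is an instruct model that ships a chat template (`max_new_tokens=256` is just an arbitrary cap I picked):

```python
# Sketch: build the prompt with the chat template so the instruct model sees a
# properly delimited turn and can emit its end-of-turn token.
messages = [{"role": "user", "content": prompt_int()}]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,   # append the assistant header so the model answers
    return_tensors="pt",
).to(model.device)

with torch.no_grad():
    out = model.generate(
        input_ids,
        max_new_tokens=256,                   # cap new tokens instead of total length
        pad_token_id=tokenizer.eos_token_id,  # silences the open-end warning
    )

# Decode only the newly generated part.
print(tokenizer.decode(out[0][input_ids.shape[-1]:], skip_special_tokens=True))
```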
