However, with assisted decoding, reducing the temperature may help improve latency: a lower temperature sharpens the token distribution, so more of the assistant model's candidate tokens are accepted and fewer verification passes of the large model are needed.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed

set_seed(42)  # For reproducibility

prompt = "Alice and Bob"
checkpoint = "EleutherAI/pythia-1.4b-deduped"
assistant_checkpoint = "EleutherAI/pythia-160m-deduped"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(prompt, return_tensors="pt")

# The large model generates; the smaller assistant model drafts candidate tokens
model = AutoModelForCausalLM.from_pretrained(checkpoint)
assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)

outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are going to the same party. It is a small party, in a small']
```
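
To see the latency effect directly, you can time generation with and without the assistant model. The snippet below is a minimal sketch that reuses the `model`, `assistant_model`, and `inputs` objects from the example above; actual speedups depend on your hardware and on how often the assistant's candidate tokens are accepted.

```python
import time

from transformers import set_seed

# Rough timing comparison (assumes `model`, `assistant_model`, and `inputs` from above).
# Speedups vary with hardware and the assistant's acceptance rate.
for label, assistant in [("regular", None), ("assisted", assistant_model)]:
    set_seed(42)  # Re-seed so both runs sample comparably
    start = time.perf_counter()
    model.generate(
        **inputs,
        assistant_model=assistant,
        do_sample=True,
        temperature=0.5,
        max_new_tokens=64,
    )
    print(f"{label}: {time.perf_counter() - start:.2f}s")
```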