Sample Script Here

#6
by ctranslate2-4you - opened

I hate it when repo owners don't give detailed examples, so here you go, people... The vision capabilities were actually pretty good:

from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor, BitsAndBytesConfig
import torch
from PIL import Image

def process_image(image_path):
    model_id = r"[PATH TO LOCAL DIRECTOR ON COMPUTER OR THE REPOSITORY ID NOT IN A RAW STRING OBVIOUSLY"
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # instantiate model
    model = LlavaNextForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        device_map="auto",
        trust_remote_code=True
    )

    # instantiate processor
    processor = LlavaNextProcessor.from_pretrained(model_id, tokenizer_class='PreTrainedTokenizerFast', trust_remote_code=True)

    image = Image.open(image_path).convert("RGB")  # make sure it's RGB; some files load as RGBA/paletted
    instruction = "Describe this image in detail as possible but be succinct and don't repeat yourself."
    prompt = f"User:<image>\n{instruction} Falcon:"
    inputs = processor(text=prompt, images=image, return_tensors="pt", padding=True).to("cuda:0")

    output = model.generate(**inputs, max_new_tokens=512)

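    # model.generate() returns the prompt tokens followed by the new tokens, so slice the prompt off before decoding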
    prompt_length = inputs['input_ids'].shape[1]
    model_response = processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

    print(f"\n{model_response}\n")

if __name__ == "__main__":
    input_image_path = r"[PATH TO A LOCAL FILE ON YOUR COMPUTER]"
    process_image(input_image_path)
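
To run it you need what the script imports, plus bitsandbytes (for the 4-bit config) and accelerate (for device_map="auto"). A rough environment sketch; the version pin is my guess, not anything this repo specifies:

pip install "transformers>=4.39" accelerate bitsandbytes pillow
# plus a CUDA build of torch; any recent Python (3.10+) should be fine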

Could you please provide information on which version of Python and which requirements are needed? Could you also upload those details?
Thanks, Mirosalv
