import gradio as gr
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from PIL import Image
import warnings

# Silence transformers warnings and progress bars to keep console output clean
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')

# Set device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_name = 'cognitivecomputations/dolphin-vision-72b'

# Configure 8-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False
)

# Load the model with 8-bit quantization; device_map="auto" handles placement automatically
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",  # This will automatically use the GPU if available
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

def inference(prompt, image):
    messages = [
        {"role": "user", "content": f'<image>\n{prompt}'}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

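    # Split the prompt around the <image> tag and splice in -200, the image-token
    # placeholder index expected by the model's remote code (LLaVA-style convention)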
    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
    input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0).to(device)

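    # process_images is a helper defined in the model's remote code; it converts
    # the PIL image into the tensor format the vision tower expects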
    image_tensor = model.process_images([image], model.config).to(device)

    # Debug prints: confirm the model and inputs ended up on the same device
    print(f"Device of model: {next(model.parameters()).device}")
    print(f"Device of input_ids: {input_ids.device}")
    print(f"Device of image_tensor: {image_tensor.device}")

    # Generate the response without tracking gradients
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            max_new_tokens=1024,
            use_cache=True
        )[0]

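    # Drop the prompt tokens and decode only the newly generated text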
    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(label="Prompt", placeholder="Describe this image in detail")
            image_input = gr.Image(label="Image", type="pil")
            submit_button = gr.Button("Submit")
        with gr.Column():
            output_text = gr.Textbox(label="Output")

    submit_button.click(fn=inference, inputs=[prompt_input, image_input], outputs=output_text)

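# share=True serves the app locally and also creates a temporary public Gradio link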
demo.launch(share=True)