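"""Gradio demo for cognitivecomputations/dolphin-vision-72b.

Loads the model via Transformers (trust_remote_code) in float16 with
device_map="auto" plus CPU/disk offload, and exposes a simple
prompt + image interface for visual question answering.
"""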
import gradio as gr
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings

# disable some warnings
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')

model_name = 'cognitivecomputations/dolphin-vision-72b'

# Set up GPU memory optimization
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load model with memory optimizations
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
    trust_remote_code=True,
    offload_folder="offload",  # Offload to disk if necessary
    offload_state_dict=True,   # Offload state dict to CPU
    max_memory={0: "40GB"}     # Limit GPU memory usage
)

def inference(prompt, image, temperature, beam_size):
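    """Answer `prompt` about `image` and return the decoded text.

    `temperature` and `beam_size` are passed straight through to `model.generate`.
    """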
    messages = [
        {"role": "user", "content": f'<image>\n{prompt}'}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Split the chat text at the <image> placeholder and splice the image token id
    # (-200) between the chunks so the model knows where to insert the image features
    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
    input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0).to(device)

    # Preprocess the image and cast it to the model's dtype (float16) to avoid dtype mismatches
    image_tensor = model.process_images([image], model.config).to(dtype=model.dtype, device=device)

    # Clear GPU memory
    torch.cuda.empty_cache()

    # Generate with memory optimization
    with torch.cuda.amp.autocast():
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            max_new_tokens=1024,
            temperature=temperature,
            num_beams=beam_size,
            use_cache=True,
            do_sample=True,
            repetition_penalty=1.1,
            length_penalty=1.0,
            no_repeat_ngram_size=3
        )[0]

    # Clear GPU memory again
    torch.cuda.empty_cache()

    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()

# Create Gradio interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(label="Prompt", placeholder="Describe this image in detail")
            image_input = gr.Image(label="Image", type="pil")
            temperature_input = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
            beam_size_input = gr.Slider(minimum=1, maximum=10, value=4, step=1, label="Beam Size")
            submit_button = gr.Button("Submit")
        with gr.Column():
            output_text = gr.Textbox(label="Output")

    submit_button.click(
        fn=inference, 
        inputs=[prompt_input, image_input, temperature_input, beam_size_input], 
        outputs=output_text
    )

# Launch the app
demo.launch()