import gradio as gr
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings
# disable some warnings
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')
model_name = 'cognitivecomputations/dolphin-vision-72b'
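# NOTE: dolphin-vision-72b is a ~72B-parameter vision-language model; its fp16
# weights alone are roughly 144 GB (72e9 params x 2 bytes), which is why the
# loading options below cap GPU usage and offload the remainder to CPU/disk.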
# Set up GPU memory optimization
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
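# With device_map="auto" below, accelerate decides where each layer lives;
# `device` here is only used to place the input tensors (cuda:0 when available).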
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Load model with memory optimizations
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
    trust_remote_code=True,
    offload_folder="offload",   # Offload to disk if necessary
    offload_state_dict=True,    # Offload state dict to CPU
    max_memory={0: "40GB"}      # Limit GPU memory usage
)
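# Optional sanity check: when a model is loaded with device_map="auto",
# accelerate records the per-module placement in `hf_device_map`.
# print(model.hf_device_map)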
def inference(prompt, image, temperature, beam_size):
    messages = [
        {"role": "user", "content": f'<image>\n{prompt}'}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Split the prompt around the <image> placeholder and splice in -200,
    # the image-token sentinel this model's remote code expects
    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
    input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0).to(device)
    # Cast to the model's fp16 dtype so the vision tower doesn't receive fp32 inputs
    image_tensor = model.process_images([image], model.config).to(dtype=model.dtype, device=device)
    # Clear cached GPU memory before generation
    torch.cuda.empty_cache()
    # Generate under autocast (mixed precision) to reduce memory pressure
    with torch.cuda.amp.autocast():
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            max_new_tokens=1024,
            temperature=temperature,
            num_beams=beam_size,
            use_cache=True,
            do_sample=True,
            repetition_penalty=1.1,
            length_penalty=1.0,
            no_repeat_ngram_size=3
        )[0]
    # Clear GPU memory again after generation
    torch.cuda.empty_cache()
    # Decode only the newly generated tokens, skipping the prompt portion
    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
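# Hypothetical smoke test for inference() before wiring up the UI
# ("example.jpg" is a placeholder path; assumes a CUDA GPU is available):
# print(inference("Describe this image in detail", Image.open("example.jpg"), 0.7, 4))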
# Create Gradio interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(label="Prompt", placeholder="Describe this image in detail")
            image_input = gr.Image(label="Image", type="pil")
            temperature_input = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
            beam_size_input = gr.Slider(minimum=1, maximum=10, value=4, step=1, label="Beam Size")
            submit_button = gr.Button("Submit")
        with gr.Column():
            output_text = gr.Textbox(label="Output")
    submit_button.click(
        fn=inference,
        inputs=[prompt_input, image_input, temperature_input, beam_size_input],
        outputs=output_text
    )
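# Optional: a single 72B generation can take a while; Blocks.queue() (standard
# Gradio API) serializes concurrent requests so they don't contend for GPU memory.
# demo.queue()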
# Launch the app
demo.launch()