Ketengan-Diffusion-Lab committed
Commit ef394e0
1 Parent(s): df0f804

Update app.py

Files changed (1)
  1. app.py +40 -34
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import torch
 import transformers
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoImageProcessor
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from PIL import Image
 import warnings

@@ -10,61 +10,66 @@ transformers.logging.set_verbosity_error()
 transformers.logging.disable_progress_bar()
 warnings.filterwarnings('ignore')

-# Set device to GPU if available, else CPU
+model_name = 'cognitivecomputations/dolphin-vision-72b'
+
+# Set up GPU memory optimization
+torch.cuda.empty_cache()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print(f"Using device: {device}")

-# Update model path to your local path
-model_name = 'failspy/kappa-3-phi-abliterated'
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

-# create model and load it to the specified device
+# Load model with memory optimizations
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     torch_dtype=torch.float16,
+    low_cpu_mem_usage=True,
     device_map="auto",
-    trust_remote_code=True
-)
-
-tokenizer = AutoTokenizer.from_pretrained(
-    model_name,
-    trust_remote_code=True
+    trust_remote_code=True,
+    offload_folder="offload",  # Offload to disk if necessary
+    offload_state_dict=True,  # Offload state dict to CPU
+    max_memory={0: "40GB"}  # Limit GPU memory usage
 )

 def inference(prompt, image, temperature, beam_size):
-    # Phi-3 uses a chat template
     messages = [
-        {"role": "user", "content": f"Can you describe this image?\n{prompt}"}
+        {"role": "user", "content": f'<image>\n{prompt}'}
     ]
-
-    # Apply chat template and add generation prompt
-    inputs = tokenizer.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        return_tensors="pt"
-    ).to(device)
+    text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+
+    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
+    input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0).to(device)

-    # Process the image using AutoImageProcessor
-    image_processor = AutoImageProcessor.from_pretrained(model_name)
-    pixel_values = image_processor(image, return_tensors="pt").pixel_values.to(device)
+    image_tensor = model.process_images([image], model.config).to(device)

-    # Add debug prints
-    print(f"Device of model: {next(model.parameters()).device}")
-    print(f"Device of inputs: {inputs.input_ids.device}")
-    print(f"Device of pixel_values: {pixel_values.device}")
+    # Clear GPU memory
+    torch.cuda.empty_cache()

-    # generate
+    # Generate with memory optimization
     with torch.cuda.amp.autocast():
         output_ids = model.generate(
-            inputs.input_ids,
-            pixel_values=pixel_values,
+            input_ids,
+            images=image_tensor,
             max_new_tokens=1024,
             temperature=temperature,
             num_beams=beam_size,
-            use_cache=True
+            use_cache=True,
+            do_sample=True,
+            repetition_penalty=1.1,
+            length_penalty=1.0,
+            no_repeat_ngram_size=3
         )[0]

-    return tokenizer.decode(output_ids[inputs.input_ids.shape[1]:], skip_special_tokens=True).strip()
+    # Clear GPU memory again
+    torch.cuda.empty_cache()
+
+    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()

+# Create Gradio interface
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
@@ -82,4 +87,5 @@ with gr.Blocks() as demo:
         outputs=output_text
     )

-demo.launch(share=True)
+# Launch the app
+demo.launch()
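
For anyone exercising this revision locally, the sketch below shows one way to call the updated demo programmatically once demo.launch() is running. It is a minimal, hedged example and not part of the commit: it assumes the app is reachable on Gradio's default local address, that the click handler keeps the auto-generated endpoint name derived from the function ("/inference"), and that the inputs are wired in the same order as the inference signature (prompt, image, temperature, beam_size); client.view_api() will show the real names if they differ.

# Hypothetical client-side smoke test; the sample path and settings are placeholders.
from gradio_client import Client, handle_file  # recent gradio_client; older versions accept a plain filepath

# Assumed local address; a deployed Space would use its public URL instead.
client = Client("http://127.0.0.1:7860/")

result = client.predict(
    "Describe this image in detail.",  # prompt
    handle_file("sample.jpg"),         # image (placeholder path or URL)
    0.7,                               # temperature
    1,                                 # beam_size
    api_name="/inference",             # assumed default endpoint name; verify with client.view_api()
)
print(result)

Note that because this commit drops share=True, the app no longer requests a public *.gradio.live link and is only reachable at its local or Space address by default.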