GPTV

Sleeping

App Files Files Community

Abhaykoul committed on Mar 1

Commit

6757f4d

•

1 Parent(s): 33d8c37

Create app.py

Browse files

Files changed (1) hide show

app.py +97 -0

app.py ADDED Viewed

	@@ -0,0 +1,97 @@

+from __future__ import annotations
+import os
+import hashlib
+import torch
+from threading import Thread
+from transformers import AutoModel, AutoProcessor, TextIteratorStreamer
+import gradio as gr
+# Initialize the model and processor
+def initialize_model_and_processor():
+    model = AutoModel.from_pretrained("OEvortex/HelpingAI-Vision", torch_dtype=torch.float16, trust_remote_code=True).to("cuda" if torch.cuda.is_available() else "cpu")
+    processor = AutoProcessor.from_pretrained("OEvortex/HelpingAI-Vision", trust_remote_code=True)
+    return model, processor
+# Function to process images and cache results
+def cached_vision_process(image, max_crops, num_tokens):
+    image_hash = hashlib.sha256(image.tobytes()).hexdigest()
+    cache_path = f"visual_cache/{image_hash}-{max_crops}-{num_tokens}.pt"
+    if os.path.exists(cache_path):
+        return torch.load(cache_path).to(model.device, dtype=model.dtype)
+    else:
+        processor_outputs = processor.image_processor([image], max_crops)
+        pixel_values = [value.to(model.device, model.dtype) for value in processor_outputs["pixel_values"]]
+        coords = [value.to(model.device, model.dtype) for value in processor_outputs["coords"]]
+        image_outputs = model.vision_model(pixel_values, coords, num_tokens)
+        image_features = model.multi_modal_projector(image_outputs)
+        os.makedirs("visual_cache", exist_ok=True)
+        torch.save(image_features, cache_path)
+        return image_features.to(model.device, model.dtype)
+# Function to answer questions about images
+def answer_question(image, question, max_crops, num_tokens, sample, temperature, top_k):
+    if not question.strip() or not image:
+        return "Please provide both an image and a question."
+    prompt = f"""user
+<image>
+{question}
+assistant
+"""
+    streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True)
+    with torch.inference_mode():
+        inputs = processor(prompt, [image], model, max_crops=max_crops, num_tokens=num_tokens)
+    generation_kwargs = {
+        "input_ids": inputs["input_ids"],
+        "attention_mask": inputs["attention_mask"],
+        "image_features": inputs["image_features"],
+        "streamer": streamer,
+        "max_length": 1000,
+        "use_cache": True,
+        "eos_token_id": processor.tokenizer.eos_token_id,
+        "pad_token_id": processor.tokenizer.eos_token_id,
+        "temperature": temperature,
+        "do_sample": sample,
+        "top_k": top_k,
+    }
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    output_started = False
+    for new_text in streamer:
+        if not output_started:
+            if "assistant" in new_text:
+                output_started = True
+            continue
+        buffer += new_text
+        if len(buffer) > 1:
+            yield buffer
+    return buffer
+# Initialize the model and processor
+model, processor = initialize_model_and_processor()
+# Gradio interface setup
+with gr.Blocks() as demo:
+    with gr.Group():
+        with gr.Row():
+            prompt = gr.Textbox(label="Question", placeholder="e.g. Describe this?", scale=4)
+            submit = gr.Button("Send", scale=1)
+        with gr.Row():
+            max_crops = gr.Slider(minimum=0, maximum=200, step=5, value=0, label="Max crops")
+            num_tokens = gr.Slider(minimum=728, maximum=2184, step=10, value=728, label="Number of image tokens")
+        with gr.Row():
+            img = gr.Image(type="pil", label="Upload or Drag an Image")
+            output = gr.TextArea(label="Answer")
+        with gr.Row():
+            sample = gr.Checkbox(label="Sample", value=False)
+            temperature = gr.Slider(minimum=0, maximum=1, step=0.1, value=0, label="Temperature")
+            top_k = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Top-K")
+    submit.click(answer_question, [img, prompt, max_crops, num_tokens, sample, temperature, top_k], output)
+    prompt.submit(answer_question, [img, prompt, max_crops, num_tokens, sample, temperature, top_k], output)
+demo.queue().launch(debug=True)