Spaces:
Runtime error
Runtime error
import io
import os

import requests
import streamlit as st
import torch
from PIL import Image
##CLIP
# Candidate scene descriptions for CLIP zero-shot classification. The label
# that best matches the image is inserted verbatim into the LLM prompt as
# context, so entries must not carry leading/trailing whitespace.
scene_labels = [
    'Arrest',
    'Arson',
    'Explosion',
    'public fight',
    'Normal',
    'Road Accident',
    'Robbery',
    'Shooting',
    'Stealing',
    'Vandalism',
    'Suspicious activity',
    'Tailgating',
    'Unauthorized entry',
    'Protest/Demonstration',
    'Drone suspicious activity',
    'Fire/Smoke detection',
    'Medical emergency',
    'Suspicious package/object',
    'Threatening',
    'Attack',
    'Shoplifting',
    'burglary',
    'distress',
    'assault',
]
from transformers import CLIPModel, CLIPProcessor, pipeline

model_id = "openai/clip-vit-large-patch14"


# Streamlit re-executes this script on every widget interaction; caching the
# loaders keeps a single copy of each model alive per server process instead
# of re-instantiating them on every rerun.
@st.cache_resource
def _load_clip(clip_model_id):
    """Return the ``(processor, model)`` pair for the given CLIP checkpoint."""
    clip_processor = CLIPProcessor.from_pretrained(clip_model_id)
    clip_model = CLIPModel.from_pretrained(clip_model_id)
    return clip_processor, clip_model


@st.cache_resource
def _load_captioner():
    """Return the BLIP image-to-text pipeline used for the initial caption."""
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")


processor, model = _load_clip(model_id)

##BLIP
image_to_text = _load_captioner()
##LLM
from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

st.info("Please check here- https://ai.google.dev/tutorials/web_quickstart , to get your GOOGLE API KEY. This is mandatory to use this app")

# Keys are usually pasted in; strip stray surrounding whitespace so it does
# not end up inside the credential sent to the API.
GOOGLE_API_KEY = st.text_input("Please enter your GOOGLE API KEY", type="password").strip()

# Only export a real key: on the first run the field is empty, and writing an
# empty string would leave a blank credential in the process environment.
if GOOGLE_API_KEY:
    os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY
# Prompt text for the caption-refinement step. It has two placeholders:
# {initial_caption} (the raw caption) and {context} (the scene description).
# The lines are joined with newlines to form one multi-line prompt.
template = "\n".join([
    "You are an advanced image captioning AI assistant for surveillance related images.",
    "Your task is to refine and improve an initial image caption using relevant contextual information provided.",
    "You will receive two inputs:",
    "Input 1: {initial_caption} - This is the initial caption for the image, most likely grammatically incorrect",
    "and incomplete sentence, generated by a separate not so good image captioning model.",
    "Input 2: {context} - This is the contextual information that provides more details about the background",
    "Your goal is to take the initial caption and the additional context, and produce a new, refined caption that",
    "incorporates the contextual details.",
    "Please do not speculate things which are not provided. The final caption should be grammatically correct.",
    "Please output only the final caption.",
])
# Wrap the prompt text so its two placeholders can be filled in per image.
prompt_template = PromptTemplate(input_variables=["initial_caption", "context"], template=template)
# Title
st.title("Image Caption Surveillance")

# Input field for URL
image_url = st.text_input("Enter the URL of the image:")

# For a submitted URL: show the image, pick the best-matching scene label
# with CLIP, caption the image with BLIP, then have the LLM merge the two
# into one refined caption. Any failure is reported in the UI.
if image_url:
    try:
        st.image(image_url, caption="Uploaded Image")

        # Fail fast with a clear message before running the expensive models;
        # the LLM step cannot succeed without a key.
        if not GOOGLE_API_KEY:
            raise ValueError("GOOGLE API KEY is missing; please enter it above.")

        # Download the image once and reuse it for both models. The timeout
        # keeps a dead host from hanging the app, and raise_for_status turns
        # an HTTP error page into a clear error instead of a decode failure.
        http_response = requests.get(image_url, timeout=30)
        http_response.raise_for_status()
        image = Image.open(io.BytesIO(http_response.content)).convert("RGB")

        ##CLIP
        inputs = processor(text=scene_labels, images=image, return_tensors="pt", padding=True)
        # Inference only: skip autograd bookkeeping to save memory.
        with torch.no_grad():
            outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image  # image-text similarity scores
        probs = logits_per_image.softmax(dim=1)  # label probabilities
        # One image was passed, so row 0 holds its scores; convert to a plain
        # int rather than indexing the list with a tensor.
        best_label_index = int(probs.argmax(dim=-1)[0])
        raw_context = scene_labels[best_label_index]
        context = 'the image is depicting scene of ' + raw_context

        ##BLIP
        # Pass the already-downloaded image instead of the URL to avoid a
        # second network fetch.
        caption = image_to_text(image, max_new_tokens=200)
        initial_caption = caption[0]['generated_text']

        ##LLM
        llm = ChatGoogleGenerativeAI(
            model="gemini-1.0-pro",
            google_api_key=GOOGLE_API_KEY,
            temperature=0.2,
            top_p=1,
            top_k=1,
            safety_settings={
                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            },
        )
        prompt = prompt_template.format(initial_caption=initial_caption, context=context)
        response = llm.invoke(prompt)
        final_caption = response.content

        ##Output
        st.write("CAPTION: ", final_caption)
    except Exception as e:
        # Top-level UI boundary: surface any failure to the user instead of
        # crashing the app.
        st.error(f"Error: {e}")
else:
    st.warning("Please enter a valid image URL.")