import requests
import streamlit as st
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor, pipeline

## CLIP: candidate scene labels for zero-shot classification
scene_labels = [
    'Arrest', 'Arson', 'Explosion', 'public fight', 'Normal', 'Road Accident',
    'Robbery', 'Shooting', 'Stealing', 'Vandalism', 'Suspicious activity',
    'Tailgating', 'Unauthorized entry', 'Protest/Demonstration',
    'Drone suspicious activity', 'Fire/Smoke detection', 'Medical emergency',
    'Suspicious package/object', 'Threatening', 'Attack', 'Shoplifting',
    'burglary', 'distress', 'assault',
]

model_id = "openai/clip-vit-large-patch14"
processor = CLIPProcessor.from_pretrained(model_id)
model = CLIPModel.from_pretrained(model_id)

## BLIP: image captioning pipeline
image_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")

# Title
st.title("Image Caption Surveillance")

# Input field for URL
image_url = st.text_input("Enter the URL of the image:")

# Display image if a valid URL is provided
if image_url:
    try:
        st.image(image_url, caption="Uploaded Image")

        ## CLIP: score the image against every scene label
        image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
        inputs = processor(text=scene_labels, images=image, return_tensors="pt", padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image  # image-text similarity scores
        probs = logits_per_image.softmax(dim=1)      # softmax over labels gives probabilities
        raw_context = scene_labels[probs.argmax(-1).item()]
        context = 'the image is depicting a scene of ' + raw_context

        ## BLIP: generate a natural-language caption for the image
        caption = image_to_text(image_url, max_new_tokens=200)
        initial_caption = caption[0]['generated_text']

        ## Output
        st.write("context: ", context)
        st.write("initial_caption: ", initial_caption)
    except Exception as e:
        st.error(f"Error: {e}")
else:
    st.warning("Please enter a valid image URL.")
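
# Optional: a minimal caching sketch. As written above, the CLIP and BLIP models
# are reloaded on every Streamlit rerun; assuming a recent Streamlit version that
# provides st.cache_resource, the loading could instead be wrapped in a helper
# like the hypothetical load_models() below so the models load once per session.
@st.cache_resource
def load_models(clip_id="openai/clip-vit-large-patch14",
                blip_id="Salesforce/blip-image-captioning-large"):
    # Load the CLIP processor/model and the BLIP captioning pipeline a single
    # time and reuse them across reruns.
    clip_processor = CLIPProcessor.from_pretrained(clip_id)
    clip_model = CLIPModel.from_pretrained(clip_id)
    captioner = pipeline("image-to-text", model=blip_id)
    return clip_processor, clip_model, captioner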