"""Streamlit app: zero-shot surveillance scene classification with CLIP.

The user enters an image URL; the image is downloaded, scored against a fixed
list of scene labels with CLIP, and the best-matching label is displayed.
"""

import io

import requests
import streamlit as st
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

##CLIP
# Candidate scene descriptions scored against the image (zero-shot labels).
scene_labels = [
    'Arrest', 'Arson', 'Explosion', 'public fight', 'Normal', 'Road Accident',
    'Robbery', 'Shooting', 'Stealing', 'Vandalism', 'Suspicious activity',
    'Tailgating', 'Unauthorized entry', 'Protest/Demonstration',
    'Drone suspicious activity', 'Fire/Smoke detection', 'Medical emergency',
    'Suspicious package/object', 'Threatening', 'Attack', 'Shoplifting',
    'burglary ', 'distress', 'assault',
]

model_id = "openai/clip-vit-large-patch14"

# Seconds to wait for the remote server before giving up on the download.
_REQUEST_TIMEOUT_S = 15


@st.cache_resource
def _load_clip(name):
    """Load the CLIP processor and model once per server process.

    Streamlit re-executes the whole script on every widget interaction;
    caching keeps the checkpoint from being reloaded on each rerun.

    Args:
        name: Hugging Face model identifier.

    Returns:
        A ``(processor, model)`` tuple with the model in eval mode.
    """
    clip_processor = CLIPProcessor.from_pretrained(name)
    clip_model = CLIPModel.from_pretrained(name)
    clip_model.eval()
    return clip_processor, clip_model


def _fetch_image(url):
    """Download *url* and decode it as an RGB PIL image.

    NOTE(review): the URL is user-supplied and fetched server-side; restrict
    allowed hosts if this app is exposed beyond trusted users.

    Raises:
        requests.RequestException: on network failure, timeout, bad URL or a
            non-2xx HTTP status.
        OSError: if the payload is not a decodable image.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT_S)
    response.raise_for_status()
    # Decode from the fully-read body: ``response.raw`` would bypass
    # Content-Encoding handling (e.g. gzip) and hand PIL compressed bytes.
    return Image.open(io.BytesIO(response.content)).convert("RGB")


def _classify_scene(image, labels):
    """Return the entry of *labels* that CLIP scores highest for *image*."""
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only; skip building the autograd graph
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # image-text similarity scores
    probs = logits_per_image.softmax(dim=1)  # label probabilities for the single image
    # One image in the batch, so argmax yields a single-element tensor;
    # convert it to a plain int before indexing the Python list.
    best = int(probs.argmax(dim=-1).item())
    return labels[best]


processor, model = _load_clip(model_id)

# Title
st.title("Image Caption Surveillance")

# Input field for URL
image_url = st.text_input("Enter the URL of the image:")

# Display and classify the image if a URL is provided
if image_url:
    try:
        image = _fetch_image(image_url)
        # Show the exact image that is classified (avoids a second fetch).
        st.image(image, caption="Uploaded Image")
        context = _classify_scene(image, scene_labels)
        st.write("context: ", context)
    except Exception as e:  # top-level UI boundary: report instead of crashing
        st.error(f"Error: {e}")
else:
    st.warning("Please enter a valid image URL.")