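# Streamlit app: given an image URL, BLIP generates a caption and CLIP
# classifies the scene against a fixed set of surveillance labels.
# Typical launch command (assuming this file is saved as app.py):
#   streamlit run app.py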
import requests
import streamlit as st
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor, pipeline


## CLIP: candidate scene labels for zero-shot classification
scene_labels = [
    'Arrest',
    'Arson',
    'Explosion',
    'public fight',
    'Normal',
    'Road Accident',
    'Robbery',
    'Shooting',
    'Stealing',
    'Vandalism',
    'Suspicious activity',
    'Tailgating',
    'Unauthorized entry',
    'Protest/Demonstration',
    'Drone suspicious activity',
    'Fire/Smoke detection',
    'Medical emergency',
    'Suspicious package/object',
    'Threatening',
    'Attack',
    'Shoplifting',
    'burglary',
    'distress',
    'assault',
]
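# Note: CLIP zero-shot accuracy often improves when labels are wrapped in a
# prompt template such as "a photo of {label}"; the raw labels are used as-is here.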
## Model loading, cached so the heavy weights are fetched and loaded only once per session
@st.cache_resource
def load_models():
    # CLIP for zero-shot scene classification
    model_id = "openai/clip-vit-large-patch14"
    processor = CLIPProcessor.from_pretrained(model_id)
    model = CLIPModel.from_pretrained(model_id)
    # BLIP for image captioning
    image_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    return processor, model, image_to_text

processor, model, image_to_text = load_models()


# Title
st.title("Image Caption Surveillance")

# Input field for the image URL
image_url = st.text_input("Enter the URL of the image:")

# Display the image and run both models if a URL is provided
if image_url:
    try:
        st.image(image_url, caption="Uploaded Image")

        # Download the image once and reuse it for both models
        response = requests.get(image_url, stream=True, timeout=10)
        response.raise_for_status()
        image = Image.open(response.raw).convert("RGB")

        ## CLIP: score the image against every scene label
        inputs = processor(text=scene_labels, images=image, return_tensors="pt", padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image  # image-text similarity scores
        probs = logits_per_image.softmax(dim=1)      # softmax over labels gives probabilities
        raw_context = scene_labels[probs.argmax(dim=-1).item()]
        context = "the image depicts a scene of " + raw_context

        ## BLIP: caption the already-downloaded image
        caption = image_to_text(image, max_new_tokens=200)
        initial_caption = caption[0]["generated_text"]

        ## Output
        st.write("Context:", context)
        st.write("Initial caption:", initial_caption)
    except Exception as e:
        st.error(f"Error: {e}")
else:
    st.warning("Please enter a valid image URL.")
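
# For reference, the image-to-text pipeline returns a list of dicts, e.g.
# [{"generated_text": "a man riding a bike down a street"}] (illustrative
# example; the actual text depends on the input image).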