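# Streamlit app: given an image URL, BLIP generates a caption and CLIP
# classifies the scene against a fixed set of surveillance labels.
# Typical launch command (assuming this file is saved as app.py):
#   streamlit run app.py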
import requests
import streamlit as st
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor, pipeline


## CLIP: candidate scene labels for zero-shot classification
scene_labels = [
    'Arrest',
    'Arson',
    'Explosion',
    'public fight',
    'Normal',
    'Road Accident',
    'Robbery',
    'Shooting',
    'Stealing',
    'Vandalism',
    'Suspicious activity',
    'Tailgating',
    'Unauthorized entry',
    'Protest/Demonstration',
    'Drone suspicious activity',
    'Fire/Smoke detection',
    'Medical emergency',
    'Suspicious package/object',
    'Threatening',
    'Attack',
    'Shoplifting',
    'burglary',
    'distress',
    'assault',
]
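# Note: CLIP zero-shot accuracy often improves when labels are wrapped in a
# prompt template such as "a photo of {label}"; the raw labels are used as-is here.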
## Model loading, cached so the heavy weights are fetched and loaded only once per session
@st.cache_resource
def load_models():
    # CLIP for zero-shot scene classification
    model_id = "openai/clip-vit-large-patch14"
    processor = CLIPProcessor.from_pretrained(model_id)
    model = CLIPModel.from_pretrained(model_id)
    # BLIP for image captioning
    image_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    return processor, model, image_to_text

processor, model, image_to_text = load_models()


# Title
st.title("Image Caption Surveillance")

# Input field for the image URL
image_url = st.text_input("Enter the URL of the image:")

# Display the image and run both models if a URL is provided
if image_url:
    try:
        st.image(image_url, caption="Uploaded Image")

        # Download the image once and reuse it for both models
        response = requests.get(image_url, stream=True, timeout=10)
        response.raise_for_status()
        image = Image.open(response.raw).convert("RGB")

        ## CLIP: score the image against every scene label
        inputs = processor(text=scene_labels, images=image, return_tensors="pt", padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image  # image-text similarity scores
        probs = logits_per_image.softmax(dim=1)      # softmax over labels gives probabilities
        raw_context = scene_labels[probs.argmax(dim=-1).item()]
        context = "the image depicts a scene of " + raw_context

        ## BLIP: caption the already-downloaded image
        caption = image_to_text(image, max_new_tokens=200)
        initial_caption = caption[0]["generated_text"]

        ## Output
        st.write("Context:", context)
        st.write("Initial caption:", initial_caption)
    except Exception as e:
        st.error(f"Error: {e}")
else:
    st.warning("Please enter a valid image URL.")
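
# For reference, the image-to-text pipeline returns a list of dicts, e.g.
# [{"generated_text": "a man riding a bike down a street"}] (illustrative
# example; the actual text depends on the input image).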