File size: 4,226 Bytes
11c68ac
6d9ecb7
4d94a5f
 
95d4bcd
11c68ac
e356bc7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
02de360
 
 
 
abbb301
658e032
 
abbb301
 
 
 
 
 
ce6ebb2
abbb301
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
02de360
2edac28
e356bc7
c8b7141
2edac28
 
e356bc7
 
 
11c68ac
2edac28
 
 
 
02de360
0ba7cdf
 
 
 
 
02de360
99c25ce
 
02de360
 
 
 
437ae23
abbb301
9ce3e9a
ce6ebb2
 
 
 
 
 
 
 
d341d77
abbb301
 
 
 
02de360
7a598bf
 
 
2edac28
 
 
e356bc7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import streamlit as st
import torch
from PIL import Image
import requests
import os


##CLIP
# Candidate scene descriptions scored against the image by CLIP (zero-shot).
# The winning label is interpolated verbatim into the context sentence sent to
# the LLM, so entries must not carry stray leading/trailing whitespace.
scene_labels = [
    'Arrest',
    'Arson',
    'Explosion',
    'public fight',
    'Normal',
    'Road Accident',
    'Robbery',
    'Shooting',
    'Stealing',
    'Vandalism',
    'Suspicious activity',
    'Tailgating',
    'Unauthorized entry',
    'Protest/Demonstration',
    'Drone suspicious activity',
    'Fire/Smoke detection',
    'Medical emergency',
    'Suspicious package/object',
    'Threatening',
    'Attack',
    'Shoplifting',
    'burglary',  # was 'burglary ' (trailing space leaked into the prompt text)
    'distress',
    'assault',
]
from transformers import CLIPProcessor, CLIPModel
model_id = "openai/clip-vit-large-patch14"


@st.cache_resource
def _load_clip(checkpoint):
    """Return the ``(processor, model)`` pair for the CLIP *checkpoint*.

    Streamlit re-executes this script from top to bottom on every widget
    interaction; caching the loaded objects as a shared resource avoids
    re-instantiating the large CLIP checkpoint on each rerun.
    """
    return (
        CLIPProcessor.from_pretrained(checkpoint),
        CLIPModel.from_pretrained(checkpoint),
    )


processor, model = _load_clip(model_id)

##BLIP
from transformers import pipeline


@st.cache_resource
def _load_captioner():
    """Return the BLIP image-to-text pipeline, built once and reused.

    Streamlit re-executes this script on every widget interaction; caching the
    pipeline as a shared resource avoids rebuilding it on each rerun.
    """
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")


image_to_text = _load_captioner()

##LLM
st.info("Please check here- https://ai.google.dev/tutorials/web_quickstart , to get your GOOGLE API KEY. This is mandatory to use this app")
GOOGLE_API_KEY = st.text_input("Please enter your GOOGLE API KEY", type="password")
# Export the key only once the user has actually entered one. The text box is
# empty on first load and the script reruns on every interaction, so writing
# unconditionally would overwrite any key already present in the environment
# with an empty string.
if GOOGLE_API_KEY:
    os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory


# Prompt sent to the LLM. It carries two placeholders, {initial_caption} (the
# raw BLIP caption) and {context} (the sentence built from the CLIP label),
# and asks for a single refined caption as the only output.
_TEMPLATE_LINES = (
    "You are an advanced image captioning AI assistant for surveillance related images.",
    "Your task is to refine and improve an initial image caption using relevant contextual information provided.",
    "You will receive two inputs:",
    "Input 1: {initial_caption} - This is the initial caption for the image, most likely grammatically incorrect",
    "and incomplete sentence, generated by a separate not so good image captioning model.",
    "Input 2: {context} - This is the contextual information that provides more details about the background",
    "Your goal is to take the initial caption and the additional context, and produce a new, refined caption that",
    "incorporates the contextual details.",
    "Please do not speculate things which are not provided. The final caption should be grammatically correct.",
    "Please output only the final caption.",
)
template = "\n".join(_TEMPLATE_LINES)

# Wrap the prompt text together with the names of its two placeholders so the
# pipeline can fill it in with prompt_template.format(...).
_PROMPT_FIELDS = ["initial_caption", "context"]
prompt_template = PromptTemplate(input_variables=_PROMPT_FIELDS, template=template)


# Title
st.title("Image Caption Surveillance")

# Input field for URL
# Surrounding whitespace is stripped so a pasted URL with a stray space or
# newline still resolves, and a whitespace-only entry counts as "no URL"
# instead of triggering a request that is bound to fail.
image_url = st.text_input("Enter the URL of the image:").strip()




def _fetch_image(url, timeout=15):
    """Download the image at *url* once and return it as an RGB PIL image.

    The response body is read through ``response.content`` (fully decoded by
    ``requests``) rather than ``response.raw``, the request is bounded by
    *timeout* seconds, and HTTP error statuses raise instead of being handed
    to PIL as if they were image bytes.

    Raises:
        requests.RequestException: connection failure, timeout or error status.
        PIL.UnidentifiedImageError: the payload is not a readable image.
    """
    from io import BytesIO  # local import keeps this block self-contained

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert("RGB")


def _classify_scene(image):
    """Return the entry of ``scene_labels`` that CLIP scores highest for *image*."""
    inputs = processor(text=scene_labels, images=image, return_tensors="pt", padding=True)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
    # logits_per_image holds the image-text similarity scores. Softmax is
    # monotonic, so the arg-max of the logits is also the most probable label.
    best = outputs.logits_per_image.argmax(dim=-1).item()  # plain int index
    return scene_labels[best]


def _build_llm(api_key):
    """Create the Gemini chat model used to refine the caption."""
    # NOTE(review): confirm "gemini-1.0-pro" is still a served model id.
    # Surveillance captions routinely describe violence, so the safety filters
    # are set to BLOCK_NONE for the four listed categories.
    return ChatGoogleGenerativeAI(
        model="gemini-1.0-pro",
        google_api_key=api_key,
        temperature=0.2,
        top_p=1,
        top_k=1,
        safety_settings={
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        },
    )


# Display image and caption it if a URL is provided
if image_url:
    # Prefer the key typed into the form; fall back to one already exported.
    api_key = GOOGLE_API_KEY or os.environ.get("GOOGLE_API_KEY", "")
    if not api_key:
        # Fail fast with a clear message instead of running both vision
        # models and then erroring inside the LLM call.
        st.warning("Please enter your GOOGLE API KEY to generate a caption.")
    else:
        # Top-level UI boundary: any failure is reported to the user.
        try:
            # One download feeds the preview, CLIP and BLIP.
            image = _fetch_image(image_url)
            st.image(image, caption="Uploaded Image")

            ##CLIP
            context = 'the image is depicting scene of ' + _classify_scene(image)

            ##BLIP
            caption = image_to_text(image, max_new_tokens=200)
            initial_caption = caption[0]['generated_text']

            ##LLM
            prompt = prompt_template.format(initial_caption=initial_caption, context=context)
            final_caption = _build_llm(api_key).invoke(prompt).content

            ##Output
            st.write("CAPTION: ", final_caption)
        except Exception as e:
            st.error(f"Error: {e}")
else:
    st.warning("Please enter a valid image URL.")