Spaces:

ChandraP12330
/

image-caption

Runtime error

App Files Files Community

image-caption / app.py

ChandraP12330

Update app.py

9ce3e9a verified 5 months ago

raw

history blame contribute delete

No virus

4.23 kB

	import streamlit as st
	import torch
	from PIL import Image
	import requests
	import os


	## CLIP
	# Candidate scene descriptions; CLIP scores the input image against each of
	# these texts and the best-scoring one is used as context for the LLM prompt.
	scene_labels = [
	    'Arrest',
	    'Arson',
	    'Explosion',
	    'public fight',
	    'Normal',
	    'Road Accident',
	    'Robbery',
	    'Shooting',
	    'Stealing',
	    'Vandalism',
	    'Suspicious activity',
	    'Tailgating',
	    'Unauthorized entry',
	    'Protest/Demonstration',
	    'Drone suspicious activity',
	    'Fire/Smoke detection',
	    'Medical emergency',
	    'Suspicious package/object',
	    'Threatening',
	    'Attack',
	    'Shoplifting',
	    'burglary',
	    'distress',
	    'assault',
	]
	from transformers import CLIPProcessor, CLIPModel

	model_id = "openai/clip-vit-large-patch14"


	@st.cache_resource
	def _load_clip(checkpoint):
	    """Return the (processor, model) pair for the given CLIP checkpoint.

	    Streamlit re-executes this script on every widget interaction, so the
	    result is cached to avoid re-instantiating the weights on each rerun.
	    """
	    return (
	        CLIPProcessor.from_pretrained(checkpoint),
	        CLIPModel.from_pretrained(checkpoint),
	    )


	processor, model = _load_clip(model_id)

	## BLIP
	from transformers import pipeline


	@st.cache_resource
	def _load_captioner():
	    """Build the BLIP image-to-text pipeline.

	    Cached so the pipeline is constructed once rather than on every
	    Streamlit rerun of this script.
	    """
	    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")


	image_to_text = _load_captioner()

	## LLM
	# Tell the user where to obtain a key, then collect it through a masked input.
	st.info(
	    "Please check here- https://ai.google.dev/tutorials/web_quickstart , to get your GOOGLE API KEY. This is mandatory to use this app"
	)
	GOOGLE_API_KEY = st.text_input(label="Please enter your GOOGLE API KEY", type="password")

	# Also publish the entered value through the environment.
	# NOTE(review): os.environ is process-wide, so this value is visible to
	# everything else running in this interpreter.
	os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

	from langchain_google_genai import ChatGoogleGenerativeAI
	from langchain.prompts import PromptTemplate
	from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory


	# Instruction text sent to the LLM. It has two placeholders:
	#   {initial_caption} - the caption produced by the captioning model
	#   {context}         - the scene description derived from the classifier
	template="""You are an advanced image captioning AI assistant for surveillance related images.
	Your task is to refine and improve an initial image caption using relevant contextual information provided.
	You will receive two inputs:
	Input 1: {initial_caption} - This is the initial caption for the image, most likely grammatically incorrect
	and incomplete sentence, generated by a separate not so good image captioning model.
	Input 2: {context} - This is the contextual information that provides more details about the background
	Your goal is to take the initial caption and the additional context, and produce a new, refined caption that
	incorporates the contextual details.
	Please do not speculate things which are not provided. The final caption should be grammatically correct.
	Please output only the final caption."""

	# Wrap the text in a PromptTemplate, declaring both placeholders explicitly.
	prompt_template = PromptTemplate(input_variables=["initial_caption", "context"], template=template)


	# Page heading
	st.title("Image Caption Surveillance")

	# Text box where the user supplies the image location; an empty string
	# means nothing has been entered yet.
	image_url = st.text_input(label="Enter the URL of the image:")




	# Caption the image once both a URL and an API key have been entered.
	# Flow: download image -> CLIP picks the best-matching scene label ->
	# BLIP produces a raw caption -> Gemini rewrites the caption using the label.
	from io import BytesIO

	if not image_url:
	    st.warning("Please enter a valid image URL.")
	elif not GOOGLE_API_KEY:
	    # Skip running the two large vision models when the LLM call cannot succeed.
	    st.warning("Please enter your GOOGLE API KEY to generate a caption.")
	else:
	    # Any failure below (network, image decoding, model, LLM) is reported in
	    # the UI instead of crashing the app.
	    try:
	        st.image(image_url, caption="Uploaded Image")

	        # Download once, fail fast on HTTP errors or a hung server, and reuse
	        # the decoded image for both CLIP and BLIP.
	        response = requests.get(image_url, timeout=30)
	        response.raise_for_status()
	        image = Image.open(BytesIO(response.content)).convert("RGB")

	        ## CLIP
	        inputs = processor(text=scene_labels, images=image, return_tensors="pt", padding=True)
	        # Inference only: no autograd graph is needed.
	        with torch.no_grad():
	            outputs = model(**inputs)
	        logits_per_image = outputs.logits_per_image  # image-text similarity scores
	        probs = logits_per_image.softmax(dim=1)  # softmax over the labels
	        best_label_index = probs.argmax(dim=-1).item()  # plain int for list indexing
	        raw_context = scene_labels[best_label_index]
	        context = 'the image is depicting scene of ' + raw_context

	        ## BLIP
	        caption = image_to_text(image, max_new_tokens=200)
	        initial_caption = caption[0]['generated_text']

	        ## LLM
	        llm = ChatGoogleGenerativeAI(
	            model="gemini-1.0-pro",
	            google_api_key=GOOGLE_API_KEY,
	            temperature=0.2,
	            top_p=1,
	            top_k=1,
	            # Block thresholds are set to BLOCK_NONE for all four categories.
	            safety_settings={
	                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
	                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
	                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
	                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
	            },
	        )

	        prompt = prompt_template.format(initial_caption=initial_caption, context=context)
	        response = llm.invoke(prompt)
	        final_caption = response.content

	        ## Output
	        st.write("CAPTION: ", final_caption)
	    except Exception as e:
	        st.error(f"Error: {e}")