import streamlit as st
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# Title and description
st.title("Image Captioning App")
st.write("This app converts an uploaded image into a text description using the BLIP model.")

# Load model and processor (cached so they are only downloaded/instantiated once)
@st.cache_resource
def load_model():
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    return processor, model

processor, model = load_model()

# Upload image
uploaded_file = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])

if uploaded_file is not None:
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Preprocess the image into model-ready tensors
    inputs = processor(image, return_tensors="pt")

    # Generate the caption (inference); no_grad disables gradient tracking to save memory
    with torch.no_grad():
        generated_ids = model.generate(**inputs)

    # Decode the generated token IDs into a caption string
    generated_text = processor.decode(generated_ids[0], skip_special_tokens=True)

    # Display the generated caption
    st.write("Generated Caption:")
    st.success(generated_text)
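
# Usage note (assuming this script is saved as app.py): launch it with
#   streamlit run app.py
# The imports above require the streamlit, transformers, torch, and Pillow packages.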