# Spaces: Sleeping
from PIL import Image, ImageDraw | |
from helper import summarize_predictions_natural_language, render_results_in_image | |
from transformers import pipeline | |
from tokenizers import Tokenizer, Encoding | |
from tokenizers import decoders | |
from tokenizers import models | |
from tokenizers import normalizers | |
from tokenizers import pre_tokenizers | |
from tokenizers import processors | |
# Object-detection pipeline, created once at import time so that every call
# to get_predictions reuses the same pipeline object.
object_detection_pipe = pipeline("object-detection", model="facebook/detr-resnet-50")

# Text-to-speech pipeline, likewise created once at import time; it is used
# to narrate the natural-language summary of the detections.
tts_pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
def get_predictions(uploaded_image):
    """Detect objects in an image, summarize them, and narrate the summary.

    Args:
        uploaded_image: The image source; it is passed unchanged to
            ``PIL.Image.open``.

    Returns:
        A 3-tuple ``(processed_image, text, audio)`` where
        ``processed_image`` is whatever ``render_results_in_image`` returns
        for the opened image and the detections, ``text`` is the result of
        ``summarize_predictions_natural_language`` for the detections, and
        ``audio`` is the first element of the text-to-speech output's
        ``"audio"`` entry.
    """
    pil_image = Image.open(uploaded_image)

    # Perform object detection on the opened image.
    pipeline_output = object_detection_pipe(pil_image)
    processed_image = render_results_in_image(pil_image, pipeline_output)

    # Summarize the detections as natural-language text.
    text = summarize_predictions_natural_language(pipeline_output)

    # Generate audio from the summary text.
    narrated_text = tts_pipe(text)

    # NOTE(review): only the first element of the "audio" entry is returned;
    # presumably that is the waveform for the single input text, and the
    # sampling rate is not passed on -- confirm the caller does not need it.
    return processed_image, text, narrated_text["audio"][0]