Monius committed on
Commit
b7c7aa0
β€’
1 Parent(s): 2f26dc4

init by <M0n-ius>

Browse files
Files changed (9)
  1. .gitignore +164 -0
  2. Dockerfile +10 -0
  3. README.md +5 -3
  4. app.py +6 -0
  5. constraint.py +9 -0
  6. run.py +57 -0
  7. utils/__init__.py +2 -0
  8. utils/azure.py +36 -0
  9. utils/video.py +62 -0
.gitignore ADDED
@@ -0,0 +1,164 @@
+ .DS_Store
+ .vscode/
+ flagged/
+
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ test/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
Dockerfile ADDED
@@ -0,0 +1,10 @@
+ FROM monius/docker-ai-infra
+
+ USER infra
+
+ COPY --chown=infra . /opt/run/
+ RUN echo "av" > /opt/run/ai-infra.txt
+ RUN cat /opt/run/ai-infra.txt
+
+ CMD ["ai-infra"]
+
README.md CHANGED
@@ -1,11 +1,13 @@
  ---
  title: AIL Caption
- emoji: 🐒
+ emoji: 🧙‍♀️🪄🌟🎭
  colorFrom: pink
  colorTo: blue
  sdk: docker
- pinned: false
+ pinned: False
  license: apache-2.0
+ app_file: app.py
+ app_port: 9100
+ short_description: All-in video caption
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,6 @@
+ from fastapi import FastAPI
+ import gradio as gr
+ from run import Core
+
+ app = FastAPI()
+ app = gr.mount_gradio_app(app, Core, path="/")
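Not part of the commit: a minimal local-run sketch for the mounted app, assuming uvicorn is installed. The port mirrors the app_port (9100) declared in README.md; adjust host and port as needed.

# Local-run sketch (assumption: uvicorn is available; port mirrors README.md's app_port).
import uvicorn
from app import app

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=9100)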
constraint.py ADDED
@@ -0,0 +1,9 @@
+ SYS_PROMPT = """Create a detailed and accurate video description, starting from a specific scene and possibly transitioning through various themes and settings. Begin by describing the initial scene in detail, including the environment, key objects, any characters or creatures and their actions, and the overall atmosphere, considering specific aspects such as shot sizes (extreme close-up, close-up, medium, full, wide, etc.), camera movements (push, pull, shake, pan, tilt, rise, descend, etc.), and more. For example, if the scene involves a person like a young man sitting on a chair reading a book, describe his appearance and the surrounding environment, including basic features such as the character's gender, age, race, etc., as well as actions, emotions, dialogues, and performance content. If the scene includes animals or natural elements such as cats, the sky, or landscapes, vividly describe these elements and their behaviors or states, and consider the emotions and thematic elements introduced in this opening scene. Then, as the video progresses, describe the evolving visual effects, how they present a more vivid and rich picture through camera movements and special effects, considering aesthetics (style, tone, color palette, atmosphere, emotions, etc.). If the scene changes, explain how it transitions, what new elements are introduced, whether the atmosphere remains consistent or changes, and how this affects the overall narrative or theme of the video. If the video contains multiple scenes, describe the connections between them, whether creating a story, presenting a contrast, or highlighting different aspects of a theme, considering scenes (day, night, indoor, outdoor, etc.), props (relationship with characters and scenes, relationship with camera and scheduling), and scene scheduling (single character, multiple characters with camera and narrative association, and how they relate to scene props). Finally, conclude with a summary that encapsulates the essence of the video, combining all the described elements into a cohesive narrative or message, emphasizing the sensory and emotional experience provided by the video, and speculating on the impact or message intended for the audience, allowing viewers to engage in profound reflection and insight during the viewing process, thus achieving a deeper impact. The generated description should adhere to English grammar and be no less than 120 words in length.
+ """
+
+ USER_PROMPT = "Here are the frames from the video."
+
+ SKIP = 2
+ TEMP = 0.3
+ TOP = 0.75
+ MAX_TOKEN = 512
run.py ADDED
@@ -0,0 +1,57 @@
+ # run.py
+ import gradio as gr
+ from utils import VideoProcessor, AzureAPI
+ from constraint import SYS_PROMPT, USER_PROMPT
+
+ def process_caption(prompt, temp, top_p, max_tokens, model, key, endpoint, video, frame_format, frame_skip, group_size):
+     processor = VideoProcessor(frame_format=frame_format, frame_skip=frame_skip, group_size=group_size)
+     frames = processor.decode(video)
+     concatenated_images = processor.group_and_concatenate(frames)
+     base64_list = processor.to_base64_list(concatenated_images)
+     debug_image = processor.concatenate(concatenated_images, "vertical")
+
+     if not key or not endpoint:
+         return "", f"API key or endpoint is missing. Processed {len(frames)} frames.", debug_image
+
+     api = AzureAPI(key=key, endpoint=endpoint, model=model, temp=temp, top_p=top_p, max_tokens=max_tokens)
+     caption = api.get_caption(prompt, USER_PROMPT, base64_list)
+     return f"{caption}", f"Using model '{model}' with {len(frames)} frames extracted.", debug_image
+
+ with gr.Blocks() as Core:
+     with gr.Row(variant="panel"):
+         with gr.Column(scale=6):
+             with gr.Accordion("Debug", open=False):
+                 info = gr.Textbox(label="Info", interactive=False)
+                 frame = gr.Image(label="Frame", interactive=False)
+             with gr.Accordion("Configuration", open=False):
+                 with gr.Row():
+                     temp = gr.Slider(0, 1, 0.3, step=0.1, label="Temperature")
+                     top_p = gr.Slider(0, 1, 0.75, step=0.1, label="Top-P")
+                     max_tokens = gr.Slider(512, 4096, 3000, step=1, label="Max Tokens")
+                 with gr.Row():
+                     frame_format = gr.Dropdown(label="Frame Format", value="JPEG", choices=["JPEG", "PNG"], interactive=False)
+                     frame_skip = gr.Slider(2, 100, 2, step=1, label="Frame Skip")
+                     group_size = gr.Slider(1, 100, 10, step=1, label="Group Size")
+             prompt = gr.Textbox(SYS_PROMPT, label="Prompt", lines=10, max_lines=100, show_copy_button=True)
+             result = gr.Textbox(label="Result", lines=15, max_lines=100, show_copy_button=True, interactive=False)
+         with gr.Column(scale=2):
+             with gr.Column():
+                 with gr.Accordion("Model Provider", open=True):
+                     with gr.Tabs():
+                         with gr.Tab("Azure"):
+                             model = gr.Dropdown(label="Model", value="GPT-4o", choices=["GPT-4o", "GPT-4v"], interactive=False)
+                             key = gr.Textbox(label="Azure API Key")
+                             endpoint = gr.Textbox(label="Azure Endpoint")
+                 with gr.Accordion("Data Source", open=True):
+                     with gr.Tabs():
+                         with gr.Tab("Upload"):
+                             video = gr.Video(sources="upload", show_label=False, show_share_button=False, mirror_webcam=False)
+                 caption_button = gr.Button("Caption", variant="primary", size="lg")
+                 caption_button.click(
+                     process_caption,
+                     inputs=[prompt, temp, top_p, max_tokens, model, key, endpoint, video, frame_format, frame_skip, group_size],
+                     outputs=[result, info, frame]
+                 )
+
+ if __name__ == "__main__":
+     Core.launch()
utils/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .azure import AzureAPI
+ from .video import VideoProcessor
utils/azure.py ADDED
@@ -0,0 +1,36 @@
+ # utils/azure.py
+ import requests
+ from typing import List
+
+ class AzureAPI:
+     def __init__(self, key: str, endpoint: str, model: str, temp: float = 0.3, top_p: float = 0.75, max_tokens: int = 1024):
+         self.key = key
+         self.endpoint = endpoint
+         self.model = model
+         self.temp = temp
+         self.top_p = top_p
+         self.max_tokens = max_tokens
+         self.version = "2024-02-15-preview"
+
+     def get_caption(self, prompt: str, user_prompt: str, images_base64: List[str]) -> str:
+         headers = {
+             'content-type': 'application/json',
+             "api-key": self.key,
+         }
+         system_msg = {"role": "system", "content": prompt}
+         user_msg = [{"type": "text", "text": user_prompt}]
+         img_msg = [
+             {"type": "image_url", "image_url": {"url": f'data:image/jpg;base64,{image}'}}
+             for image in images_base64
+         ]
+         payload = {
+             'messages': [system_msg, {"role": "user", "content": user_msg + img_msg}],
+             'temperature': self.temp,
+             'top_p': self.top_p,
+             'max_tokens': self.max_tokens,
+             'model': self.model
+         }
+         url = f'{self.endpoint}/openai/deployments/{self.model}/chat/completions?api-version={self.version}'
+         response = requests.post(url, headers=headers, json=payload)
+         response.raise_for_status()
+         return response.json()['choices'][0]['message']['content']
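Not part of the commit: a minimal usage sketch of AzureAPI. The key, endpoint, and image list below are placeholders, and a GPT-4o deployment on the Azure OpenAI resource is assumed.

# Usage sketch (placeholder credentials; an existing Azure OpenAI GPT-4o deployment is assumed).
from utils import AzureAPI
from constraint import SYS_PROMPT, USER_PROMPT

api = AzureAPI(key="<azure-api-key>", endpoint="https://<resource>.openai.azure.com", model="GPT-4o")
images_base64 = []  # base64-encoded frames, e.g. from VideoProcessor.to_base64_list()
caption = api.get_caption(SYS_PROMPT, USER_PROMPT, images_base64)
print(caption)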
utils/video.py ADDED
@@ -0,0 +1,62 @@
+ # utils/video.py
+ from io import BytesIO
+ import av
+ import base64
+ from PIL import Image
+ from typing import List
+ from dataclasses import dataclass
+
+ @dataclass
+ class VideoProcessor:
+     frame_format: str = "JPEG"
+     frame_skip: int = 10
+     group_size: int = 10
+
+     def decode(self, video_path: str) -> List[Image.Image]:
+         frames = []
+         container = av.open(video_path)
+         for i, frame in enumerate(container.decode(video=0)):
+             if i % self.frame_skip:
+                 continue
+             im = frame.to_image()
+             frames.append(im)
+         return frames
+
+     def concatenate(self, frames: List[Image.Image], direction: str = "horizontal") -> Image.Image:
+         widths, heights = zip(*(frame.size for frame in frames))
+
+         if direction == "horizontal":
+             total_width = sum(widths)
+             max_height = max(heights)
+             concatenated_image = Image.new('RGB', (total_width, max_height))
+             x_offset = 0
+             for frame in frames:
+                 concatenated_image.paste(frame, (x_offset, 0))
+                 x_offset += frame.width
+         else:
+             max_width = max(widths)
+             total_height = sum(heights)
+             concatenated_image = Image.new('RGB', (max_width, total_height))
+             y_offset = 0
+             for frame in frames:
+                 concatenated_image.paste(frame, (0, y_offset))
+                 y_offset += frame.height
+
+         return concatenated_image
+
+     def group_and_concatenate(self, frames: List[Image.Image], limit=10) -> List[Image.Image]:
+         # Guard against a zero chunk size (and a zero range step) when fewer frames than group_size were decoded.
+         xs = max(1, len(frames) // self.group_size)
+         groups = [frames[i:i + xs] for i in range(0, len(frames), xs)]
+         sampled_groups = []
+         for group in groups:
+             interval = max(1, len(group) // limit)
+             sampled_groups.append([group[i] for i in range(0, len(group), interval)])
+         return [self.concatenate(group) for group in sampled_groups]
+
+     def to_base64_list(self, images: List[Image.Image]) -> List[str]:
+         base64_list = []
+         for image in images:
+             buffered = BytesIO()
+             image.save(buffered, format=self.frame_format)
+             base64_list.append(base64.b64encode(buffered.getvalue()).decode('utf-8'))
+         return base64_list
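Not part of the commit: a minimal offline sketch of the frame pipeline that run.py drives from the UI. The video path is a placeholder, and the av (PyAV) and Pillow packages are assumed to be installed.

# Frame-pipeline sketch (placeholder video path; requires the av and Pillow packages).
from utils import VideoProcessor

processor = VideoProcessor(frame_format="JPEG", frame_skip=2, group_size=10)
frames = processor.decode("sample.mp4")            # keep every 2nd decoded frame
grids = processor.group_and_concatenate(frames)    # one horizontal strip per group
payload = processor.to_base64_list(grids)          # base64 strings for the API call
print(len(frames), len(grids), len(payload))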