Monius committed on
Commit
b7c7aa0
β€’
1 Parent(s): 2f26dc4

init by <M0n-ius>

Browse files
Files changed (9)
  1. .gitignore +164 -0
  2. Dockerfile +10 -0
  3. README.md +5 -3
  4. app.py +6 -0
  5. constraint.py +9 -0
  6. run.py +57 -0
  7. utils/__init__.py +2 -0
  8. utils/azure.py +36 -0
  9. utils/video.py +62 -0
.gitignore ADDED
@@ -0,0 +1,164 @@
+ .DS_Store
+ .vscode/
+ flagged/
+
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ test/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
Dockerfile ADDED
@@ -0,0 +1,10 @@
+ FROM monius/docker-ai-infra
+
+ USER infra
+
+ COPY --chown=infra . /opt/run/
+ RUN echo "av" > /opt/run/ai-infra.txt
+ RUN cat /opt/run/ai-infra.txt
+
+ CMD ["ai-infra"]
+
README.md CHANGED
@@ -1,11 +1,13 @@
  ---
  title: AIL Caption
- emoji: 🐒
+ emoji: 🧙‍♀️🪄🌟🎭
  colorFrom: pink
  colorTo: blue
  sdk: docker
- pinned: false
+ pinned: False
  license: apache-2.0
+ app_file: app.py
+ app_port: 9100
+ short_description: All-in video caption
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,6 @@
+ from fastapi import FastAPI
+ import gradio as gr
+ from run import Core
+
+ app = FastAPI()
+ app = gr.mount_gradio_app(app, Core, path="/")
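Not part of the commit: a minimal local-run sketch for the mounted app, assuming uvicorn is installed. The port mirrors the app_port (9100) declared in README.md; adjust host and port as needed.

# Local-run sketch (assumption: uvicorn is available; port mirrors README.md's app_port).
import uvicorn
from app import app

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=9100)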
constraint.py ADDED
@@ -0,0 +1,9 @@
+ SYS_PROMPT = """Create a detailed and accurate video description, starting from a specific scene and possibly transitioning through various themes and settings. Begin by describing the initial scene in detail, including the environment, key objects, any characters or creatures and their actions, and the overall atmosphere, considering specific aspects such as shot sizes (extreme close-up, close-up, medium, full, wide, etc.), camera movements (push, pull, shake, pan, tilt, rise, descend, etc.), and more. For example, if the scene involves a person like a young man sitting on a chair reading a book, describe his appearance and the surrounding environment, including basic features such as the character's gender, age, race, etc., as well as actions, emotions, dialogues, and performance content. If the scene includes animals or natural elements such as cats, the sky, or landscapes, vividly describe these elements and their behaviors or states, and consider the emotions and thematic elements introduced in this opening scene. Then, as the video progresses, describe the evolving visual effects, how they present a more vivid and rich picture through camera movements and special effects, considering aesthetics (style, tone, color palette, atmosphere, emotions, etc.). If the scene changes, explain how it transitions, what new elements are introduced, whether the atmosphere remains consistent or changes, and how this affects the overall narrative or theme of the video. If the video contains multiple scenes, describe the connections between them, whether creating a story, presenting a contrast, or highlighting different aspects of a theme, considering scenes (day, night, indoor, outdoor, etc.), props (relationship with characters and scenes, relationship with camera and scheduling), and scene scheduling (single character, multiple characters with camera and narrative association, and how they relate to scene props). Finally, conclude with a summary that encapsulates the essence of the video, combining all the described elements into a cohesive narrative or message, emphasizing the sensory and emotional experience provided by the video, and speculating on the impact or message intended for the audience, allowing viewers to engage in profound reflection and insight during the viewing process, thus achieving a deeper impact. The generated description should adhere to English grammar and be no less than 120 words in length.
+ """
+
+ USER_PROMPT = "Here are the frames from the video."
+
+ SKIP = 2
+ TEMP = 0.3
+ TOP = 0.75
+ MAX_TOKEN = 512
run.py ADDED
@@ -0,0 +1,57 @@
+ # run.py
+ import gradio as gr
+ from utils import VideoProcessor, AzureAPI
+ from constraint import SYS_PROMPT, USER_PROMPT
+
+ def process_caption(prompt, temp, top_p, max_tokens, model, key, endpoint, video, frame_format, frame_skip, group_size):
+     processor = VideoProcessor(frame_format=frame_format, frame_skip=frame_skip, group_size=group_size)
+     frames = processor.decode(video)
+     concatenated_images = processor.group_and_concatenate(frames)
+     base64_list = processor.to_base64_list(concatenated_images)
+     debug_image = processor.concatenate(concatenated_images, "vertical")
+
+     if not key or not endpoint:
+         return "", f"API key or endpoint is missing. Processed {len(frames)} frames.", debug_image
+
+     api = AzureAPI(key=key, endpoint=endpoint, model=model, temp=temp, top_p=top_p, max_tokens=max_tokens)
+     caption = api.get_caption(prompt, USER_PROMPT, base64_list)
+     return f"{caption}", f"Using model '{model}' with {len(frames)} frames extracted.", debug_image
+
+ with gr.Blocks() as Core:
+     with gr.Row(variant="panel"):
+         with gr.Column(scale=6):
+             with gr.Accordion("Debug", open=False):
+                 info = gr.Textbox(label="Info", interactive=False)
+                 frame = gr.Image(label="Frame", interactive=False)
+             with gr.Accordion("Configuration", open=False):
+                 with gr.Row():
+                     temp = gr.Slider(0, 1, 0.3, step=0.1, label="Temperature")
+                     top_p = gr.Slider(0, 1, 0.75, step=0.1, label="Top-P")
+                     max_tokens = gr.Slider(512, 4096, 3000, step=1, label="Max Tokens")
+                 with gr.Row():
+                     frame_format = gr.Dropdown(label="Frame Format", value="JPEG", choices=["JPEG", "PNG"], interactive=False)
+                     frame_skip = gr.Slider(2, 100, 2, step=1, label="Frame Skip")
+                     group_size = gr.Slider(1, 100, 10, step=1, label="Group Size")
+             prompt = gr.Textbox(SYS_PROMPT, label="Prompt", lines=10, max_lines=100, show_copy_button=True)
+             result = gr.Textbox(label="Result", lines=15, max_lines=100, show_copy_button=True, interactive=False)
+         with gr.Column(scale=2):
+             with gr.Column():
+                 with gr.Accordion("Model Provider", open=True):
+                     with gr.Tabs():
+                         with gr.Tab("Azure"):
+                             model = gr.Dropdown(label="Model", value="GPT-4o", choices=["GPT-4o", "GPT-4v"], interactive=False)
+                             key = gr.Textbox(label="Azure API Key")
+                             endpoint = gr.Textbox(label="Azure Endpoint")
+                 with gr.Accordion("Data Source", open=True):
+                     with gr.Tabs():
+                         with gr.Tab("Upload"):
+                             video = gr.Video(sources="upload", show_label=False, show_share_button=False, mirror_webcam=False)
+                 caption_button = gr.Button("Caption", variant="primary", size="lg")
+                 caption_button.click(
+                     process_caption,
+                     inputs=[prompt, temp, top_p, max_tokens, model, key, endpoint, video, frame_format, frame_skip, group_size],
+                     outputs=[result, info, frame]
+                 )
+
+ if __name__ == "__main__":
+     Core.launch()
utils/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .azure import AzureAPI
+ from .video import VideoProcessor
utils/azure.py ADDED
@@ -0,0 +1,36 @@
+ # utils/azure.py
+ import requests
+ from typing import List
+
+ class AzureAPI:
+     def __init__(self, key: str, endpoint: str, model: str, temp: float = 0.3, top_p: float = 0.75, max_tokens: int = 1024):
+         self.key = key
+         self.endpoint = endpoint
+         self.model = model
+         self.temp = temp
+         self.top_p = top_p
+         self.max_tokens = max_tokens
+         self.version = "2024-02-15-preview"
+
+     def get_caption(self, prompt: str, user_prompt: str, images_base64: List[str]) -> str:
+         headers = {
+             'content-type': 'application/json',
+             "api-key": self.key,
+         }
+         system_msg = {"role": "system", "content": prompt}
+         user_msg = [{"type": "text", "text": user_prompt}]
+         img_msg = [
+             {"type": "image_url", "image_url": {"url": f'data:image/jpg;base64,{image}'}}
+             for image in images_base64
+         ]
+         payload = {
+             'messages': [system_msg, {"role": "user", "content": user_msg + img_msg}],
+             'temperature': self.temp,
+             'top_p': self.top_p,
+             'max_tokens': self.max_tokens,
+             'model': self.model
+         }
+         url = f'{self.endpoint}/openai/deployments/{self.model}/chat/completions?api-version={self.version}'
+         response = requests.post(url, headers=headers, json=payload)
+         response.raise_for_status()
+         return response.json()['choices'][0]['message']['content']
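Not part of the commit: a minimal usage sketch of AzureAPI. The key, endpoint, and image list below are placeholders, and a GPT-4o deployment on the Azure OpenAI resource is assumed.

# Usage sketch (placeholder credentials; an existing Azure OpenAI GPT-4o deployment is assumed).
from utils import AzureAPI
from constraint import SYS_PROMPT, USER_PROMPT

api = AzureAPI(key="<azure-api-key>", endpoint="https://<resource>.openai.azure.com", model="GPT-4o")
images_base64 = []  # base64-encoded frames, e.g. from VideoProcessor.to_base64_list()
caption = api.get_caption(SYS_PROMPT, USER_PROMPT, images_base64)
print(caption)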
utils/video.py ADDED
@@ -0,0 +1,62 @@
+ # utils/video.py
+ from io import BytesIO
+ import av
+ import base64
+ from PIL import Image
+ from typing import List
+ from dataclasses import dataclass
+
+ @dataclass
+ class VideoProcessor:
+     frame_format: str = "JPEG"
+     frame_skip: int = 10
+     group_size: int = 10
+
+     def decode(self, video_path: str) -> List[Image.Image]:
+         frames = []
+         container = av.open(video_path)
+         for i, frame in enumerate(container.decode(video=0)):
+             if i % self.frame_skip:
+                 continue
+             im = frame.to_image()
+             frames.append(im)
+         return frames
+
+     def concatenate(self, frames: List[Image.Image], direction: str = "horizontal") -> Image.Image:
+         widths, heights = zip(*(frame.size for frame in frames))
+
+         if direction == "horizontal":
+             total_width = sum(widths)
+             max_height = max(heights)
+             concatenated_image = Image.new('RGB', (total_width, max_height))
+             x_offset = 0
+             for frame in frames:
+                 concatenated_image.paste(frame, (x_offset, 0))
+                 x_offset += frame.width
+         else:
+             max_width = max(widths)
+             total_height = sum(heights)
+             concatenated_image = Image.new('RGB', (max_width, total_height))
+             y_offset = 0
+             for frame in frames:
+                 concatenated_image.paste(frame, (0, y_offset))
+                 y_offset += frame.height
+
+         return concatenated_image
+
+     def group_and_concatenate(self, frames: List[Image.Image], limit=10) -> List[Image.Image]:
+         # Guard against a zero chunk size (and a zero range step) when fewer frames than group_size were decoded.
+         xs = max(1, len(frames) // self.group_size)
+         groups = [frames[i:i + xs] for i in range(0, len(frames), xs)]
+         sampled_groups = []
+         for group in groups:
+             interval = max(1, len(group) // limit)
+             sampled_groups.append([group[i] for i in range(0, len(group), interval)])
+         return [self.concatenate(group) for group in sampled_groups]
+
+     def to_base64_list(self, images: List[Image.Image]) -> List[str]:
+         base64_list = []
+         for image in images:
+             buffered = BytesIO()
+             image.save(buffered, format=self.frame_format)
+             base64_list.append(base64.b64encode(buffered.getvalue()).decode('utf-8'))
+         return base64_list
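Not part of the commit: a minimal offline sketch of the frame pipeline that run.py drives from the UI. The video path is a placeholder, and the av (PyAV) and Pillow packages are assumed to be installed.

# Frame-pipeline sketch (placeholder video path; requires the av and Pillow packages).
from utils import VideoProcessor

processor = VideoProcessor(frame_format="JPEG", frame_skip=2, group_size=10)
frames = processor.decode("sample.mp4")            # keep every 2nd decoded frame
grids = processor.group_and_concatenate(frames)    # one horizontal strip per group
payload = processor.to_base64_list(grids)          # base64 strings for the API call
print(len(frames), len(grids), len(payload))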