Upload app.py with huggingface_hub
app.py
ADDED
@@ -0,0 +1,157 @@
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
import requests
import torch
from PIL import Image
from torchvision.io import write_video
from torchvision.transforms.functional import pil_to_tensor


def get_rgb_image(r=255, g=255, b=255, size=(1400, 900), overlay_im=None, return_pil=False):
    """Make a solid RGB background, optionally alpha-compositing an overlay at its center."""
    image = Image.new("RGBA", size, (r, g, b, 255))

    if overlay_im:
        # Center the overlay on the background (overlay_im must be RGBA for alpha_composite)
        img_w, img_h = overlay_im.size
        bg_w, bg_h = image.size
        offset = ((bg_w - img_w) // 2, (bg_h - img_h) // 2)
        image.alpha_composite(overlay_im, offset)
    image = image.convert("RGB")
    return image if return_pil else np.array(image)

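
# Quick sanity check (hypothetical values, not part of the app's flow): a plain
# 640x360 coral-colored frame with no overlay comes back as an HxWx3 uint8 array.
#
#   frame = get_rgb_image(255, 107, 107, size=(640, 360))
#   frame.shape  # -> (360, 640, 3)
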
def write_frames_between(image_a, image_b, out_dir="./images", n=500, skip_existing=False):
    """Write n JPEG frames that linearly interpolate from image_a to image_b."""
    out_dir = Path(out_dir)
    out_dir.mkdir(exist_ok=True, parents=True)

    for i, t in enumerate(np.linspace(0.0, 1.0, n)):
        out_file = out_dir / f"image{i:06d}.jpg"
        if out_file.exists() and skip_existing:
            continue
        # Pixel-wise linear interpolation: a + t * (b - a)
        im_arr = torch.lerp(torch.tensor(image_a).float(), torch.tensor(image_b).float(), float(t))
        im = Image.fromarray(np.around(im_arr.numpy()).astype(np.uint8))
        im.save(out_file)

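
# For intuition (not executed by the app): at t = 0.5 each pixel is the average
# of the endpoints, so interpolating pure black toward pure white gives mid-gray.
#
#   a = np.zeros((2, 2, 3), dtype=np.uint8)      # black
#   b = np.full((2, 2, 3), 255, dtype=np.uint8)  # white
#   torch.lerp(torch.tensor(a).float(), torch.tensor(b).float(), 0.5)  # -> all 127.5
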
def get_timesteps_arr(audio_filepath, offset, duration, fps=30, margin=1.0, smooth=0.0):
    """Map each video frame time to interpolation progress in [0, 1], driven by percussive energy."""
    y, sr = librosa.load(audio_filepath, offset=offset, duration=duration)

    # librosa.stft hardcoded defaults...
    # n_fft defaults to 2048
    # hop length is win_length // 4
    # win_length defaults to n_fft
    D = librosa.stft(y, n_fft=2048, hop_length=2048 // 4, win_length=2048)

    # Extract percussive elements
    D_harmonic, D_percussive = librosa.decompose.hpss(D, margin=margin)
    y_percussive = librosa.istft(D_percussive, length=len(y))

    # Get normalized melspectrogram
    spec_raw = librosa.feature.melspectrogram(y=y_percussive, sr=sr)
    spec_max = np.amax(spec_raw, axis=0)
    spec_norm = (spec_max - np.min(spec_max)) / np.ptp(spec_max)

    # Resize cumsum of spec norm to our desired number of interpolation frames
    x_norm = np.linspace(0, spec_norm.shape[-1], spec_norm.shape[-1])
    y_norm = np.cumsum(spec_norm)
    y_norm /= y_norm[-1]
    x_resize = np.linspace(0, y_norm.shape[-1], int(duration * fps))

    T = np.interp(x_resize, x_norm, y_norm)

    # Blend with a straight line so the curve keeps moving during quiet passages
    return T * (1 - smooth) + np.linspace(0.0, 1.0, T.shape[0]) * smooth

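
# The returned curve is a monotonically increasing array with one entry per video
# frame: nearly flat where the percussive track is quiet, steep on hits. A sketch
# with a hypothetical file path:
#
#   T = get_timesteps_arr("song.wav", offset=0, duration=5, fps=30, smooth=0.1)
#   T.shape  # -> (150,), i.e. duration * fps entries rising from ~0.0 to 1.0
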
def make_fast_frame_video(
    frames_or_frame_dir="images",
    audio_filepath="music/thoughts.mp3",
    output_filepath="output.mp4",
    sr=44100,
    offset=7,
    duration=5,
    fps=30,
    margin=1.0,
    smooth=0.1,
    frame_filename_ext=".jpg",
):
    """Pick pre-rendered frames along the audio-driven timestep curve and mux them with the audio."""
    if isinstance(frames_or_frame_dir, list):
        frame_filepaths = frames_or_frame_dir
    else:
        frame_filepaths = sorted(Path(frames_or_frame_dir).glob(f"**/*{frame_filename_ext}"))

    num_frames = len(frame_filepaths)
    T = get_timesteps_arr(audio_filepath, offset, duration, fps=fps, margin=margin, smooth=smooth)
    yp = np.arange(num_frames)
    xp = np.linspace(0.0, 1.0, num_frames)

    # Map each timestep in [0, 1] to the index of the nearest pre-rendered frame
    frame_idxs = np.around(np.interp(T, xp, yp)).astype(np.int32)

    frames = None
    for img_path in [frame_filepaths[x] for x in frame_idxs]:
        frame = pil_to_tensor(Image.open(img_path)).unsqueeze(0)
        frames = frame if frames is None else torch.cat([frames, frame])
    # write_video expects frames as (T, H, W, C)
    frames = frames.permute(0, 2, 3, 1)

    y, sr = librosa.load(audio_filepath, sr=sr, mono=True, offset=offset, duration=duration)
    audio_tensor = torch.tensor(y).unsqueeze(0)

    write_video(
        output_filepath,
        frames,
        fps=fps,
        audio_array=audio_tensor,
        audio_fps=sr,
        audio_codec="aac",
        options={"crf": "23", "pix_fmt": "yuv420p"},
    )

    return output_filepath

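
# Standalone usage sketch (hypothetical paths; the Space itself calls this
# through fn() below):
#
#   make_fast_frame_video(
#       "multicolor_images_sm", "song.wav", "out.mp4", offset=0, duration=5, fps=18
#   )
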
OUTPUT_DIR = "multicolor_images_sm"
N = 500
IMAGE_SIZE = (640, 360)
MAX_DURATION = 10

# Pre-render the interpolated background frames once, on first startup
if not Path(OUTPUT_DIR).exists():
    overlay_image_url = "https://huggingface.co/datasets/nateraw/misc/resolve/main/Group%20122.png"
    # The PNG is assumed to carry an alpha channel (alpha_composite needs RGBA)
    overlay_image = Image.open(requests.get(overlay_image_url, stream=True).raw, "r")
    hex_codes = ["#5e6179", "#ffbb9f", "#dfeaf2", "#75e9e5", "#ff6b6b"]

    # Parse each "#rrggbb" string into an (r, g, b) tuple of ints
    rgb_vals = [tuple(int(code.lstrip("#")[i : i + 2], 16) for i in (0, 2, 4)) for code in hex_codes]

    # Render N interpolation frames for each consecutive pair of colors
    for i, (rgb_a, rgb_b) in enumerate(zip(rgb_vals, rgb_vals[1:])):
        out_dir_step = Path(OUTPUT_DIR) / f"{i:06d}"
        image_a = get_rgb_image(*rgb_a, size=IMAGE_SIZE, overlay_im=overlay_image)
        image_b = get_rgb_image(*rgb_b, size=IMAGE_SIZE, overlay_im=overlay_image)
        write_frames_between(image_a, image_b, out_dir=out_dir_step, n=N)

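
# Worked example of the hex parsing above: "#ff6b6b" strips to "ff6b6b", which is
# sliced at offsets 0, 2, 4 into "ff", "6b", "6b" and parsed base-16:
#
#   int("ff", 16), int("6b", 16)  # -> 255, 107, so "#ff6b6b" -> (255, 107, 107)
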
def fn(audio_filepath):
    """Gradio handler: render a video for (at most) the first MAX_DURATION seconds of the upload."""
    return make_fast_frame_video(
        OUTPUT_DIR,
        audio_filepath,
        "out.mp4",
        sr=44100,
        offset=0,
        duration=min(MAX_DURATION, librosa.get_duration(filename=audio_filepath)),
        fps=18,
    )

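
# librosa.get_duration returns the clip length in seconds, so the min() clamps the
# render to MAX_DURATION: e.g. min(10, 3.2) -> 3.2, but min(10, 95.0) -> 10. Note
# that newer librosa versions (0.10+) renamed this keyword from filename= to path=.
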
interface = gr.Interface(
    fn=fn,
    inputs=gr.Audio(type="filepath"),
    outputs="video",
    title="Music Visualizer",
    description="Create a simple music visualizer video with a cute 🤗 logo on top",
    article="<p style='text-align: center'><a href='https://github.com/nateraw/my-huggingface-repos/tree/main/spaces/music-visualizer' target='_blank'>Github Repo</a></p>",
    examples=[["https://huggingface.co/datasets/nateraw/misc/resolve/main/quick_example_loop.wav"]],
)

if __name__ == "__main__":
    interface.launch(debug=True)
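
# To run locally (assuming gradio, librosa, torch, torchvision, requests, and
# ffmpeg are installed): `python app.py`, then open the printed local URL.
# debug=True surfaces errors in the console rather than failing silently.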