nateraw committed on
Commit
982a043
1 Parent(s): ea24c89

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +157 -0
app.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import gradio as gr
4
+ import librosa
5
+ import numpy as np
6
+ import requests
7
+ import torch
8
+ from PIL import Image
9
+ from torchvision.io import write_video
10
+ from torchvision.transforms.functional import pil_to_tensor
11
+
12
+
13
def get_rgb_image(r=255, g=255, b=255, size=(1400, 900), overlay_im=None, return_pil=False):
    """Create a solid-color image, optionally with an overlay centered on it.

    Args:
        r: Red channel of the background color.
        g: Green channel of the background color.
        b: Blue channel of the background color.
        size: ``(width, height)`` of the generated canvas.
        overlay_im: Optional PIL image composited onto the center of the
            canvas. Non-RGBA overlays are converted to RGBA first, and an
            overlay larger than the canvas is center-cropped to fit.
        return_pil: When true, return the PIL image; otherwise return
            ``np.array(image)``.

    Returns:
        The RGB image, as a PIL image or a NumPy array depending on
        ``return_pil``.
    """
    canvas = Image.new("RGBA", size, (r, g, b, 255))

    if overlay_im is not None:
        # alpha_composite requires the overlay to share the canvas mode (RGBA).
        if overlay_im.mode != "RGBA":
            overlay_im = overlay_im.convert("RGBA")

        img_w, img_h = overlay_im.size
        bg_w, bg_h = canvas.size
        off_x = (bg_w - img_w) // 2
        off_y = (bg_h - img_h) // 2

        # A negative destination is rejected by alpha_composite, so when the
        # overlay is larger than the canvas paste at 0 and crop the overlay
        # to its centered region instead.
        dest = (max(off_x, 0), max(off_y, 0))
        src_x, src_y = max(-off_x, 0), max(-off_y, 0)
        source = (src_x, src_y, src_x + min(img_w, bg_w), src_y + min(img_h, bg_h))
        canvas.alpha_composite(overlay_im, dest, source)

    canvas = canvas.convert("RGB")
    return canvas if return_pil else np.array(canvas)
23
+
24
+
25
def write_frames_between(image_a, image_b, out_dir="./images", n=500, skip_existing=False):
    """Write ``n`` JPEG frames linearly interpolated from image_a to image_b.

    Frames are saved as ``image000000.jpg``, ``image000001.jpg``, ... inside
    ``out_dir``, which is created (including parents) if missing.

    Args:
        image_a: Start image, in any form accepted by ``torch.tensor``.
        image_b: End image, same shape as ``image_a``.
        out_dir: Directory that receives the frames.
        n: Number of frames to write, spaced evenly over ``[0, 1]``.
        skip_existing: When true, frames whose file already exists are left
            untouched.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(exist_ok=True, parents=True)

    # The endpoints do not change between frames; convert them once instead
    # of rebuilding both tensors on every iteration.
    start = torch.tensor(image_a).float()
    end = torch.tensor(image_b).float()

    for i, t in enumerate(np.linspace(0.0, 1.0, n)):
        out_file = out_dir / f"image{i:06d}.jpg"
        # Check the cheap flag first so no stat call is made when not skipping.
        if skip_existing and out_file.exists():
            continue
        blended = torch.lerp(start, end, float(t))
        frame = Image.fromarray(np.around(blended.numpy()).astype(np.uint8))
        frame.save(out_file)
36
+
37
+
38
def get_timesteps_arr(audio_filepath, offset, duration, fps=30, margin=1.0, smooth=0.0):
    """Compute audio-reactive interpolation timesteps for a clip.

    The percussive component of the audio is extracted, its per-frame peak
    mel energy is normalized and cumulatively summed, and the resulting
    curve is resampled to ``int(duration * fps)`` points.

    Args:
        audio_filepath: Path of the audio file to load with librosa.
        offset: Start position passed to ``librosa.load``.
        duration: Clip length passed to ``librosa.load``; also used with
            ``fps`` to set the number of returned timesteps.
        fps: Output video frame rate.
        margin: Margin passed to ``librosa.decompose.hpss``.
        smooth: Blend factor between the audio-driven curve (0.0) and a
            plain linear ramp (1.0).

    Returns:
        1-D array of ``int(duration * fps)`` timesteps.
    """
    y, sr = librosa.load(audio_filepath, offset=offset, duration=duration)

    # librosa.stft hardcoded defaults...
    # n_fft defaults to 2048
    # hop length is win_length // 4
    # win_length defaults to n_fft
    D = librosa.stft(y, n_fft=2048, hop_length=2048 // 4, win_length=2048)

    # Extract percussive elements (the harmonic part is not used)
    _, D_percussive = librosa.decompose.hpss(D, margin=margin)
    y_percussive = librosa.istft(D_percussive, length=len(y))

    # Get normalized melspectrogram
    spec_raw = librosa.feature.melspectrogram(y=y_percussive, sr=sr)
    spec_max = np.amax(spec_raw, axis=0)
    spec_range = np.ptp(spec_max)
    if spec_range > 0:
        spec_norm = (spec_max - np.min(spec_max)) / spec_range
    else:
        # Silent or constant-energy audio: dividing by a zero range would
        # fill the result with NaN. Use a flat profile so the cumulative
        # sum below becomes a plain linear ramp.
        spec_norm = np.ones_like(spec_max)

    # Resize cumsum of spec norm to our desired number of interpolation frames
    x_norm = np.linspace(0, spec_norm.shape[-1], spec_norm.shape[-1])
    y_norm = np.cumsum(spec_norm)
    y_norm /= y_norm[-1]
    x_resize = np.linspace(0, y_norm.shape[-1], int(duration * fps))

    T = np.interp(x_resize, x_norm, y_norm)

    # Apply smoothing
    return T * (1 - smooth) + np.linspace(0.0, 1.0, T.shape[0]) * smooth
66
+
67
+
68
def make_fast_frame_video(
    frames_or_frame_dir="images",
    audio_filepath="music/thoughts.mp3",
    output_filepath="output.mp4",
    sr=44100,
    offset=7,
    duration=5,
    fps=30,
    margin=1.0,
    smooth=0.1,
    frame_filename_ext=".jpg",
):
    """Render a video whose frame progression follows the audio's percussion.

    Args:
        frames_or_frame_dir: Either a list of frame file paths, or a
            directory that is searched recursively for files ending in
            ``frame_filename_ext`` (used in sorted order).
        audio_filepath: Audio file that drives the pacing and is muxed into
            the output video.
        output_filepath: Destination of the written video.
        sr: Sample rate used when loading the audio track for the output.
        offset: Start position within the audio file.
        duration: Length of the clip to render.
        fps: Output video frame rate.
        margin: Forwarded to ``get_timesteps_arr``.
        smooth: Forwarded to ``get_timesteps_arr``.
        frame_filename_ext: Extension used when globbing a frame directory.

    Returns:
        ``output_filepath``.

    Raises:
        ValueError: If no frames are available, or if ``duration * fps``
            yields no output frames.
    """
    if isinstance(frames_or_frame_dir, list):
        frame_filepaths = frames_or_frame_dir
    else:
        frame_filepaths = sorted(Path(frames_or_frame_dir).glob(f"**/*{frame_filename_ext}"))

    num_frames = len(frame_filepaths)
    if num_frames == 0:
        raise ValueError(f"No frames found in {frames_or_frame_dir!r}")

    T = get_timesteps_arr(audio_filepath, offset, duration, fps=fps, margin=margin, smooth=smooth)
    if T.shape[0] == 0:
        raise ValueError("duration * fps must yield at least one output frame")

    # Map each timestep in [0, 1] onto the nearest source frame index.
    yp = np.arange(num_frames)
    xp = np.linspace(0.0, 1.0, num_frames)
    frame_idxs = np.around(np.interp(T, xp, yp)).astype(np.int32)

    # Collect the frames in a list and stack once; concatenating onto a
    # growing tensor inside the loop re-copies every earlier frame each time.
    frame_tensors = []
    for idx in frame_idxs:
        # Close each image file as soon as its pixels have been copied out.
        with Image.open(frame_filepaths[idx]) as im:
            frame_tensors.append(pil_to_tensor(im))
    # (N, C, H, W) -> (N, H, W, C), the layout handed to write_video.
    frames = torch.stack(frame_tensors).permute(0, 2, 3, 1)

    y, sr = librosa.load(audio_filepath, sr=sr, mono=True, offset=offset, duration=duration)
    audio_tensor = torch.tensor(y).unsqueeze(0)

    write_video(
        output_filepath,
        frames,
        fps=fps,
        audio_array=audio_tensor,
        audio_fps=sr,
        audio_codec="aac",
        options={"crf": "23", "pix_fmt": "yuv420p"},
    )

    return output_filepath
113
+
114
+
115
OUTPUT_DIR = "multicolor_images_sm"
N = 500
IMAGE_SIZE = (640, 360)
MAX_DURATION = 10

# Generate the color-transition frames once; the whole step is skipped when
# OUTPUT_DIR already exists.
if not Path(OUTPUT_DIR).exists():
    overlay_image_url = "https://huggingface.co/datasets/nateraw/misc/resolve/main/Group%20122.png"
    # Bound the request time and fail loudly on an HTTP error, so an error
    # page is never handed to the image decoder.
    response = requests.get(overlay_image_url, stream=True, timeout=30)
    response.raise_for_status()
    # Have the raw stream undo any transfer compression before PIL reads it.
    response.raw.decode_content = True
    # get_rgb_image composites onto an RGBA canvas, so normalize the overlay mode.
    overlay_image = Image.open(response.raw, "r").convert("RGBA")
    hex_codes = ["#5e6179", "#ffbb9f", "#dfeaf2", "#75e9e5", "#ff6b6b"]

    # "#rrggbb" -> (r, g, b)
    rgb_vals = [tuple(int(hex_code.lstrip("#")[i : i + 2], 16) for i in (0, 2, 4)) for hex_code in hex_codes]

    # One sub-directory of N frames per consecutive pair of colors.
    for i, (rgb_a, rgb_b) in enumerate(zip(rgb_vals, rgb_vals[1:])):
        out_dir_step = Path(OUTPUT_DIR) / f"{i:06d}"
        image_a = get_rgb_image(*rgb_a, size=IMAGE_SIZE, overlay_im=overlay_image)
        image_b = get_rgb_image(*rgb_b, size=IMAGE_SIZE, overlay_im=overlay_image)
        write_frames_between(image_a, image_b, out_dir=out_dir_step, n=N)
132
+
133
+
134
def fn(audio_filepath):
    """Gradio callback: render the visualizer video for an uploaded audio file.

    The clip starts at the beginning of the audio and is capped at
    ``MAX_DURATION``. Returns the path returned by ``make_fast_frame_video``.
    """
    # Use the full track unless it is longer than the allowed maximum.
    clip_length = min(MAX_DURATION, librosa.get_duration(filename=audio_filepath))
    return make_fast_frame_video(
        frames_or_frame_dir=OUTPUT_DIR,
        audio_filepath=audio_filepath,
        output_filepath="out.mp4",
        sr=44100,
        offset=0,
        duration=clip_length,
        fps=18,
    )
144
+
145
+
146
# Gradio app: takes an audio file path and shows the rendered video.
interface = gr.Interface(
    fn=fn,
    inputs=gr.Audio(type="filepath"),
    outputs="video",
    examples=[["https://huggingface.co/datasets/nateraw/misc/resolve/main/quick_example_loop.wav"]],
    title="Music Visualizer",
    description="Create a simple music visualizer video with a cute 🤗 logo on top",
    article=(
        "<p style='text-align: center'>"
        "<a href='https://github.com/nateraw/my-huggingface-repos/tree/main/spaces/music-visualizer' target='_blank'>"
        "Github Repo</a></p>"
    ),
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    interface.launch(debug=True)