Spaces:
Running
Running
kevinwang676
committed on
Commit
•
4aa0161
1
Parent(s):
42503c1
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re, os
|
2 |
+
import requests
|
3 |
+
import json
|
4 |
+
|
5 |
+
# Request headers used by search_bilibili: a desktop Chrome user agent.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/121.0.0.0 Safari/537.36"
    ),
}

# Regex for a protocol-relative Bilibili video link; it runs up to (but not
# including) the next double quote.
pattern = r'//www\.bilibili\.com/video[^"]*'
|
9 |
+
|
10 |
+
def get_bilibili_video_id(url):
    """Extract the video id from a Bilibili video URL.

    The id is the run of ASCII letters and digits that follows ``/video/``.
    A trailing slash after the id is accepted but not required, so URLs such
    as ``.../video/BVxxxx?from=search`` work too.

    Args:
        url: A Bilibili video URL (absolute or protocol-relative).

    Returns:
        The id string, e.g. ``"BV1GJ411x7h7"``.

    Raises:
        ValueError: If ``url`` contains no ``/video/<id>`` segment.
    """
    match = re.search(r'/video/([a-zA-Z0-9]+)', url)
    if match is None:
        # Fail with a clear message instead of AttributeError on None.group().
        raise ValueError(f"No Bilibili video id found in URL: {url!r}")
    return match.group(1)
|
14 |
+
|
15 |
+
# Get bilibili audio
|
16 |
+
def find_first_appearance_with_neighborhood(text, pattern):
    """Return the text of the first match of ``pattern`` in ``text``.

    Args:
        text: The string to search.
        pattern: A regular expression accepted by ``re.search``.

    Returns:
        The whole matched substring, or ``None`` when nothing matches.
    """
    found = re.search(pattern, text)
    return found.group() if found is not None else None
|
23 |
+
|
24 |
+
def search_bilibili(keyword):
    """Search Bilibili and return the URL of the first video link on the page.

    Keywords starting with ``"BV"`` are searched with ``duration=1`` only;
    any other keyword additionally sends ``tids=3`` and ``page=1``.

    Args:
        keyword: A BV id or free-text search term.

    Returns:
        An ``https:`` URL built from the first link in the result page that
        matches the module-level ``pattern``.

    Raises:
        ValueError: If the result page contains no video link.
        requests.RequestException: If the HTTP request fails or times out.
    """
    params = {"keyword": keyword, "duration": 1}
    if not keyword.startswith("BV"):
        params.update(tids=3, page=1)

    # Passing ``params`` lets requests escape reserved characters ("&", "#",
    # "+", ...) in the keyword instead of letting them corrupt the query.
    response = requests.get(
        "https://search.bilibili.com/all",
        params=params,
        headers=headers,
        timeout=30,
    )

    link = find_first_appearance_with_neighborhood(response.text, pattern)
    if link is None:
        raise ValueError(f"No Bilibili video found for keyword: {keyword!r}")

    # The matched link is protocol-relative ("//www.bilibili.com/...").
    return "https:" + link
|
33 |
+
|
34 |
+
def get_response(html_url):
    """Fetch ``html_url`` with a Bilibili referer and a browser user agent.

    Args:
        html_url: The URL to request (a video page or a media stream URL).

    Returns:
        The ``requests.Response`` of a successful request.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # Named differently from the module-level ``headers`` to avoid shadowing.
    request_headers = {
        "referer": "https://www.bilibili.com/",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    }
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(html_url, headers=request_headers, timeout=30)
    # Surface HTTP errors here rather than handing an error page to callers
    # that would parse it or write it to disk as audio.
    response.raise_for_status()
    return response
|
41 |
+
|
42 |
+
def get_video_info(html_url):
    """Read the first audio and video stream URLs from a Bilibili video page.

    The page embeds a JSON document in a ``window.__playinfo__`` script tag;
    the stream URLs are read from its ``data.dash`` section.

    Args:
        html_url: URL of the Bilibili video page.

    Returns:
        A ``(audio_url, video_url)`` tuple. The audio URL is the first entry
        of ``backupUrl`` when that list is present and non-empty, otherwise
        ``baseUrl``. The video URL is the first video stream's ``baseUrl``.

    Raises:
        ValueError: If the page has no ``window.__playinfo__`` script.
        KeyError, IndexError: If the embedded JSON lacks the expected fields.
    """
    response = get_response(html_url)
    found = re.findall(r'<script>window\.__playinfo__=(.*?)</script>', response.text)
    if not found:
        raise ValueError(f"No playinfo found on page: {html_url!r}")

    dash = json.loads(found[0])['data']['dash']
    first_audio = dash['audio'][0]

    # ``backupUrl`` may be missing, None or an empty list; only index into it
    # when it actually has an entry.
    backup_urls = first_audio.get('backupUrl')
    audio_url = backup_urls[0] if backup_urls else first_audio['baseUrl']

    video_url = dash['video'][0]['baseUrl']
    return audio_url, video_url
|
52 |
+
|
53 |
+
def save_audio(title, html_url):
    """Download the audio stream of a Bilibili video page to ``<title>.mp3``.

    Only the audio stream is fetched and saved; the video stream URL returned
    by ``get_video_info`` is ignored.

    Args:
        title: Output file name without the ``.mp3`` extension.
        html_url: URL of the Bilibili video page.
    """
    audio_url, _ = get_video_info(html_url)
    payload = get_response(audio_url).content

    with open(title + '.mp3', mode='wb') as out_file:
        out_file.write(payload)
    print("音乐内容保存完成")
|
66 |
+
|
67 |
+
from uvr5.vr import AudioPre
|
68 |
+
# Directory that holds the UVR5 model weights.
weight_uvr5_root = "uvr5/uvr_model"

# Model names found in that directory: every ".pth" file (extension removed)
# plus any entry whose name contains "onnx".
uvr5_names = [
    entry.replace(".pth", "")
    for entry in os.listdir(weight_uvr5_root)
    if entry.endswith(".pth") or "onnx" in entry
]

func = AudioPre

# Both separators are created once at import time, on CUDA in half precision.
pre_fun_hp2 = func(
    agg=10,
    model_path=os.path.join(weight_uvr5_root, "UVR-HP2.pth"),
    device="cuda",
    is_half=True,
)

pre_fun_hp5 = func(
    agg=10,
    model_path=os.path.join(weight_uvr5_root, "UVR-HP5.pth"),
    device="cuda",
    is_half=True,
)
|
88 |
+
|
89 |
+
import webrtcvad
|
90 |
+
from pydub import AudioSegment
|
91 |
+
from pydub.utils import make_chunks
|
92 |
+
|
93 |
+
import os
|
94 |
+
import librosa
|
95 |
+
import soundfile
|
96 |
+
import gradio as gr
|
97 |
+
|
98 |
+
|
99 |
+
def vad(audio_name, output_path="voiced_audio.wav"):
    """Keep only the speech frames of a WAV file and export them as WAV.

    The audio is converted to 48 kHz, mono, 16-bit PCM, cut into 30 ms frames
    and each frame is classified with WebRTC VAD at aggressiveness 3. Frames
    classified as speech are joined in their original order.

    Args:
        audio_name: Path of the input WAV file.
        output_path: Where to write the result. Defaults to
            ``"voiced_audio.wav"``, the path previously hard-coded here.

    Returns:
        ``output_path``.
    """
    audio = AudioSegment.from_file(audio_name, format="wav")
    # WebRTC VAD only accepts 16-bit mono PCM at 8000, 16000, 32000 or
    # 48000 Hz, so normalise all three properties rather than assuming the
    # input is already 16-bit.
    audio = audio.set_frame_rate(48000).set_channels(1).set_sample_width(2)

    detector = webrtcvad.Vad()
    # Aggressiveness is an integer from 0 to 3; 3 is the most aggressive.
    detector.set_mode(3)

    frame_duration = 30  # Duration of a frame in ms
    frame_bytes = int(audio.frame_rate * frame_duration / 1000) * audio.sample_width

    # Skip a short chunk (only the trailing one can be short) because the
    # detector needs full-length frames.
    voiced_frames = [
        frame.raw_data
        for frame in make_chunks(audio, frame_duration)
        if len(frame.raw_data) >= frame_bytes
        and detector.is_speech(frame.raw_data, audio.frame_rate)
    ]

    # Join the raw bytes once instead of summing AudioSegments, which copies
    # the growing result for every frame.
    voiced_audio = AudioSegment(
        data=b"".join(voiced_frames),
        sample_width=audio.sample_width,
        frame_rate=audio.frame_rate,
        channels=audio.channels,
    )
    voiced_audio.export(output_path, format="wav")
    return output_path
|
129 |
+
|
130 |
+
|
131 |
+
|
132 |
+
|
133 |
+
# Length of the clip extracted from the source song, in milliseconds.
_CLIP_LENGTH_MS = 45000
# Reference songs longer than this are trimmed before separation.
_REF_TRIM_THRESHOLD_MS = 120000
# Window kept from a long reference song, in milliseconds.
_REF_CLIP_START_MS = 10000
_REF_CLIP_END_MS = 120000


def _download_audio(video_identifier, audio_path):
    """Download the audio stream of a Bilibili video page to ``audio_path``."""
    print(video_identifier)
    audio_url = get_video_info(video_identifier)[0]
    print(audio_url)
    audio_content = get_response(audio_url).content
    with open(audio_path, mode="wb") as f:
        f.write(audio_content)


def _separate_vocals(audio_path, name, split_model):
    """Run the selected UVR5 model on ``audio_path`` and return the output paths."""
    pre_fun = pre_fun_hp2 if split_model == "UVR-HP2" else pre_fun_hp5
    os.makedirs("output", exist_ok=True)
    out_dir = f"./output/{split_model}/{name}/"
    pre_fun._path_audio_(audio_path, out_dir, out_dir, "wav")
    # NOTE(review): these names assume _path_audio_ writes
    # "vocal_<input file name>_10.wav" / "instrument_<...>_10.wav" - confirm
    # against uvr5.vr.AudioPre.
    return (
        f"{out_dir}vocal_{name}.wav_10.wav",
        f"{out_dir}instrument_{name}.wav_10.wav",
    )


def _remove_if_exists(path):
    """Delete ``path`` if it is present."""
    if os.path.exists(path):
        os.remove(path)


def youtube_downloader(
    video_identifier,
    filename,
    split_model,
    start_time
):
    """Download a Bilibili song, cut a 45 s clip and split vocals from accompaniment.

    Despite its name, this function downloads from Bilibili (through
    ``get_video_info``), not from YouTube.

    Args:
        video_identifier: URL of the Bilibili video page.
        filename: Base name for the temporary file and the output directory;
            surrounding whitespace is stripped.
        split_model: ``"UVR-HP2"`` selects the HP2 model; anything else
            selects HP5.
        start_time: Clip start in seconds. ``None`` is treated as 0.

    Returns:
        A ``(vocal_path, instrument_path)`` tuple under
        ``./output/<split_model>/<filename>/``.
    """
    name = filename.strip()
    audio_path = name + ".wav"
    try:
        _download_audio(video_identifier, audio_path)

        audio_orig = AudioSegment.from_file(audio_path)
        start_ms = (start_time or 0) * 1000
        # Clamp the end to the track length so short tracks yield whatever
        # remains after the start position.
        end_ms = min(start_ms + _CLIP_LENGTH_MS, len(audio_orig))

        # Overwrite the download with the extracted clip as real WAV data.
        audio_orig[start_ms:end_ms].export(audio_path, format="wav")
        return _separate_vocals(audio_path, name, split_model)
    finally:
        # Remove the temporary file even when a step above fails.
        _remove_if_exists(audio_path)


def youtube_downloader_100s(
    video_identifier,
    filename,
    split_model
):
    """Download a Bilibili song and split vocals from accompaniment.

    Tracks longer than 120 s are first trimmed to the 10 s - 120 s window;
    shorter tracks are processed as downloaded. Despite its name, this
    function downloads from Bilibili, not from YouTube.

    Args:
        video_identifier: URL of the Bilibili video page.
        filename: Base name for the temporary file and the output directory;
            surrounding whitespace is stripped.
        split_model: ``"UVR-HP2"`` selects the HP2 model; anything else
            selects HP5.

    Returns:
        A ``(vocal_path, instrument_path)`` tuple under
        ``./output/<split_model>/<filename>/``.
    """
    name = filename.strip()
    audio_path = name + ".wav"
    try:
        _download_audio(video_identifier, audio_path)

        audio_orig = AudioSegment.from_file(audio_path)
        if len(audio_orig) > _REF_TRIM_THRESHOLD_MS:
            segment = audio_orig[_REF_CLIP_START_MS:_REF_CLIP_END_MS]
            segment.export(audio_path, format="wav")

        return _separate_vocals(audio_path, name, split_model)
    finally:
        # Remove the temporary file even when a step above fails.
        _remove_if_exists(audio_path)
|
211 |
+
|
212 |
+
|
213 |
+
def convert(start_time, song_name_src, song_name_ref, check_song, key_shift, vocal_vol, inst_vol):
    """Produce an AI cover: the source song sung with the reference voice.

    Steps: look both songs up on Bilibili, separate their vocals with the
    UVR-HP5 model, reduce the reference vocal to voiced frames, run
    ``inference.py`` to convert the source vocal, then mix the converted
    vocal with the source accompaniment.

    Args:
        start_time: Start of the source clip, in seconds.
        song_name_src: Name or BV id of the song to cover.
        song_name_ref: Name or BV id of a song containing the target voice.
        check_song: True when the reference is singing; when false,
            ``--speech_enroll`` is passed to ``inference.py``.
        key_shift: Value passed to ``inference.py`` as ``--key_shift``.
        vocal_vol: Gain in dB applied to the converted vocal.
        inst_vol: Gain in dB applied to the accompaniment.

    Returns:
        Path of the mixed WAV file.

    Raises:
        subprocess.CalledProcessError: If ``inference.py`` exits with a
            non-zero status.
    """
    import subprocess
    import sys

    split_model = "UVR-HP5"

    song_name_ref = song_name_ref.strip().replace(" ", "")
    video_identifier = search_bilibili(song_name_ref)
    song_id = get_bilibili_video_id(video_identifier)

    song_name_src = song_name_src.strip().replace(" ", "")
    video_identifier_src = search_bilibili(song_name_src)
    song_id_src = get_bilibili_video_id(video_identifier_src)

    # Reuse a previously separated reference vocal when it exists. Testing
    # the file itself (not just its directory) avoids loading a missing file
    # after an earlier run that failed part-way.
    ref_vocal_path = f"./output/{split_model}/{song_id}/vocal_{song_id}.wav_10.wav"
    if not os.path.isfile(ref_vocal_path):
        ref_vocal_path = youtube_downloader_100s(video_identifier, song_id, split_model)[0]
    audio, sr = librosa.load(ref_vocal_path, sr=24000, mono=True)
    soundfile.write("audio_ref.wav", audio, sr)

    # Writes voiced_audio.wav, which is the reference input for inference.
    vad("audio_ref.wav")

    # The source clip is always regenerated because it depends on start_time.
    src_vocal_path = youtube_downloader(video_identifier_src, song_id_src, split_model, start_time)[0]
    audio_src, sr_src = librosa.load(src_vocal_path, sr=24000, mono=True)
    soundfile.write("audio_src.wav", audio_src, sr_src)

    # Drop any stale result so an old file is never mistaken for a new one.
    if os.path.isfile("output_svc/NeuCoSVCv2.wav"):
        os.remove("output_svc/NeuCoSVCv2.wav")

    # Argument list instead of a shell string, run with the interpreter that
    # is running this app; check=True turns a failed inference into an error
    # instead of a later "file not found".
    command = [
        sys.executable, "inference.py",
        "--src_wav_path", "audio_src.wav",
        "--ref_wav_path", "voiced_audio.wav",
        "--key_shift", str(key_shift),
    ]
    if not check_song:
        command.append("--speech_enroll")
    subprocess.run(command, check=True)

    audio_vocal = AudioSegment.from_file("output_svc/NeuCoSVCv2.wav", format="wav")
    audio_inst = AudioSegment.from_file(f"output/{split_model}/{song_id_src}/instrument_{song_id_src}.wav_10.wav", format="wav")

    # Apply the requested gains (in dB) to each track.
    audio_vocal = audio_vocal + vocal_vol
    audio_inst = audio_inst + inst_vol

    # Mix the accompaniment under the converted vocal.
    combined_audio = audio_vocal.overlay(audio_inst)

    # The song name is user input: replace characters that are path
    # separators or invalid in file names before using it in a path.
    safe_name = re.sub(r'[\\/:*?"<>|]', "_", song_name_src)
    output_path = f"{safe_name}-AI翻唱.wav"
    combined_audio.export(output_path, format="wav")

    return output_path
|
261 |
+
|
262 |
+
|
263 |
+
|
264 |
+
# Gradio UI: inputs in the left column, resulting audio in the right column.
app = gr.Blocks()

with app:
    gr.Markdown("# <center>🥳💕🎶 NeuCoSVC v2 AI歌手全明星,无需训练、一键翻唱、重磅更新!</center>")
    gr.Markdown("## <center>🌟 只需 1 个歌曲名,一键翻唱任意歌手的任意歌曲,支持说话语音翻唱,随时随地,听你想听!</center>")
    gr.Markdown("### <center>🌊 NeuCoSVC v2 先享版 Powered by Tencent ARC Lab & Tsinghua University 💕</center>")
    with gr.Row():
        with gr.Column():
            with gr.Row():
                inp1 = gr.Textbox(label="请填写想要AI翻唱的歌曲或BV号", info="直接填写BV号的得到的歌曲最匹配,也可以选择填写“歌曲名+歌手名”")
                # The label's garbled character is restored to "或", matching inp1.
                inp2 = gr.Textbox(label="请填写含有目标音色的歌曲或BV号", info="例如您希望使用AI周杰伦的音色,就在此处填写周杰伦的任意一首歌")
            with gr.Row():
                inp0 = gr.Number(value=0, label="起始时间 (秒)", info="此程序将自动从起始时间开始提取45秒的翻唱歌曲")
                inp3 = gr.Checkbox(label="参考音频是否为歌曲演唱,默认为是", info="如果参考音频为正常说话语音,请取消打勾", value=True)
                inp4 = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="歌曲人声升降调", info="默认为0,+2为升高2个key,以此类推")
            with gr.Row():
                inp5 = gr.Slider(minimum=-3, maximum=3, value=0, step=1, label="调节人声音量,默认为0")
                inp6 = gr.Slider(minimum=-3, maximum=3, value=0, step=1, label="调节伴奏音量,默认为0")
            btn = gr.Button("一键开启AI翻唱之旅吧💕", variant="primary")
        with gr.Column():
            out = gr.Audio(label="AI歌手为您倾情演唱的歌曲", type="filepath", interactive=True)

        # Input order matches convert(): start time, source song, reference
        # song, is-singing flag, key shift, vocal gain, accompaniment gain.
        btn.click(convert, [inp0, inp1, inp2, inp3, inp4, inp5, inp6], out)

    gr.Markdown("### <center>注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。</center>")
    gr.HTML('''
        <div class="footer">
                    <p>🌊🏞️🎶 - 江水东流急,滔滔无尽声。 明·顾璘
                    </p>
        </div>
    ''')

app.queue().launch(show_error=True)
|