|
|
|
"""
|
|
Created on Tue Nov 22 18:32:21 2022
|
|
|
|
@author: renyu
|
|
"""
|
|
|
|
|
|
|
|
import pysrt
|
|
import pandas as pd
|
|
import re
|
|
|
|
import shutil
|
|
import pysrt
|
|
import ffmpeg
|
|
import pydub
|
|
import os, sys, glob, pathlib
|
|
|
|
# Folder that is searched for the source audio/video and subtitle files.
srcDir = 'shortDir'
# Folder under which one dataset sub-folder per processed file is written.
tgtDir = 'shortDir_20'

# Make sure both folders exist before anything is read or written.
for _folder in (srcDir, tgtDir):
    os.makedirs(_folder, exist_ok=True)
|
|
|
|
def _group_subtitles(subtitles, max_ms):
    """Group consecutive subtitles into clips that span at most ``max_ms``.

    Each item of ``subtitles`` must expose ``start.ordinal`` and
    ``end.ordinal`` (used here as millisecond offsets) and ``text``.

    Yields ``(start_ms, end_ms, text)`` tuples, where ``start_ms`` is the start
    of the first subtitle in the clip, ``end_ms`` is the end of the last one,
    and ``text`` is the subtitle texts joined by single spaces with embedded
    newlines replaced by spaces.

    A subtitle that would push the open clip past ``max_ms`` closes that clip
    and becomes the first subtitle of the next one.  A subtitle that is longer
    than ``max_ms`` on its own cannot fit in any clip; it is reported with
    ``print`` and skipped.
    """
    # ``None`` (not 0) marks "no clip open": 0 ms is a valid subtitle start.
    clip_start = None
    clip_end = None
    parts = []

    for sub in subtitles:
        sub_start = sub.start.ordinal
        sub_end = sub.end.ordinal

        # Close the open clip if this subtitle would make it too long.
        if clip_start is not None and sub_end - clip_start > max_ms:
            yield clip_start, clip_end, ' '.join(parts)
            clip_start, clip_end, parts = None, None, []

        if clip_start is None:
            if sub_end - sub_start > max_ms:
                print(f'skipped subtitle {sub_start}-{sub_end} ms: '
                      f'longer than {max_ms} ms')
                continue
            clip_start = sub_start

        clip_end = sub_end
        parts.append(sub.text.replace('\n', ' '))

    # Flush whatever is still open after the last subtitle.
    if clip_start is not None:
        yield clip_start, clip_end, ' '.join(parts)


def ryCreateDataset(fnBase, srcDir= srcDir, timeLimit= 20):
    """Cut one recording into subtitle-aligned clips plus a ``metadata.csv``.

    Parameters
    ----------
    fnBase : str
        Path or name of the media file; the directory part and a trailing
        ``.mp4`` / ``.mp3`` are removed to obtain the base name.
    srcDir : str
        Directory containing ``<base>.mp3`` (or ``<base>.mp4``) and the
        subtitle file.
    timeLimit : int or float
        Maximum clip length in seconds.

    The subtitle file is the first existing one of ``<base>.zh-TW.srt``,
    ``<base>.zh-CN.srt`` and ``<base>.zh-Hans.srt``, falling back to
    ``<base>.srt``; the plain ``.srt`` is also forced whenever the chosen path
    contains ``'Combine'``.  If ``<base>.mp3`` is missing it is produced from
    ``<base>.mp4`` with ffmpeg.

    Output goes to ``<tgtDir>/<base>/`` (``tgtDir`` is the module-level
    setting): clips are written as ``data/<base>_NNNN.mp3`` and
    ``metadata.csv`` gets a ``file_name,transcription`` header followed by one
    row per clip.

    Raises
    ------
    subprocess.CalledProcessError
        If the ffmpeg conversion exits with a non-zero status.
    """
    # Function-scope imports keep this block self-contained.
    import csv
    import subprocess

    fnBase = os.path.basename(fnBase).removesuffix('.mp4').removesuffix('.mp3')

    # Prefer a language-tagged subtitle file, else the untagged one.
    fn_plain_srt = f"{srcDir}/{fnBase}.srt"
    tagged = [f"{srcDir}/{fnBase}.{tag}.srt"
              for tag in ('zh-TW', 'zh-CN', 'zh-Hans')]
    fn_srt = next((p for p in tagged if os.path.isfile(p)), fn_plain_srt)
    if 'Combine' in fn_srt:
        fn_srt = fn_plain_srt

    fn_mp3 = f"{srcDir}/{fnBase}.mp3"
    fn_mp4 = f"{srcDir}/{fnBase}.mp4"

    if not os.path.isfile(fn_mp3):
        # Argument list instead of a shell string: file names containing
        # quotes or other shell metacharacters are passed through verbatim,
        # and a failed conversion is reported instead of being ignored.
        subprocess.run(['ffmpeg', '-i', fn_mp4, fn_mp3], check=True)

    mp3 = pydub.AudioSegment.from_mp3(fn_mp3)
    srt = pysrt.open(fn_srt)

    out_dir = f'{tgtDir}/{fnBase}'
    # makedirs also creates ``out_dir`` itself on the way to ``data``.
    os.makedirs(f'{out_dir}/data', exist_ok=True)

    max_ms = 1000 * timeLimit  # clips are sliced by millisecond offsets

    # newline='' as recommended by the csv module; rows end with '\n'.
    with open(f'{out_dir}/metadata.csv', 'w',
              encoding='utf8', newline='') as fp:
        fp.write('file_name,transcription\n')
        # csv.writer handles quoting/escaping, so quotes or commas inside a
        # transcription no longer corrupt the row.
        writer = csv.writer(fp, quoting=csv.QUOTE_ALL, lineterminator='\n')

        clips = _group_subtitles(srt, max_ms)
        for k, (t0, t1, text) in enumerate(clips):
            fn = f'{fnBase}_{k:04d}.mp3'
            mp3[t0:t1].export(f'{out_dir}/data/{fn}')
            writer.writerow([f'data/{fn}', text])
|
|
|
|
|
|
# Process every MP3 found directly inside the source folder, echoing each
# path before its dataset is built.
cL = glob.glob(f'{srcDir}/*.mp3')
for c in cL:
    print(c)
    ryCreateDataset(fnBase=c, srcDir=srcDir)
|
|
|