# -*- coding: utf-8 -*-
"""
Created on Tue Nov 22 18:32:21 2022

@author: renyu

Cut every .mp3 found in ``srcDir`` into short clips that follow the cues of
its .srt subtitle file, and write the clips together with a ``metadata.csv``
(columns ``file_name,transcription``) under ``tgtDir/<base name>/``.
"""
#
# cutMp3bySrt.py

import csv
import glob
import os
import pathlib
import re
import shutil
import subprocess
import sys

import ffmpeg
import pandas as pd
import pydub
import pysrt

srcDir = 'shortDir'
tgtDir = 'shortDir_20'

os.makedirs(srcDir, exist_ok=True)
os.makedirs(tgtDir, exist_ok=True)


def _find_srt(base, src_dir):
    """Return the subtitle path to use for *base* inside *src_dir*.

    The language-tagged files are tried in the order zh-TW, zh-CN, zh-Hans;
    the first one that exists wins.  If none exists the untagged
    ``<base>.srt`` path is returned, whether or not that file exists.
    """
    plain = f"{src_dir}/{base}.srt"
    chosen = plain
    for tag in ('zh-TW', 'zh-CN', 'zh-Hans'):
        candidate = f"{src_dir}/{base}.{tag}.srt"
        if os.path.isfile(candidate):
            chosen = candidate
            break
    # Any path containing 'Combine' always uses the untagged subtitle file.
    if 'Combine' in chosen:
        chosen = plain
    return chosen


def _group_cues(cues, limit_ms):
    """Greedily merge consecutive cues into chunks spanning at most *limit_ms*.

    Parameters
    ----------
    cues : iterable of (start_ms, end_ms, text)
    limit_ms : maximum allowed ``end_ms - chunk_start_ms`` for a chunk.

    Yields
    ------
    (start_ms, end_ms, text) for each chunk, where *text* is the cue texts
    joined by single spaces.

    A cue that would push the open chunk past the limit closes that chunk
    and becomes the first cue of the next one, so no cue is ever dropped.
    A single cue that is longer than the limit on its own is therefore
    emitted as its own (over-long) chunk.
    """
    parts = []  # texts of the open chunk; emptiness means "no open chunk"
    start = end = 0
    for cue_start, cue_end, text in cues:
        if parts and cue_end - start > limit_ms:
            yield start, end, ' '.join(parts)
            parts = []
        if not parts:
            start = cue_start
        parts.append(text)
        end = cue_end
    if parts:
        yield start, end, ' '.join(parts)


def ryCreateDataset(fnBase, srcDir=srcDir, timeLimit=20):
    """Split one recording into subtitle-aligned clips plus a metadata CSV.

    Parameters
    ----------
    fnBase : path or name of the recording; the directory part and a
        trailing ``.mp4`` / ``.mp3`` are stripped to get the base name.
    srcDir : directory holding ``<base>.mp3`` (or ``<base>.mp4``) and the
        matching ``.srt`` file.
    timeLimit : maximum clip length in seconds.

    Side effects
    ------------
    * If ``<srcDir>/<base>.mp3`` is missing, it is created from
      ``<srcDir>/<base>.mp4`` by running ``ffmpeg``.
    * Clips are exported to ``<tgtDir>/<base>/data/<base>_NNNN.mp3`` and
      listed in ``<tgtDir>/<base>/metadata.csv`` (``tgtDir`` is the
      module-level setting).

    Raises
    ------
    subprocess.CalledProcessError
        If the ffmpeg conversion exits with a non-zero status.
    """
    fnBase = os.path.basename(fnBase).removesuffix('.mp4').removesuffix('.mp3')

    fn_srt = _find_srt(fnBase, srcDir)
    fn_mp3 = f"{srcDir}/{fnBase}.mp3"
    fn_mp4 = f"{srcDir}/{fnBase}.mp4"

    if not os.path.isfile(fn_mp3):
        # Argument list instead of a shell string: file names containing
        # quotes or shell metacharacters are passed through untouched, and
        # a failed conversion is reported instead of being ignored.
        subprocess.run(['ffmpeg', '-i', fn_mp4, fn_mp3], check=True)

    mp3 = pydub.AudioSegment.from_mp3(fn_mp3)
    srt = pysrt.open(fn_srt)

    outDir = f'{tgtDir}/{fnBase}'
    os.makedirs(f'{outDir}/data', exist_ok=True)

    limit_ms = 1000 * timeLimit  # cue times are compared in milliseconds

    # The text is not normalized; only the line breaks inside a cue are
    # replaced by spaces.
    cues = (
        (s.start.ordinal, s.end.ordinal, s.text.replace('\n', ' '))
        for s in srt
    )

    with open(f'{outDir}/metadata.csv', 'w', encoding='utf8',
              newline='') as fp:
        # csv.writer quotes/escapes commas and double quotes that may occur
        # in the transcription, keeping every row at exactly two fields.
        writer = csv.writer(fp, lineterminator='\n')
        writer.writerow(['file_name', 'transcription'])
        for k, (start, end, text) in enumerate(_group_cues(cues, limit_ms)):
            fn = f'{fnBase}_{k:04d}.mp3'
            mp3[start:end].export(f'{outDir}/data/{fn}')
            writer.writerow([f'data/{fn}', text])


cL = glob.glob(f'{srcDir}/*.mp3')
for c in cL:
    print(c)
    ryCreateDataset(c, srcDir)