# Source: whisper_medium_nan_tw / ryCreateDataset03_mp3_metadata_csv.py (uploaded by thomas0104, commit eab92fe "Upload 2 files", 2.93 kB)
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 22 18:32:21 2022
@author: renyu
"""
#
# cutMp3bySrt.py
import csv
import glob
import os
import pathlib
import re
import shutil
import subprocess
import sys

import ffmpeg
import pandas as pd
import pydub
import pysrt
# Folder that holds the source recordings and their .srt subtitle files.
srcDir = 'shortDir'
# Folder that receives one dataset sub-folder per processed recording.
tgtDir = 'shortDir_20'

# Make sure both folders exist before anything reads from or writes to them.
for _folder in (srcDir, tgtDir):
    os.makedirs(_folder, exist_ok=True)
def _export_clip(mp3, writer, out_dir, name, index, start_ms, end_ms, texts):
    """Export ``mp3[start_ms:end_ms]`` as clip number *index* and add its CSV row."""
    fn = f'{name}_{index:04d}.mp3'
    mp3[start_ms:end_ms].export(f'{out_dir}/data/{fn}')
    # The path is written relative to the folder that contains metadata.csv.
    writer.writerow([f'data/{fn}', ' '.join(texts)])


def ryCreateDataset(fnBase, srcDir=srcDir, timeLimit=20):
    """Cut one recording into subtitle-aligned mp3 clips plus a metadata.csv.

    Consecutive subtitles are merged into one clip for as long as the span
    from the start of the clip's first subtitle to the end of its last one
    does not exceed ``timeLimit`` seconds.  When the next subtitle would push
    the clip over the limit, the clip is written out and that subtitle opens
    the next clip.  A single subtitle that is longer than the limit on its
    own is exported alone, so such a clip can exceed ``timeLimit``.

    Output goes below the module-level ``tgtDir``:

    * ``{tgtDir}/{name}/data/{name}_NNNN.mp3`` -- the audio clips
    * ``{tgtDir}/{name}/metadata.csv`` -- columns ``file_name,transcription``;
      ``file_name`` is ``data/{name}_NNNN.mp3`` and ``transcription`` is the
      text of the clip's subtitles joined by single spaces.

    Args:
        fnBase: Path or file name of the recording.  The directory part and
            a trailing ``.mp4`` / ``.mp3`` are stripped to obtain ``name``.
        srcDir: Folder holding ``{name}.mp3`` (or ``{name}.mp4``) and the
            subtitle file.
        timeLimit: Target maximum clip length in seconds.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status
            while converting ``{name}.mp4`` to mp3.
    """
    name = os.path.basename(fnBase).removesuffix('.mp4').removesuffix('.mp3')

    # Prefer a language-tagged subtitle file (in this order); fall back to
    # the untagged "{name}.srt" when none of them exists.
    plain_srt = f"{srcDir}/{name}.srt"
    fn_srt = plain_srt
    for tag in ('zh-TW', 'zh-CN', 'zh-Hans'):
        candidate = f"{srcDir}/{name}.{tag}.srt"
        if os.path.isfile(candidate):
            fn_srt = candidate
            break
    # Paths containing 'Combine' always use the untagged subtitle file.
    if 'Combine' in fn_srt:
        fn_srt = plain_srt

    fn_mp3 = f"{srcDir}/{name}.mp3"
    fn_mp4 = f"{srcDir}/{name}.mp4"

    if not os.path.isfile(fn_mp3):
        # Argument list instead of a shell string: file names with quotes or
        # other shell metacharacters are passed through untouched, and a
        # failed conversion is reported here rather than ignored.
        subprocess.run(['ffmpeg', '-i', fn_mp4, fn_mp3], check=True)

    mp3 = pydub.AudioSegment.from_mp3(fn_mp3)
    srt = pysrt.open(fn_srt)

    out_dir = f'{tgtDir}/{name}'
    os.makedirs(f'{out_dir}/data', exist_ok=True)  # also creates out_dir

    limit_ms = 1000 * timeLimit  # timeLimit is in seconds

    # newline='' lets the csv module control line endings itself.
    with open(f'{out_dir}/metadata.csv', 'w',
              encoding='utf8', newline='') as fp:
        # csv.writer quotes/escapes commas and double quotes in the text.
        writer = csv.writer(fp, lineterminator='\n')
        writer.writerow(['file_name', 'transcription'])

        clip_no = 0
        clip_start = None  # None means "no clip is currently open"
        clip_end = None
        clip_texts = []

        for sub in srt:
            start_ms = sub.start.ordinal
            end_ms = sub.end.ordinal

            # Adding this subtitle would overflow the open clip: write the
            # clip out first, then let this subtitle start the next one.
            if clip_start is not None and end_ms - clip_start > limit_ms:
                _export_clip(mp3, writer, out_dir, name, clip_no,
                             clip_start, clip_end, clip_texts)
                clip_no += 1
                clip_start = None

            if clip_start is None:
                clip_start = start_ms
                clip_texts = []

            clip_end = end_ms
            # The text is not normalized; only the line breaks of the
            # original srt are replaced by spaces.
            clip_texts.append(sub.text.replace('\n', ' '))

        # Write out whatever is still open after the last subtitle.
        if clip_start is not None:
            _export_clip(mp3, writer, out_dir, name, clip_no,
                         clip_start, clip_end, clip_texts)
# Build one dataset folder for every mp3 found directly inside the source folder.
for mp3_path in glob.glob(f'{srcDir}/*.mp3'):
    print(mp3_path)
    ryCreateDataset(mp3_path, srcDir)