thomas0104
committed on
Commit
•
eab92fe
1
Parent(s):
d45c6fc
Upload 2 files
Browse files · 上傳建立Dataset的code (upload the code that builds the dataset)
- load.py +3 -0
- ryCreateDataset03_mp3_metadata_csv.py +114 -0
load.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from datasets import load_dataset
|
2 |
+
# Build a dataset from the clip folders under shortDir_20/ using the
# "audiofolder" loader, then publish it to the Hub repository below.
DATA_DIR = "shortDir_20/"
REPO_ID = "thomas0104/nan_tw_so_short_20s"

dataset = load_dataset("audiofolder", data_dir=DATA_DIR)
dataset.push_to_hub(REPO_ID)
|
ryCreateDataset03_mp3_metadata_csv.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""
|
3 |
+
Created on Tue Nov 22 18:32:21 2022
|
4 |
+
|
5 |
+
@author: renyu
|
6 |
+
"""
|
7 |
+
#
|
8 |
+
# cutMp3bySrt.py
|
9 |
+
|
10 |
+
import pysrt
|
11 |
+
import pandas as pd
|
12 |
+
import re
|
13 |
+
|
14 |
+
import shutil
|
15 |
+
import pysrt
|
16 |
+
import ffmpeg
|
17 |
+
import pydub
|
18 |
+
import os, sys, glob, pathlib
|
19 |
+
|
20 |
+
# Input folder (source mp3/mp4 + srt files) and output folder (generated clips).
srcDir, tgtDir = 'shortDir', 'shortDir_20'

# Make sure both folders exist before anything reads from or writes to them.
for _folder in (srcDir, tgtDir):
    os.makedirs(_folder, exist_ok=True)
|
25 |
+
|
26 |
+
def _group_cues(cues, limit_ms):
    """Greedily pack consecutive subtitle cues into clips of at most *limit_ms*.

    *cues* is an iterable of ``(start_ms, end_ms, text)`` tuples in playback
    order.  Yields ``(clip_start_ms, clip_end_ms, joined_text)`` tuples where
    the texts of the cues in a clip are joined with single spaces.

    A cue that would push the open clip past the limit closes that clip and
    becomes the first cue of the next one, so no cue is lost at a clip
    boundary.  A single cue that is by itself longer than the limit cannot fit
    in any clip and is skipped.
    """
    clip_start = None  # None means "no clip is open" (0 is a valid start time)
    clip_end = None
    texts = []

    for cue_start, cue_end, cue_text in cues:
        if clip_start is not None and cue_end - clip_start > limit_ms:
            # Adding this cue would overflow: emit what we have so far.
            yield clip_start, clip_end, ' '.join(texts)
            clip_start, texts = None, []

        if clip_start is None:
            if cue_end - cue_start > limit_ms:
                continue  # this cue alone exceeds the limit; leave it out
            clip_start = cue_start

        clip_end = cue_end
        texts.append(cue_text)

    if clip_start is not None:
        yield clip_start, clip_end, ' '.join(texts)


def ryCreateDataset(fnBase, srcDir=srcDir, timeLimit=20):
    """Cut one recording into short clips and write a matching metadata.csv.

    Parameters
    ----------
    fnBase : path or bare name of the recording; any directory part and a
        trailing ``.mp4`` / ``.mp3`` extension are stripped.
    srcDir : folder that holds ``<name>.mp3`` (or ``<name>.mp4``) and the
        subtitle file.
    timeLimit : maximum clip length in seconds.

    Output goes to ``<tgtDir>/<name>/``: clips are exported as
    ``data/<name>_NNNN.mp3`` and ``metadata.csv`` gets one
    ``file_name,transcription`` row per clip.  ``tgtDir`` is read from module
    scope.

    The subtitle file is looked up as ``.zh-TW.srt``, ``.zh-CN.srt``,
    ``.zh-Hans.srt`` and finally plain ``.srt``; when the path contains
    ``'Combine'`` the plain ``.srt`` is always used.  If the mp3 is missing it
    is first produced from the mp4 with ``ffmpeg``.

    Raises ``subprocess.CalledProcessError`` if that ffmpeg conversion fails.
    """
    import csv
    import subprocess

    fnBase = os.path.basename(fnBase).removesuffix('.mp4').removesuffix('.mp3')

    # Pick the subtitle file: language-tagged variants first, plain .srt last.
    fn_srt = f"{srcDir}/{fnBase}.srt"
    if 'Combine' not in fn_srt:
        for tag in ('zh-TW', 'zh-CN', 'zh-Hans'):
            candidate = f"{srcDir}/{fnBase}.{tag}.srt"
            if os.path.isfile(candidate):
                fn_srt = candidate
                break

    fn_mp3 = f"{srcDir}/{fnBase}.mp3"
    fn_mp4 = f"{srcDir}/{fnBase}.mp4"

    if not os.path.isfile(fn_mp3):
        # Argument list instead of a shell string: file names containing
        # quotes or other shell metacharacters are passed through verbatim.
        subprocess.run(['ffmpeg', '-i', fn_mp4, fn_mp3], check=True)

    mp3 = pydub.AudioSegment.from_mp3(fn_mp3)
    srt = pysrt.open(fn_srt)

    out_dir = f'{tgtDir}/{fnBase}'
    os.makedirs(f'{out_dir}/data', exist_ok=True)

    # The text is not normalized; line breaks inside a subtitle are merely
    # replaced by spaces.
    cues = (
        (s.start.ordinal, s.end.ordinal, s.text.replace('\n', ' '))
        for s in srt
    )

    # newline='' lets the csv module control line endings; csv.writer also
    # quotes/escapes commas and double quotes that occur in the subtitles.
    with open(f'{out_dir}/metadata.csv', 'w',
              encoding='utf8', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['file_name', 'transcription'])

        clips = _group_cues(cues, 1000 * timeLimit)  # limit in milliseconds
        for k, (start, end, text) in enumerate(clips):
            fn = f'{fnBase}_{k:04d}.mp3'
            mp3[start:end].export(f'{out_dir}/data/{fn}')
            writer.writerow([f'data/{fn}', text])
|
109 |
+
|
110 |
+
|
111 |
+
# Process every mp3 found directly inside the source folder, echoing each
# path before it is cut into clips.
for mp3_path in glob.glob(f'{srcDir}/*.mp3'):
    print(mp3_path)
    ryCreateDataset(mp3_path, srcDir)
|