thomas0104 committed on
Commit
eab92fe
1 Parent(s): d45c6fc

Upload 2 files

Browse files

上傳建立Dataset的code

Files changed (2) hide show
  1. load.py +3 -0
  2. ryCreateDataset03_mp3_metadata_csv.py +114 -0
load.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# Build a dataset from the local "shortDir_20/" folder using the
# "audiofolder" loader, then publish it to the Hugging Face Hub.
from datasets import load_dataset

dataset = load_dataset(
    "audiofolder",
    data_dir="shortDir_20/",
)
dataset.push_to_hub("thomas0104/nan_tw_so_short_20s")
ryCreateDataset03_mp3_metadata_csv.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Tue Nov 22 18:32:21 2022
4
+
5
+ @author: renyu
6
+ """
7
+ #
8
+ # cutMp3bySrt.py
9
+
10
+ import pysrt
11
+ import pandas as pd
12
+ import re
13
+
14
+ import shutil
15
+ import pysrt
16
+ import ffmpeg
17
+ import pydub
18
+ import os, sys, glob, pathlib
19
+
20
# Directory that is scanned for the source .mp3 files (see the glob at the
# end of this script) and their .srt subtitle files.
srcDir = 'shortDir'
# Directory that receives one sub-folder per source file, each holding the
# cut clips and a metadata.csv.
tgtDir = 'shortDir_20'

# Make sure both directories exist before anything reads from or writes to them.
for _directory in (srcDir, tgtDir):
    os.makedirs(_directory, exist_ok=True)
25
+
26
def _merge_cues(cues, limit_ms):
    """Group consecutive subtitle cues into segments of at most limit_ms.

    Args:
        cues: iterable of ``(start_ms, end_ms, text)`` tuples in playback order.
        limit_ms: maximum allowed span, in milliseconds, from the start of the
            first cue of a segment to the end of its last cue.

    Yields:
        ``(start_ms, end_ms, text)`` for every merged segment, where ``text``
        is the cue texts joined by single spaces.
    """
    seg_start = None
    seg_end = None
    parts = []
    for start, end, text in cues:
        # Close the running segment when this cue would push it past the
        # limit; the cue itself then opens the next segment instead of
        # being discarded.
        if parts and end - seg_start > limit_ms:
            yield seg_start, seg_end, ' '.join(parts)
            parts = []
        if not parts:
            if end - start > limit_ms:
                # A single cue longer than the limit cannot be cut at a
                # subtitle boundary; skip it so every clip respects the limit.
                continue
            seg_start = start
        seg_end = end
        parts.append(text)
    if parts:
        yield seg_start, seg_end, ' '.join(parts)


def ryCreateDataset(fnBase, srcDir= srcDir, timeLimit= 20):
    """Cut one recording into short clips and write a metadata.csv for them.

    The subtitle file is looked up in ``srcDir`` as ``<base>.zh-TW.srt``,
    ``<base>.zh-CN.srt``, ``<base>.zh-Hans.srt`` and finally ``<base>.srt``.
    Consecutive subtitles are merged into segments spanning at most
    ``timeLimit`` seconds; each segment is exported to
    ``<tgtDir>/<base>/data/<base>_NNNN.mp3`` and listed in
    ``<tgtDir>/<base>/metadata.csv`` with columns ``file_name`` and
    ``transcription``.

    Args:
        fnBase: path or file name of the recording; the directory part and a
            trailing ``.mp4`` / ``.mp3`` are stripped to obtain the base name.
        srcDir: directory containing the ``.mp3`` (or ``.mp4``) and ``.srt``.
        timeLimit: maximum clip length in seconds.

    Raises:
        subprocess.CalledProcessError: if the ``.mp3`` is missing and the
            ffmpeg conversion from ``.mp4`` fails.
    """
    # Function-scope imports so this block does not depend on edits to the
    # file's import section.
    import csv
    import subprocess

    base = os.path.basename(fnBase).removesuffix('.mp4').removesuffix('.mp3')

    # Prefer a language-tagged subtitle file, falling back to the plain one.
    fn_plain = f"{srcDir}/{base}.srt"
    tagged = (f"{srcDir}/{base}.{tag}.srt"
              for tag in ('zh-TW', 'zh-CN', 'zh-Hans'))
    fn_srt = next((p for p in tagged if os.path.isfile(p)), fn_plain)
    if 'Combine' in fn_srt:
        fn_srt = fn_plain

    fn_mp3 = f"{srcDir}/{base}.mp3"
    fn_mp4 = f"{srcDir}/{base}.mp4"

    if not os.path.isfile(fn_mp3):
        # Argument list instead of a shell string: file names containing
        # quotes or shell metacharacters are passed through untouched, and a
        # failed conversion raises here rather than surfacing later.
        subprocess.run(['ffmpeg', '-i', fn_mp4, fn_mp3], check=True)

    mp3 = pydub.AudioSegment.from_mp3(fn_mp3)
    srt = pysrt.open(fn_srt)

    out_dir = f'{tgtDir}/{base}'
    os.makedirs(f'{out_dir}/data', exist_ok=True)

    # The text is not normalized; line breaks inside a subtitle are merely
    # replaced by spaces.
    cues = (
        (s.start.ordinal, s.end.ordinal, s.text.replace('\n', ' ').strip())
        for s in srt
    )

    limit_ms = 1000 * timeLimit  # timeLimit is given in seconds

    with open(f'{out_dir}/metadata.csv', 'w',
              encoding='utf8', newline='') as fp:
        # csv.writer quotes and escapes fields correctly, so transcriptions
        # containing commas or double quotes no longer corrupt the file.
        writer = csv.writer(fp, lineterminator='\n')
        writer.writerow(['file_name', 'transcription'])

        for k, (t0, t1, text) in enumerate(_merge_cues(cues, limit_ms)):
            fn = f'{base}_{k:04d}.mp3'
            mp3[t0:t1].export(f'{out_dir}/data/{fn}')
            writer.writerow([f'data/{fn}', text])
109
+
110
+
111
# Process every .mp3 found directly inside srcDir, printing each path
# before it is handed to ryCreateDataset.
cL = glob.glob(f'{srcDir}/*.mp3')
for mp3_path in cL:
    print(mp3_path)
    ryCreateDataset(mp3_path, srcDir)