Fix bugs

Files changed:
- add_subtitles_to_video.py (+78 -47)
- translate_transcriptions.py (+3 -2)

add_subtitles_to_video.py
CHANGED
@@ -8,15 +8,25 @@ import os
 DEBUG = False
 
 COLOR_BLUE = (255, 0, 0)
+BACKGROUND_FOR_COLOR_BLUE = (255, 255, 0)
 COLOR_GREEN = (0, 255, 0)
+BACKGROUND_FOR_COLOR_GREEN = (255, 0, 255)
 COLOR_RED = (0, 0, 255)
+BACKGROUND_FOR_COLOR_RED = (255, 255, 0)
 COLOR_YELLOW = (0, 255, 255)
+BACKGROUND_FOR_COLOR_YELLOW = (255, 0, 0)
 COLOR_WHITE = (255, 255, 255)
+BACKGROUND_FOR_COLOR_WHITE = (128, 128, 128)
 COLOR_BLACK = (0, 0, 0)
+BACKGROUND_FOR_COLOR_BLACK = (128, 128, 128)
-COLOR_BROWN = …
+COLOR_BROWN = (202, 221, 234)
+BACKGROUND_FOR_COLOR_BROWN = (234, 215, 202)
 COLOR_MAGENTA = (255, 0, 255)
+BACKGROUND_FOR_COLOR_MAGENTA = (0, 255, 0)
 COLOR_ORANGE = (0, 165, 255)
+BACKGROUND_FOR_COLOR_ORANGE = (255, 90, 0)
 COLOR_PURPLE = (128, 0, 128)
+BACKGROUND_FOR_COLOR_PURPLE = (127, 255, 127)
 COLOR_GRAY = (128, 128, 128)
 
 def replace_characters_that_opencv_cant_show(text):
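A note on the values: OpenCV expects colors in BGR channel order, which is why COLOR_RED is (0, 0, 255) and COLOR_ORANGE is (0, 165, 255). A minimal, self-contained sketch to sanity-check any of the pairs above (the output file name is illustrative):

```python
import cv2
import numpy as np

# OpenCV uses BGR channel order, so (0, 0, 255) is pure red, not blue.
COLOR_RED = (0, 0, 255)
BACKGROUND_FOR_COLOR_RED = (255, 255, 0)  # cyan in BGR

swatch = np.zeros((50, 100, 3), dtype=np.uint8)
swatch[:, :50] = COLOR_RED                 # left half: the text color
swatch[:, 50:] = BACKGROUND_FOR_COLOR_RED  # right half: its background
cv2.imwrite("swatch.png", swatch)          # inspect the result by eye
```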
@@ -25,13 +35,13 @@ def replace_characters_that_opencv_cant_show(text):
     text = text.replace("í", "i")
     text = text.replace("ó", "o")
     text = text.replace("ú", "u")
-    text = text.replace("ñ", "…")
+    text = text.replace("ñ", "nh")
     text = text.replace("Á", "A")
     text = text.replace("É", "E")
     text = text.replace("Í", "I")
     text = text.replace("Ó", "O")
     text = text.replace("Ú", "U")
-    text = text.replace("Ñ", "…")
+    text = text.replace("Ñ", "NH")
     text = text.replace("\n", "")
     text = text.replace("¿", "?")
     text = text.replace("¡", "!")
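The chained replace() calls work, but the same mapping can be expressed as a single translation table. A sketch of that alternative, assuming the á/é replacements that sit just above the visible hunk; str.maketrans and str.translate are standard Python and allow multi-character targets like "nh":

```python
# One translation table instead of chained replace() calls.
# Mapping a character to None deletes it (used here for newlines).
OPENCV_SAFE = str.maketrans({
    "á": "a", "é": "e", "í": "i", "ó": "o", "ú": "u",
    "Á": "A", "É": "E", "Í": "I", "Ó": "O", "Ú": "U",
    "ñ": "nh", "Ñ": "NH", "¿": "?", "¡": "!", "\n": None,
})

def replace_characters_that_opencv_cant_show(text):
    return text.translate(OPENCV_SAFE)
```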
@@ -47,30 +57,40 @@ def remove_speaker_text(text):
         text = text[prefix_len:]  # Remove the matched text from the beginning
     return text, speaker
 
-def get_filter_text_and_speaker(text, color):
+def get_filter_text_and_speaker(text, color, background):
     text, speaker = remove_speaker_text(text)
     if speaker is not None:
         if speaker == 0:
             color = COLOR_GREEN
+            background = BACKGROUND_FOR_COLOR_GREEN
         elif speaker == 1:
             color = COLOR_BLUE
+            background = BACKGROUND_FOR_COLOR_BLUE
         elif speaker == 2:
             color = COLOR_RED
+            background = BACKGROUND_FOR_COLOR_RED
         elif speaker == 3:
             color = COLOR_YELLOW
+            background = BACKGROUND_FOR_COLOR_YELLOW
         elif speaker == 4:
             color = COLOR_WHITE
+            background = BACKGROUND_FOR_COLOR_WHITE
         elif speaker == 5:
             color = COLOR_BLACK
+            background = BACKGROUND_FOR_COLOR_BLACK
         elif speaker == 6:
             color = COLOR_BROWN
+            background = BACKGROUND_FOR_COLOR_BROWN
         elif speaker == 7:
             color = COLOR_MAGENTA
+            background = BACKGROUND_FOR_COLOR_MAGENTA
         elif speaker == 8:
             color = COLOR_ORANGE
+            background = BACKGROUND_FOR_COLOR_ORANGE
         elif speaker == 9:
             color = COLOR_PURPLE
+            background = BACKGROUND_FOR_COLOR_PURPLE
-    return text, color
+    return text, color, background
 
 def create_dict_of_transcription(transcription_file):
     transcription_dict = {}
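The ten-branch if/elif ladder above pairs each speaker index with a color and, after this change, a background. A table-driven sketch of the same mapping, using the module's own constants (the (color, background) tuple order in the table is an assumption):

```python
# Sketch: table-driven speaker palette equivalent to the if/elif ladder.
# Keys are speaker indices; values are assumed (color, background) pairs.
SPEAKER_PALETTE = {
    0: (COLOR_GREEN, BACKGROUND_FOR_COLOR_GREEN),
    1: (COLOR_BLUE, BACKGROUND_FOR_COLOR_BLUE),
    2: (COLOR_RED, BACKGROUND_FOR_COLOR_RED),
    3: (COLOR_YELLOW, BACKGROUND_FOR_COLOR_YELLOW),
    4: (COLOR_WHITE, BACKGROUND_FOR_COLOR_WHITE),
    5: (COLOR_BLACK, BACKGROUND_FOR_COLOR_BLACK),
    6: (COLOR_BROWN, BACKGROUND_FOR_COLOR_BROWN),
    7: (COLOR_MAGENTA, BACKGROUND_FOR_COLOR_MAGENTA),
    8: (COLOR_ORANGE, BACKGROUND_FOR_COLOR_ORANGE),
    9: (COLOR_PURPLE, BACKGROUND_FOR_COLOR_PURPLE),
}

def get_filter_text_and_speaker(text, color, background):
    text, speaker = remove_speaker_text(text)
    if speaker in SPEAKER_PALETTE:
        color, background = SPEAKER_PALETTE[speaker]
    return text, color, background
```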
@@ -85,7 +105,7 @@ def create_dict_of_transcription(transcription_file):
             # Get start time (dd:dd:dd,ddd) and end time (dd:dd:dd,ddd)
             start, end = line.split(" --> ")
             # Add key to dictionary
-            transcription_dict[start] = ""
+            transcription_dict[start] = {"transcription": "", "end": end}
 
         # if line is a number and carriage continue
         elif re.match(r"\d+$", line):
@@ -99,11 +119,13 @@ def create_dict_of_transcription(transcription_file):
         else:
             # Remove characters that opencv can't show
             line = replace_characters_that_opencv_cant_show(line)
-            transcription_dict[start] += f"{line}\n"
+            transcription_dict[start]["transcription"] += f"{line}\n"
 
     return transcription_dict
 
 def hour_minute_seconds_miliseconds_to_seconds(time):
+    if time is None:
+        return None
     hours, minutes, seconds_miliseconds = time.split(":")
     seconds, miliseconds = seconds_miliseconds.split(",")
     seconds = int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(miliseconds) / 1000
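After these two hunks each start timestamp maps to a dict carrying both the cue text and its end time, and the converter tolerates a missing timestamp. As a worked example, "00:01:02,500" converts to 0*3600 + 1*60 + 2 + 500/1000 = 62.5 seconds. A condensed, self-contained sketch (the sample cue is invented for illustration, and the function body is compressed to a direct return):

```python
# Invented SRT cue, only to illustrate the new dictionary shape:
#
#   1
#   00:01:02,500 --> 00:01:05,000
#   Hello there
#
# create_dict_of_transcription now yields roughly:
transcription_dict = {
    "00:01:02,500": {"transcription": "Hello there\n", "end": "00:01:05,000"},
}

def hour_minute_seconds_miliseconds_to_seconds(time):
    if time is None:
        return None
    hours, minutes, seconds_miliseconds = time.split(":")
    seconds, miliseconds = seconds_miliseconds.split(",")
    return int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(miliseconds) / 1000

# 0*3600 + 1*60 + 2 + 500/1000 = 62.5
assert hour_minute_seconds_miliseconds_to_seconds("00:01:02,500") == 62.5
```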
@@ -131,9 +153,12 @@ def search_transcription_in_dict_of_transcription(transcription_dict, seconds):
         key_seconds = hour_minute_seconds_miliseconds_to_seconds(key_hmsms)
         next_key_seconds = hour_minute_seconds_miliseconds_to_seconds(next_key_hmsms)
         if key_seconds <= seconds and seconds < next_key_seconds:
-            return transcription_dict[key_hmsms]
+            transcription = transcription_dict[key_hmsms]["transcription"]
+            end_time = transcription_dict[key_hmsms]["end"]
+            return transcription, end_time
         else:
             continue
+    return None, None
 
 def get_length_of_cv2_text(text, fontFace, fontScale, thickness):
     text_size, _ = cv2.getTextSize(text, fontFace, fontScale, thickness)
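Callers now unpack two values, and the explicit (None, None) return keeps that unpacking safe when no cue covers the queried time. A usage sketch, assuming a transcription_dict built by create_dict_of_transcription:

```python
# The search now returns a pair; (None, None) makes unpacking safe
# when the current time falls between cues.
text, end_time = search_transcription_in_dict_of_transcription(transcription_dict, 62.7)
if text is not None:
    print(f"show {text!r} until {end_time}")
```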
@@ -167,7 +192,8 @@ def add_subtitles_to_video(transcription_dict, input_video_file):
     fontFace = cv2.FONT_HERSHEY_DUPLEX
     fontScale = 1
     thickness = 2
-    color = …
+    color = COLOR_WHITE
+    background = COLOR_GRAY
     lineType = cv2.LINE_AA
     bottomLeftOrigin = False
 
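For context on the new color/background pair: cv2.putText has no background option, so the loop below always draws a filled rectangle first and the text on top of it. A minimal self-contained sketch of that pattern (frame size and coordinates are illustrative):

```python
import cv2
import numpy as np

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a video frame
text = "Hello"
fontFace, fontScale, thickness = cv2.FONT_HERSHEY_DUPLEX, 1, 2
color, background = (255, 255, 255), (128, 128, 128)  # white on gray, per the new defaults

text_position = (50, 430)
(w, h), _ = cv2.getTextSize(text, fontFace, fontScale, thickness)
# Filled rectangle first (thickness -1), then the text over it.
cv2.rectangle(frame, (40, 430 - h - 10), (50 + w + 10, 440), background, -1, cv2.LINE_AA)
cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness,
            cv2.LINE_AA, False)
```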
@@ -180,53 +206,58 @@
 
         # Add the text to the frame
         current_time = captured_video.get(cv2.CAP_PROP_POS_MSEC) / 1000
-        text = search_transcription_in_dict_of_transcription(transcription_dict, current_time)
+        text, end_time = search_transcription_in_dict_of_transcription(transcription_dict, current_time)
         if text is not None:
-            if text[-1] == "\n":
-                text = text[:-1]
-            if text[-1] == " ":
-                text = text[:-1]
+            if len(text) > 0:
+                if text[-1] == "\n":
+                    text = text[:-1]
+                if text[-1] == " ":
+                    text = text[:-1]
             if old_text != text:
                 old_text = text
                 text_length = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
-            … (11 removed lines not recovered) …
+            current_time = captured_video.get(cv2.CAP_PROP_POS_MSEC) / 1000
+            end_time_seconds = hour_minute_seconds_miliseconds_to_seconds(end_time)
+            if current_time is not None and end_time_seconds is not None:
+                if current_time <= end_time_seconds:
+                    if text_length > captured_video_width:
+                        necesary_rows = int(text_length // (captured_video_width-300)+1)
+                        words = text.split(" ")
+                        number_of_words = len(words)
+                        words_per_row = int(number_of_words // necesary_rows)
+                        text = ""
+                        text_position = (50, int(captured_video_height)-50*(necesary_rows+1))
+                        rectangle_point1 = (40, text_position[1]-30)
+                        for i in range(number_of_words):
+                            if i % words_per_row == 0 and i != 0:
+                                text, color, background = get_filter_text_and_speaker(text, color, background)
+                                length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
+                                if length_of_text > 10:
+                                    rectangle_point2 = (length_of_text+50, text_position[1]+10)
+                                cv2.rectangle(frame, rectangle_point1, rectangle_point2, background, -1, cv2.LINE_AA, 0)
+                                cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)
+                                text = ""
+                                text_position = (50, text_position[1]+50)
+                                rectangle_point1 = (40, text_position[1]-30)
+                            text += words[i] + " "
+                        # Add the last words
+                        text, color, background = get_filter_text_and_speaker(text, color, background)
                         length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
                         if length_of_text > 10:
                             rectangle_point2 = (length_of_text+50, text_position[1]+10)
-                        cv2.rectangle(frame, rectangle_point1, rectangle_point2, …
+                        cv2.rectangle(frame, rectangle_point1, rectangle_point2, background, -1, cv2.LINE_AA, 0)
                         cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)
-
-                        text_position = (50, …
+                    else:
+                        text_position = (50, int(captured_video_height)-50)
                         rectangle_point1 = (40, text_position[1]-30)
-                        … (8 removed lines not recovered) …
-                        else:
-                            text_position = (50, int(captured_video_height)-50)
-                            rectangle_point1 = (40, text_position[1]-30)
-                            rectangle_point2 = (int(captured_video_width)-50, text_position[1]+10)
-                            if text is not None:
-                                text, color = get_filter_text_and_speaker(text, color)
-                                length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
-                                if length_of_text > 10:
-                                    rectangle_point2 = (length_of_text+50, text_position[1]+10)
-                                cv2.rectangle(frame, rectangle_point1, rectangle_point2, COLOR_GRAY, -1, cv2.LINE_AA, 0)
-                                cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)
+                        rectangle_point2 = (int(captured_video_width)-50, text_position[1]+10)
+                        if text is not None:
+                            text, color, background = get_filter_text_and_speaker(text, color, background)
+                            length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
+                            if length_of_text > 10:
+                                rectangle_point2 = (length_of_text+50, text_position[1]+10)
+                            cv2.rectangle(frame, rectangle_point1, rectangle_point2, background, -1, cv2.LINE_AA, 0)
+                            cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)
 
         # Update the progress bar
         progress_bar.update(1)
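The row-splitting added above allocates a fixed number of words per row (number_of_words // necesary_rows), which can still overflow when word lengths are uneven. A width-aware sketch of the same idea that measures each candidate row with cv2.getTextSize; the helper name and parameters are hypothetical, not part of the file:

```python
import cv2

def wrap_text_to_width(text, max_width, fontFace=cv2.FONT_HERSHEY_DUPLEX,
                       fontScale=1, thickness=2):
    """Greedy word wrap: grow each row until the next word would overflow."""
    rows, row = [], ""
    for word in text.split(" "):
        candidate = f"{row} {word}".strip()
        width, _ = cv2.getTextSize(candidate, fontFace, fontScale, thickness)[0]
        if row and width > max_width:
            rows.append(row)  # current row is full; start a new one
            row = word
        else:
            row = candidate
    if row:
        rows.append(row)
    return rows
```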
translate_transcriptions.py
CHANGED

@@ -118,8 +118,9 @@ def main(transcription_file, source_languaje, target_languaje, translate_model,
             translated_transcription_time_stamps += f"{line}\n"
         else:
             if (i < len(translated_transcription_list)):
-                if translated_transcription_list[i][0] == " ":
-                    translated_transcription_list[i] = translated_transcription_list[i][1:]
+                if len(translated_transcription_list[i]) > 0:
+                    if translated_transcription_list[i][0] == " ":  # Remove space at the beginning
+                        translated_transcription_list[i] = translated_transcription_list[i][1:]
                 speaker = ""
                 if re.match(r"\[SPEAKER_\d\d\]:", line):
                     speaker = re.match(r"\[SPEAKER_\d\d\]:", line).group(0)
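The added length check fixes an IndexError on empty translated segments. For reference, str.lstrip is a no-op on empty strings, so a sketch without the guard (note it strips every leading space, not just the first one):

```python
translated_transcription_list = [" Hola", "", " Mundo"]  # sample values for illustration
for i, segment in enumerate(translated_transcription_list):
    translated_transcription_list[i] = segment.lstrip(" ")
# -> ["Hola", "", "Mundo"]; empty strings pass through without an IndexError
```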