Prudvireddy committed on
Commit e881d3d
1 Parent(s): 995fb34

Update tools.py

Files changed (1)
tools.py +126 -319
tools.py CHANGED
@@ -1,217 +1,114 @@
from langchain.tools import tool, Tool
import re
import os
from langchain_groq import ChatGroq
- import requests
- import cv2
- from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips
- from langchain.pydantic_v1 import BaseModel, Field
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from gtts import gTTS
from pydub import AudioSegment

- # from diffusers import StableDiffusionXLPipeline, DPMSolverSinglestepScheduler
- # import bitsandbytes as bnb
- # import torch.nn as nn
- # import torch
- # import pyttsx3
- # from agents import get_agents_and_tasks
- # from langchain_google_genai import ChatGoogleGenerativeAI
-
- # from langchain.chat_models import ChatOpenAI
- # # llm2 = ChatOpenAI(model='gpt-3.5-turbo')
- # # llm3 = ChatOpenAI(model='gpt-3.5-turbo')
- # llm1 = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=2048)
- # # llm2 = ChatGroq(model='mixtral-8x7b-32768', temperature=0.6, max_tokens=2048, api_key='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')
- # llm2 = ChatGoogleGenerativeAI(model='gemini-pro', temperature=0.0)
- # # llm2 = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=2048, api_key='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')
- # # llm3 = ChatGoogleGenerativeAI(model='gemini-pro')
- # llm4 = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=2048, api_key='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')
- # groq_api_key=os.environ.get('GROQ_API_KEY')
- # llm = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=1024, api_key=groq_api_key)
-
- # pipe = StableDiffusionXLPipeline.from_pretrained("sd-community/sdxl-flash", torch_dtype=torch.float16).to('cuda')
- # pipe.scheduler = DPMSolverSinglestepScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
-
- # def quantize_model_to_4bit(model):
- #     replacements = []
-
- #     # Collect layers to be replaced
- #     for name, module in model.named_modules():
- #         if isinstance(module, nn.Linear):
- #             replacements.append((name, module))
-
- #     # Replace layers
- #     for name, module in replacements:
- #         # Split the name to navigate to the parent module
- #         *path, last = name.split('.')
- #         parent = model
- #         for part in path:
- #             parent = getattr(parent, part)
-
- #         # Create and assign the quantized layer
- #         quantized_layer = bnb.nn.Linear4bit(module.in_features, module.out_features, bias=module.bias is not None)
- #         quantized_layer.weight.data = module.weight.data
- #         if module.bias is not None:
- #             quantized_layer.bias.data = module.bias.data
- #         setattr(parent, last, quantized_layer)
-
- #     return model

- # pipe.unet = quantize_model_to_4bit(pipe.unet)
- # pipe.enable_model_cpu_offload()


- # def generate_speech(text, speech_dir='./outputs/audio', lang='en', speed=170, voice='default', num=0):
- #     """
- #     Generates speech for given script.
- #     """
- #     engine = pyttsx3.init()
-
- #     # Set language and voice
- #     voices = engine.getProperty('voices')
- #     if voice == 'default':
- #         voice_id = voices[1].id
- #     else:
- #         # Try to find the voice with the given name
- #         voice_id = None
- #         for v in voices:
- #             if voice in v.name:
- #                 voice_id = v.id
- #                 break
- #         if not voice_id:
- #             raise ValueError(f"Voice '{voice}' not found.")
-
- #     engine.setProperty('voice', voice_id)
- #     engine.setProperty('rate', speed)
- #     # os.remove(os.path.join(os.path.dirname(os.path.abspath(__file__)), speech_dir, f'speech_{num}.mp3')) if os.path.exists(os.path.join(speech_dir, f'speech_{num}.mp3')) else None
- #     engine.save_to_file(text, os.path.join(os.path.dirname(os.path.abspath(__file__)), speech_dir, f'speech_{num}.mp3'))
- #     engine.runAndWait()

- def generate_speech(text, speech_dir='./outputs/speeches', lang='en', speed=1.0, num=0):
    """
    Generates speech for the given script using gTTS and adjusts the speed.
-     """
-     # Ensure the speech directory exists
-     if not os.path.exists(speech_dir):
-         os.makedirs(speech_dir)
-
-     # Generate speech
-     tts = gTTS(text=text, lang=lang)
-
-     # Save the speech to an MP3 file
-     speech_path = os.path.join(speech_dir, f'speech_{num}.mp3')
-     temp_path = os.path.join(speech_dir, f'temp_speech_{num}.mp3')
-     if os.path.exists(speech_path):
-         os.remove(speech_path) # Remove existing file if it exists
-
-     tts.save(temp_path)
-
-     # Adjust the speed of the speech
-     sound = AudioSegment.from_file(temp_path)
-     if speed != 1.0:
-         sound_with_altered_speed = sound._spawn(sound.raw_data, overrides={
-             "frame_rate": int(sound.frame_rate * speed)
-         }).set_frame_rate(sound.frame_rate)
-         sound_with_altered_speed.export(speech_path, format="mp3")
-     else:
-         sound.export(speech_path, format="mp3")
-
-     os.remove(temp_path) # Remove the temporary file
-     # print(f"Speech saved to {speech_path}")
-
- # Example usage
- # generate_speech("Hello, this is a test speech.", speed=1.2, num=1)
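Note on the removed speed adjustment: it re-spawned the segment at a raised frame rate and then reset the frame rate, which shortens playback but also raises pitch, whereas the replacement tool later in this diff uses pydub's speedup, which roughly preserves pitch. A minimal sketch of the two approaches side by side, assuming a local speech.mp3 exists (hypothetical path):

```python
from pydub import AudioSegment

sound = AudioSegment.from_file("speech.mp3")  # hypothetical input file

# Resampling trick used by the removed helper: faster playback, pitch shifts up.
resampled = sound._spawn(sound.raw_data, overrides={
    "frame_rate": int(sound.frame_rate * 1.2)
}).set_frame_rate(sound.frame_rate)

# pydub's built-in time-stretch used by the new tool: faster playback, pitch roughly preserved.
stretched = sound.speedup(playback_speed=1.2)

resampled.export("speech_resampled.mp3", format="mp3")
stretched.export("speech_speedup.mp3", format="mp3")
```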

- # class VideoGeneration(BaseModel):
- #     images_dir : str = Field(description='Path to images directory, such as "outputs/images"')
- #     speeches_dir : str = Field(description='Path to speeches directory, such as "outputs/speeches"')
-
- # @tool(args_schema=VideoGeneration)
- # def create_video_from_images_and_audio(images_dir, speeches_dir, zoom_factor=1.2):
- #     """Creates video using images and audios with zoom-in effect"""
- #     images_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), images_dir)
- #     speeches_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), speeches_dir)

- #     images_paths = os.listdir(images_dir)
- #     audio_paths = os.listdir(speeches_dir)
- #     # print(images_paths, audio_paths)
- #     clips = []
-
- #     for i in range(min(len(images_paths), len(audio_paths))):
- #         # Load the image
- #         img_clip = ImageClip(os.path.join(images_dir, images_paths[i]))
-
- #         # Load the audio file
- #         audioclip = AudioFileClip(os.path.join(speeches_dir, audio_paths[i]))
-
- #         # Set the duration of the video clip to the duration of the audio file
- #         videoclip = img_clip.set_duration(audioclip.duration)
-
- #         # Apply zoom-in effect to the video clip
- #         zoomed_clip = apply_zoom_in_effect(videoclip, zoom_factor)
-
- #         # Add audio to the zoomed video clip
- #         zoomed_clip = zoomed_clip.set_audio(audioclip)
-
- #         clips.append(zoomed_clip)
-
- #     # Concatenate all video clips
- #     final_clip = concatenate_videoclips(clips)

- #     # Write the result to a file
- #     final_clip.write_videofile(os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs/final_video/final_video.mp4"), codec='libx264', fps=24)

- #     return os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs/final_video/final_video.mp4")
-
- # def apply_zoom_in_effect(clip, zoom_factor=1.2):
- #     width, height = clip.size
- #     duration = clip.duration
-
- #     def zoom_in_effect(get_frame, t):
- #         frame = get_frame(t)
- #         zoom = 1 + (zoom_factor - 1) * (t / duration)
- #         new_width, new_height = int(width * zoom), int(height * zoom)
- #         resized_frame = cv2.resize(frame, (new_width, new_height))

- #         # Calculate the position to crop the frame to the original size
- #         x_start = (new_width - width) // 2
- #         y_start = (new_height - height) // 2
- #         cropped_frame = resized_frame[y_start:y_start + height, x_start:x_start + width]

- #         return cropped_frame
-
- #     return clip.fl(zoom_in_effect, apply_to=['mask'])
-
- # Example usage
- # image_paths = "outputs/images"
- # audio_paths = "outputs/audio"
-
- # video_path = create_video_from_images_and_audio(image_paths, audio_paths)
- # print(f"Video created at: {video_path}")
-
-
- # class ImageGeneration(BaseModel):
- #     text : str = Field(description='description of sentence used for image generation')
- #     num : int = Field(description='sequence of description passed this tool. Used in image saving path. Example 1,2,3,4,5 and so on')
-
- # class SpeechGeneration(BaseModel):
- #     text : str = Field(description='description of sentence used for image generation')
- #     num : int = Field(description='sequence of description passed this tool. Used in image saving path. Example 1,2,3,4,5 and so on')
-
- import os
- import cv2
- from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips, VideoFileClip
- from PIL import Image, ImageDraw, ImageFont
- import numpy as np
- from groq import Groq
-
-
-
- class VideoGeneration(BaseModel):
-     images_dir: str = Field(description='Path to images directory, such as "outputs/images"')
-     speeches_dir: str = Field(description='Path to speeches directory, such as "outputs/speeches"')

def split_text_into_chunks(text, chunk_size):
    words = text.split()
@@ -219,7 +116,7 @@ def split_text_into_chunks(text, chunk_size):

def add_text_to_video(input_video, output_video, text, duration=1, fontsize=40, fontcolor=(255, 255, 255),
                      outline_thickness=2, outline_color=(0, 0, 0), delay_between_chunks=0.1,
-                       font_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'Montserrat-Bold.ttf')):

    chunks = split_text_into_chunks(text, 3) # Adjust chunk size as needed
@@ -250,7 +147,8 @@ def add_text_to_video(input_video, output_video, text, duration=1, fontsize=40,

        if current_frame % (chunk_duration_frames + delay_frames) < chunk_duration_frames and chunk_index < len(chunks):
            chunk = chunks[chunk_index]
-             text_width, text_height = draw.textsize(chunk, font=font)
            text_x = (width - text_width) // 2
            text_y = height - 400 # Position text at the bottom
@@ -310,37 +208,45 @@ def apply_zoom_in_effect(clip, zoom_factor=1.2):

    return clip.fl(zoom_in_effect, apply_to=['mask'])

- @tool(args_schema=VideoGeneration)
- def create_video_from_images_and_audio(images_dir, speeches_dir, zoom_factor=1.2):
    """Creates video using images and audios.
    Args:
-         images_dir: path to images folder, example 'outputs/images'
-         speeches_dir: path to speeches folder, example 'outputs/speeches'"""
-     client = Groq()
-     images_paths = sorted(os.listdir(os.path.join(os.path.dirname(os.path.abspath(__file__)),images_dir)))
-     audio_paths = sorted(os.listdir(os.path.join(os.path.dirname(os.path.abspath(__file__)),speeches_dir)))
    clips = []
    temp_files = []

-     for i in range(min(len(images_paths), len(audio_paths))):
-         img_clip = ImageClip(os.path.join(os.path.dirname(os.path.abspath(__file__)),images_dir, images_paths[i]))
-         audioclip = AudioFileClip(os.path.join(os.path.dirname(os.path.abspath(__file__)),speeches_dir, audio_paths[i]))
        videoclip = img_clip.set_duration(audioclip.duration)
        zoomed_clip = apply_zoom_in_effect(videoclip, zoom_factor)

-         with open(os.path.join(os.path.dirname(os.path.abspath(__file__)),speeches_dir, audio_paths[i]), "rb") as file:
-             transcription = client.audio.transcriptions.create(
-                 file=(audio_paths[i], file.read()),
-                 model="whisper-large-v3",
-                 response_format="verbose_json",
-             )
-         caption = transcription.text

-         temp_video_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"outputs/final_video/temp_zoomed_{i}.mp4")
        zoomed_clip.write_videofile(temp_video_path, codec='libx264', fps=24)
        temp_files.append(temp_video_path)

-         final_video_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"outputs/final_video/temp_captioned_{i}.mp4")
        add_text_to_video(temp_video_path, final_video_path, caption, duration=1, fontsize=60)
        temp_files.append(final_video_path)
@@ -350,116 +256,17 @@ def create_video_from_images_and_audio(images_dir, speeches_dir, zoom_factor=1.2
        clips.append(final_clip)

    final_clip = concatenate_videoclips(clips)
-     final_clip.write_videofile(os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs/final_video/final_video.mp4"), codec='libx264', fps=24)

    # Close all video files properly
    for clip in clips:
        clip.close()

    # Remove all temporary files
-     for temp_file in temp_files:
-         try:
-             os.remove(temp_file)
-         except Exception as e:
-             print(f"Error removing file {temp_file}: {e}")

-     return os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs/final_video/final_video.mp4")
-
- # Example usage
- # image_paths = "outputs/images"
- # audio_paths = "outputs/speeches"
-
- # video_path = create_video_from_images_and_audio(image_paths, audio_paths)
- # print(f"Video created at: {video_path}")
-
- class WikiInputs(BaseModel):
-     """Inputs to the wikipedia tool."""
-     query: str = Field(description="query to look up in Wikipedia, should be 3 or less words")
-
- api_wrapper = WikipediaAPIWrapper(top_k_results=3)#, doc_content_chars_max=100)
-
- wiki_tool = WikipediaQueryRun(
-     name="wiki-tool",
-     description="{query:'input here'}",
-     args_schema=WikiInputs,
-     api_wrapper=api_wrapper,
-     return_direct=True,
- )
-
- wiki = Tool(
-     name = 'wikipedia',
-     func = wiki_tool.run,
-     description= "{query:'input here'}"
- )
-
- # wiki_tool.run("latest news in India")
-
- # @tool
- def process_script(script):
-     """Used to process the script into dictionary format"""
-     dict = {}
-     dict['text_for_image_generation'] = re.findall(r'<image>(.*?)</?image>', script)
-     dict['text_for_speech_generation'] = re.findall(r'<narration>.*?</?narration>', script)
-     return dict
-
- @tool#(args_schema=ImageGeneration)
- def image_generator(script):
-     """Generates images for the given script.
-     Saves it to images_dir and return path
-     Args:
-     script: a complete script containing narrations and image descriptions"""
-     # images_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), './outputs/images')
-     images_dir = os.path.join('./outputs/images')
-
-     os.makedirs(images_dir, exist_ok=True)
-     # if num==1:
-     for filename in os.listdir(images_dir):
-         file_path = os.path.join(images_dir, filename)
-         if os.path.isfile(file_path):
-             os.remove(file_path)
-
-     dict = process_script(script)
-     for i, text in enumerate(dict['text_for_image_generation']):
-         # image = pipe(text, num_inference_steps=12, guidance_scale=2, width=720, height=1280, verbose=0).images[0]
-         # image.save(os.path.join(images_dir, f'image{i}.jpg'))
-         response = requests.post(
-             f"https://api.stability.ai/v2beta/stable-image/generate/core",
-             headers={
-                 "authorization": os.environ.get('STABILITY_AI_API_KEY'),
-                 "accept": "image/*"
-             },
-             files={"none": ''},
-             data={
-                 "prompt": text,
-                 "output_format": "png",
-                 'aspect_ratio': "9:16",
-             },
-         )
-
-         if response.status_code == 200:
-             with open(os.path.join(images_dir, f'image_{i}.png'), 'wb') as file:
-                 file.write(response.content)
-         else:
-             raise Exception(str(response.json()))
-     return f'images generated.'#f'image generated for "{text}" and saved to directory {images_dir} as image{num}.jpg'
-
- @tool
- def speech_generator(script):
-     """Generates speech for given text
-     Saves it to speech_dir and return path
-     Args:
-     script: a complete script containing narrations and image descriptions"""
-     speech_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), './outputs/speeches')
-     os.makedirs(speech_dir, exist_ok=True)
-
-     # if num==1:
-     for filename in os.listdir(speech_dir):
-         file_path = os.path.join(speech_dir, filename)
-         if os.path.isfile(file_path):
-             os.remove(file_path)
-
-     dict = process_script(script)
-     print(dict)
-     for i, text in enumerate(dict['text_for_speech_generation']):
-         generate_speech(text, speech_dir, num=i)
-     return f'speechs generated.'#f'speech generated for "{text}" and saved to directory {speech_dir} as speech{num}.mp3'
+ from crewai import Task, Agent, Crew, Process
from langchain.tools import tool, Tool
import re
import os
from langchain_groq import ChatGroq
+ # llm = ChatGroq(model='mixtral-8x7b-32768', temperature=0.6, max_tokens=2048)
+ llm = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=1024, api_key='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')
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
+ from langchain_core.pydantic_v1 import BaseModel, Field
+ import requests
+ # import pyttsx3
+ import io
+ import tempfile
from gtts import gTTS
from pydub import AudioSegment
+ from groq import Groq
+ import cv2
+ import numpy as np
+ from PIL import Image, ImageDraw, ImageFont
+ from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips, ImageClip

+ def process_script(script):
+     """Used to process the script into dictionary format"""
+     dict = {}
+     text_for_image_generation = re.findall(r'<image>(.*?)</?image>', script, re.DOTALL)
+     text_for_speech_generation = re.findall(r'<narration>(.*?)</?narration>', script, re.DOTALL)
+     dict['text_for_image_generation'] = text_for_image_generation
+     dict['text_for_speech_generation'] = text_for_speech_generation
+     return dict
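For a concrete picture of what process_script now returns, here is a hedged sketch run against a made-up two-segment script; the tag names and dictionary keys come from the code above, while the sample text itself is invented:

```python
sample_script = """
<image>A sunrise over a misty mountain lake</image>
<narration>Dawn breaks over the still water.</narration>
<image>A close-up of dew on pine needles</image>
<narration>Every needle carries a drop of light.</narration>
"""

parts = process_script(sample_script)
print(parts['text_for_image_generation'])
# ['A sunrise over a misty mountain lake', 'A close-up of dew on pine needles']
print(parts['text_for_speech_generation'])
# ['Dawn breaks over the still water.', 'Every needle carries a drop of light.']
```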

+ @tool
+ def image_generator(script):
+     """Generates images for the given script.
+     Saves it to images_dir and return path
+     Args:
+     script: a complete script containing narrations and image descriptions
+     Returns:
+     A list of images in bytes format.
+     """
+     # images_dir = './outputs/images'
+     # for filename in os.listdir(images_dir):
+     #     file_path = os.path.join(images_dir, filename)
+     #     if os.path.isfile(file_path):
+     #         os.remove(file_path)

+     dict = process_script(script)
+     images_list = []
+     for i, text in enumerate(dict['text_for_image_generation']):
+         response = requests.post(
+             f"https://api.stability.ai/v2beta/stable-image/generate/core",
+             headers={
+                 "authorization": f'sk-2h9CmjC33uxc9W8fmx23oIicgqHk2jVtBF9KoEfdyTUIfODt',
+                 "accept": "image/*"
+             },
+             files={"none": ''},
+             data={
+                 "prompt": text,
+                 "output_format": "png",
+                 'aspect_ratio': "9:16",
+             },
+         )
+         print('image generated')

+         if response.status_code == 200:
+             images_list.append(response.content)
+         else:
+             raise Exception(str(response.json()))

+     return images_list
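The previous revision of this tool read the Stability AI key from the STABILITY_AI_API_KEY environment variable rather than embedding it in the source. A hedged sketch of the same request with the key pulled from the environment (endpoint, headers, and form fields exactly as in the code above):

```python
import os
import requests

def generate_image(prompt: str) -> bytes:
    # Read the key from the environment instead of hard-coding it in the repo.
    api_key = os.environ["STABILITY_AI_API_KEY"]
    response = requests.post(
        "https://api.stability.ai/v2beta/stable-image/generate/core",
        headers={"authorization": api_key, "accept": "image/*"},
        files={"none": ""},
        data={"prompt": prompt, "output_format": "png", "aspect_ratio": "9:16"},
    )
    if response.status_code != 200:
        raise Exception(str(response.json()))
    return response.content
```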

+ @tool
+ def generate_speech(script, lang='en', speed=1.2, max_segments=2):
    """
    Generates speech for the given script using gTTS and adjusts the speed.

+     Args:
+         script (str): The script containing narration segments.
+         lang (str, optional): The language code (default is 'en' for English).
+         speed (float, optional): The speed factor of speech generation (default is 1.0).
+         max_segments (int, optional): Maximum number of speech segments to generate (default is 2).

+     Returns:
+         list: List of generated speech segments as bytes.
+     """
+     dict = process_script(script)
+     speeches_list = []

+     # Ensure we limit the number of segments processed
+     segments_to_process = min(max_segments, len(dict['text_for_speech_generation']))

+     for text in dict['text_for_speech_generation'][:segments_to_process]:
+         # Generate speech
+         tts = gTTS(text=text, lang=lang)

+         # Save speech to BytesIO
+         speech_data = io.BytesIO()
+         tts.write_to_fp(speech_data)
+         speech_data.seek(0)
+
+         # Adjust speed if necessary
+         if speed != 1.0:
+             audio_segment = AudioSegment.from_file(speech_data, format="mp3")
+             audio_segment = audio_segment.speedup(playback_speed=speed)
+             speech_data = io.BytesIO()
+             audio_segment.export(speech_data, format="mp3")
+             speech_data.seek(0)

+         speeches_list.append(speech_data.read())
+
+     return speeches_list
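Unlike the removed file-based helper, the tool above keeps everything in memory: gTTS writes MP3 bytes into a BytesIO buffer, pydub re-reads them, time-stretches, and exports into a fresh buffer. A trimmed, hedged sketch of that round trip as a standalone helper, with a made-up sentence as input:

```python
import io
from gtts import gTTS
from pydub import AudioSegment

def narrate(text: str, speed: float = 1.2) -> bytes:
    buffer = io.BytesIO()
    gTTS(text=text, lang="en").write_to_fp(buffer)  # MP3 bytes, no temp file on disk
    buffer.seek(0)
    if speed != 1.0:
        segment = AudioSegment.from_file(buffer, format="mp3")
        segment = segment.speedup(playback_speed=speed)  # pitch is roughly preserved
        buffer = io.BytesIO()
        segment.export(buffer, format="mp3")
        buffer.seek(0)
    return buffer.read()

# audio_bytes = narrate("Dawn breaks over the still water.")  # hypothetical usage
```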

def split_text_into_chunks(text, chunk_size):
    words = text.split()

def add_text_to_video(input_video, output_video, text, duration=1, fontsize=40, fontcolor=(255, 255, 255),
                      outline_thickness=2, outline_color=(0, 0, 0), delay_between_chunks=0.1,
+                       font_path=os.path.join(os.path.dirname(os.path.abspath(__name__)),'Montserrat-Bold.ttf')):

    chunks = split_text_into_chunks(text, 3) # Adjust chunk size as needed


        if current_frame % (chunk_duration_frames + delay_frames) < chunk_duration_frames and chunk_index < len(chunks):
            chunk = chunks[chunk_index]
+             text_bbox = draw.textbbox((0, 0), chunk, font=font)
+             text_width, text_height = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
            text_x = (width - text_width) // 2
            text_y = height - 400 # Position text at the bottom
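The switch from draw.textsize to draw.textbbox matters because Pillow 10 removed ImageDraw.textsize; the bounding-box call is the supported way to measure a string. A small hedged sketch of the measurement on its own (the canvas size is arbitrary and the font path assumes the Montserrat-Bold.ttf file referenced in this diff is available):

```python
from PIL import Image, ImageDraw, ImageFont

img = Image.new("RGB", (1080, 1920))
draw = ImageDraw.Draw(img)
font = ImageFont.truetype("Montserrat-Bold.ttf", 60)  # assumes the font file ships with the repo

left, top, right, bottom = draw.textbbox((0, 0), "HELLO WORLD", font=font)
text_width, text_height = right - left, bottom - top  # same arithmetic as in the diff
```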


    return clip.fl(zoom_in_effect, apply_to=['mask'])

+ @tool
+ def create_video_from_images_and_audio(images, speeches, zoom_factor=1.2):
    """Creates video using images and audios.
    Args:
+         images: list of images in bytes format
+         speeches: list of speeches in bytes format"""
+
    clips = []
    temp_files = []

+     for i in range(min(len(images), len(speeches))):
+         # Save image to a temporary file
+         img_path = f"./temp_image_{i}.png"
+         with open(img_path, 'wb') as img_file:
+             img_file.write(images[i])
+
+         # Create an ImageClip
+         img_clip = ImageClip(img_path)
+
+         # Save audio to a temporary file
+         audio_path = f"./temp_audio_{i}.mp3"
+         with open(audio_path, 'wb') as audio_file:
+             audio_file.write(speeches[i])
+
+         # Create an AudioClip
+         audioclip = AudioFileClip(audio_path)
+
+         # Set the duration of the video clip to match the audio duration
        videoclip = img_clip.set_duration(audioclip.duration)
        zoomed_clip = apply_zoom_in_effect(videoclip, zoom_factor)

+         # Generate captions using the text for speech generation
+         caption = process_script(script)['text_for_speech_generation'][i]

+         temp_video_path = f"./temp_zoomed_{i}.mp4"
        zoomed_clip.write_videofile(temp_video_path, codec='libx264', fps=24)
        temp_files.append(temp_video_path)

+         final_video_path = f"./temp_captioned_{i}.mp4"
        add_text_to_video(temp_video_path, final_video_path, caption, duration=1, fontsize=60)
        temp_files.append(final_video_path)

        clips.append(final_clip)

    final_clip = concatenate_videoclips(clips)
+     final_clip.write_videofile("./final_video.mp4", codec='libx264', fps=24)

    # Close all video files properly
    for clip in clips:
        clip.close()

    # Remove all temporary files
+     # for temp_file in temp_files:
+     #     try:
+     #         os.remove(temp_file)
+     #     except Exception as e:
+     #         print(f"Error removing file {temp_file}: {e}")

+     return "./final_video.mp4"
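Worth flagging in the rewritten tool: caption = process_script(script)[...] references a script name that is not among the function's parameters (images, speeches, zoom_factor), so as committed the caption lookup raises NameError unless a module-level script happens to exist. A hedged sketch of one way to close that gap by passing the narration text in explicitly; the captions parameter and the audio re-attachment are assumptions for illustration, not part of this commit:

```python
from moviepy.editor import ImageClip, AudioFileClip, VideoFileClip, concatenate_videoclips

def create_video_from_images_and_audio(images, speeches, captions, zoom_factor=1.2):
    """Hedged rework: captions arrive as a list of strings instead of relying on a
    module-level `script`, so process_script() is not re-run inside the loop."""
    clips, temp_files = [], []
    for i in range(min(len(images), len(speeches), len(captions))):
        img_path, audio_path = f"./temp_image_{i}.png", f"./temp_audio_{i}.mp3"
        with open(img_path, "wb") as f:
            f.write(images[i])
        with open(audio_path, "wb") as f:
            f.write(speeches[i])

        audioclip = AudioFileClip(audio_path)
        videoclip = ImageClip(img_path).set_duration(audioclip.duration)
        zoomed_clip = apply_zoom_in_effect(videoclip, zoom_factor)

        temp_video_path = f"./temp_zoomed_{i}.mp4"
        zoomed_clip.write_videofile(temp_video_path, codec='libx264', fps=24)

        final_video_path = f"./temp_captioned_{i}.mp4"
        add_text_to_video(temp_video_path, final_video_path, captions[i], duration=1, fontsize=60)
        temp_files += [img_path, audio_path, temp_video_path, final_video_path]

        # Assumption: the context lines elided from the hunk re-open the captioned clip
        # and re-attach the narration audio before appending it.
        clips.append(VideoFileClip(final_video_path).set_audio(audioclip))

    final_clip = concatenate_videoclips(clips)
    final_clip.write_videofile("./final_video.mp4", codec='libx264', fps=24)
    for clip in clips:
        clip.close()
    return "./final_video.mp4"
```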