Spaces: Running
Prudvireddy committed · Commit 063ae63
1 Parent(s): 8e4df09
Update tools.py

tools.py CHANGED

@@ -20,104 +20,16 @@ import numpy as np
 from PIL import Image, ImageDraw, ImageFont
 from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips, ImageClip
 
-def process_script(script):
-    """Used to process the script into dictionary format"""
-    dict = {}
-    text_for_image_generation = re.findall(r'<image>(.*?)</?image>', script, re.DOTALL)
-    text_for_speech_generation = re.findall(r'<narration>(.*?)</?narration>', script, re.DOTALL)
-    dict['text_for_image_generation'] = text_for_image_generation
-    dict['text_for_speech_generation'] = text_for_speech_generation
-    return dict
-
-@tool
-def image_generator(script):
-    """Generates images for the given script.
-    Saves them to images_dir and returns the path.
-    Args:
-        script: a complete script containing narrations and image descriptions
-    Returns:
-        A list of images in bytes format.
-    """
-    # images_dir = './outputs/images'
-    # for filename in os.listdir(images_dir):
-    #     file_path = os.path.join(images_dir, filename)
-    #     if os.path.isfile(file_path):
-    #         os.remove(file_path)
-
-    dict = process_script(script)
-    images_list = []
-    for i, text in enumerate(dict['text_for_image_generation']):
-        response = requests.post(
-            "https://api.stability.ai/v2beta/stable-image/generate/core",
-            headers={
-                "authorization": 'sk-2h9CmjC33uxc9W8fmx23oIicgqHk2jVtBF9KoEfdyTUIfODt',
-                "accept": "image/*"
-            },
-            files={"none": ''},
-            data={
-                "prompt": text,
-                "output_format": "png",
-                'aspect_ratio': "9:16",
-            },
-        )
-        print('image generated')
-
-        if response.status_code == 200:
-            images_list.append(response.content)
-        else:
-            raise Exception(str(response.json()))
-
-    return images_list
-
-@tool
-def generate_speech(script, lang='en', speed=1.2, max_segments=2):
-    """
-    Generates speech for the given script using gTTS and adjusts the speed.
-
-    Args:
-        script (str): The script containing narration segments.
-        lang (str, optional): The language code (default is 'en' for English).
-        speed (float, optional): The speed factor of speech generation (default is 1.2).
-        max_segments (int, optional): Maximum number of speech segments to generate (default is 2).
-
-    Returns:
-        list: List of generated speech segments as bytes.
-    """
-    dict = process_script(script)
-    speeches_list = []
-
-    # Ensure we limit the number of segments processed
-    segments_to_process = min(max_segments, len(dict['text_for_speech_generation']))
-
-    for text in dict['text_for_speech_generation'][:segments_to_process]:
-        # Generate speech
-        tts = gTTS(text=text, lang=lang)
-
-        # Save speech to BytesIO
-        speech_data = io.BytesIO()
-        tts.write_to_fp(speech_data)
-        speech_data.seek(0)
-
-        # Adjust speed if necessary
-        if speed != 1.0:
-            audio_segment = AudioSegment.from_file(speech_data, format="mp3")
-            audio_segment = audio_segment.speedup(playback_speed=speed)
-            speech_data = io.BytesIO()
-            audio_segment.export(speech_data, format="mp3")
-            speech_data.seek(0)
-
-        speeches_list.append(speech_data.read())
-
-    return speeches_list
-
 def split_text_into_chunks(text, chunk_size):
     words = text.split()
     return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
 
-def add_text_to_video(input_video, output_video, text, duration=1, fontsize=40, fontcolor=(255, 255, 255),
-                      outline_thickness=2, outline_color=(0, 0, 0), delay_between_chunks=0.3,
-                      font_path='Montserrat-Bold.ttf'):
-
+def add_text_to_video(input_video, text, duration=1, fontsize=40, fontcolor=(255, 255, 255),
+                      outline_thickness=2, outline_color=(0, 0, 0), delay_between_chunks=0.3,
+                      font_path='Montserrat-Bold.ttf'):
+    temp_output_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
+    output_video = temp_output_file.name
+
     chunks = split_text_into_chunks(text, 3)  # Adjust chunk size as needed
 
     cap = cv2.VideoCapture(input_video)
@@ -189,6 +101,8 @@ def add_text_to_video(input_video, output_video, text, duration=1, fontsize=40,
     cap.release()
     out.release()
     cv2.destroyAllWindows()
+
+    return output_video
 
 def apply_zoom_in_effect(clip, zoom_factor=1.2):
     width, height = clip.size
@@ -208,46 +122,32 @@ def apply_zoom_in_effect(clip, zoom_factor=1.2):
 
     return clip.fl(zoom_in_effect, apply_to=['mask'])
 
-
-
-def create_video_from_images_and_audio(images, speeches, zoom_factor=1.2):
-    """Creates a video from images and speeches.
-        images: list of images in bytes format
-        speeches: list of speeches in bytes format"""
-
+def create_video_from_images_and_audio(images_dir, speeches_dir, zoom_factor=1.2):
+    client = Groq(api_key='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')
+    images_paths = sorted(os.listdir(images_dir))
+    audio_paths = sorted(os.listdir(speeches_dir))
     clips = []
     temp_files = []
 
-    for i in range(min(len(images), len(speeches))):
-
-
-        with open(img_path, 'wb') as img_file:
-            img_file.write(images[i])
-
-        # Create an ImageClip
-        img_clip = ImageClip(img_path)
-
-        # Save audio to a temporary file
-        audio_path = f"./temp_audio_{i}.mp3"
-        with open(audio_path, 'wb') as audio_file:
-            audio_file.write(speeches[i])
-
-        # Create an AudioClip
-        audioclip = AudioFileClip(audio_path)
-
-        # Set the duration of the video clip to match the audio duration
+    for i in range(min(len(images_paths), len(audio_paths))):
+        img_clip = ImageClip(os.path.join(images_dir, images_paths[i]))
+        audioclip = AudioFileClip(os.path.join(speeches_dir, audio_paths[i]))
         videoclip = img_clip.set_duration(audioclip.duration)
         zoomed_clip = apply_zoom_in_effect(videoclip, zoom_factor)
 
-
-
+        with open(os.path.join(speeches_dir, audio_paths[i]), "rb") as file:
+            transcription = client.audio.transcriptions.create(
+                file=(audio_paths[i], file.read()),
+                model="whisper-large-v3",
+                response_format="verbose_json",
+            )
+        caption = transcription.text
 
-        temp_video_path =
+        temp_video_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
         zoomed_clip.write_videofile(temp_video_path, codec='libx264', fps=24)
         temp_files.append(temp_video_path)
 
-        final_video_path =
-        add_text_to_video(temp_video_path, final_video_path, caption, duration=1, fontsize=60)
+        final_video_path = add_text_to_video(temp_video_path, caption, duration=1, fontsize=60)
        temp_files.append(final_video_path)
 
         final_clip = VideoFileClip(final_video_path)
@@ -256,20 +156,25 @@ def create_video_from_images_and_audio(images, speeches, zoom_factor=1.2):
         clips.append(final_clip)
 
     final_clip = concatenate_videoclips(clips)
-
+    temp_final_video = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
+    final_clip.write_videofile(temp_final_video, codec='libx264', fps=24)
 
     # Close all video files properly
     for clip in clips:
         clip.close()
 
     # Remove all temporary files
-
-
-
-
-
+    for temp_file in temp_files:
+        try:
+            os.remove(temp_file)
+        except Exception as e:
+            print(f"Error removing file {temp_file}: {e}")
 
-    return
+    return temp_final_video
+
+from langchain.pydantic_v1 import BaseModel, Field
+from langchain_community.tools import WikipediaQueryRun
+from langchain_community.utilities import WikipediaAPIWrapper
 
 class WikiInputs(BaseModel):
     """Inputs to the wikipedia tool."""
@@ -290,3 +195,81 @@ wiki = Tool(
     func = wiki_tool.run,
     description= "{query:'input here'}"
 )
+
+def process_script(script):
+    """Used to process the script into dictionary format"""
+    dict = {}
+    text_for_image_generation = re.findall(r'<image>(.*?)</?image>', script, re.DOTALL)
+    text_for_speech_generation = re.findall(r'<narration>(.*?)</?narration>', script, re.DOTALL)
+    dict['text_for_image_generation'] = text_for_image_generation
+    dict['text_for_speech_generation'] = text_for_speech_generation
+    return dict
+
+def generate_speech(text, lang='en', speed=1.15, num=0):
+    """
+    Generates speech for the given text using gTTS and adjusts the speed.
+    """
+    temp_speech_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
+    temp_speech_path = temp_speech_file.name
+
+    tts = gTTS(text=text, lang=lang)
+    tts.save(temp_speech_path)
+
+    sound = AudioSegment.from_file(temp_speech_path)
+    if speed != 1.0:
+        sound_with_altered_speed = sound._spawn(sound.raw_data, overrides={
+            "frame_rate": int(sound.frame_rate * speed)
+        }).set_frame_rate(sound.frame_rate)
+        sound_with_altered_speed.export(temp_speech_path, format="mp3")
+    else:
+        sound.export(temp_speech_path, format="mp3")
+
+    temp_speech_file.close()
+    return temp_speech_path
+
+def image_generator(script):
+    """Generates images for the given script.
+    Saves them to a temporary directory and returns the path.
+    Args:
+        script: a complete script containing narrations and image descriptions."""
+    images_dir = tempfile.mkdtemp()
+
+    dict = process_script(script)
+    for i, text in enumerate(dict['text_for_image_generation']):
+        response = requests.post(
+            "https://api.stability.ai/v2beta/stable-image/generate/core",
+            headers={
+                "authorization": 'sk-amF2RAcBrDHNwuFUivtDsZFGJ6hzISz53NhtjdY9bs0SsrLc',
+                "accept": "image/*"
+            },
+            files={"none": ''},
+            data={
+                "prompt": text,
+                "output_format": "png",
+                'aspect_ratio': "9:16",
+            },
+        )
+
+        if response.status_code == 200:
+            with open(os.path.join(images_dir, f'image_{i}.png'), 'wb') as file:
+                file.write(response.content)
+        else:
+            raise Exception(f"Image generation failed with status code {response.status_code} and message: {response.text}")
+
+    return images_dir
+
+
+def speech_generator(script):
+    """
+    Generates speech files for the given script using gTTS.
+    Saves them to a temporary directory and returns the path.
+    """
+    speeches_dir = tempfile.mkdtemp()
+
+    dict = process_script(script)
+    for i, text in enumerate(dict['text_for_speech_generation']):
+        speech_path = generate_speech(text, num=i)
+        os.rename(speech_path, os.path.join(speeches_dir, f'speech_{i}.mp3'))
+
+    return speeches_dir
+
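
For reference, both the old and new versions share the same script contract: process_script pulls <image> and <narration> spans out of a single combined script with re.findall. A minimal sketch of the expected input and output (the script text here is illustrative, not from the commit):

script = (
    "<image>A sunrise over a mountain lake</image>"
    "<narration>Every morning, the lake wakes before the town does.</narration>"
    "<image>A small town street at dawn</image>"
    "<narration>Down in the valley, the first lights flicker on.</narration>"
)

parts = process_script(script)
# parts['text_for_image_generation']  -> ['A sunrise over a mountain lake', 'A small town street at dawn']
# parts['text_for_speech_generation'] -> the two <narration> segments, in order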
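
Taken together, the updated tool chain is script in, video path out: image_generator writes PNGs to a temp directory, speech_generator writes matching MP3s, and create_video_from_images_and_audio pairs them, burns in Whisper-generated captions, and returns the path of a temporary MP4. A minimal usage sketch, assuming this file is importable as tools, the hard-coded Stability and Groq keys are valid, and Montserrat-Bold.ttf is available to add_text_to_video (the script value is illustrative):

from tools import image_generator, speech_generator, create_video_from_images_and_audio

script = (
    "<image>A rocket on a launch pad at night</image>"
    "<narration>Ten seconds to liftoff.</narration>"
)

images_dir = image_generator(script)     # temp dir containing image_0.png, ...
speeches_dir = speech_generator(script)  # temp dir containing speech_0.mp3, ...

# Pairs image_i with speech_i, captions each clip, concatenates, returns a temp .mp4 path.
video_path = create_video_from_images_and_audio(images_dir, speeches_dir)
print(video_path)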