Spaces: Running

Prudvireddy committed
Commit e881d3d · Parent(s): 995fb34

Update tools.py

tools.py CHANGED
@@ -1,217 +1,114 @@
  from langchain.tools import tool, Tool
  import re
  import os
  from langchain_groq import ChatGroq
-
-
- from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips
- from langchain.pydantic_v1 import BaseModel, Field
  from langchain_community.tools import WikipediaQueryRun
  from langchain_community.utilities import WikipediaAPIWrapper
  from gtts import gTTS
  from pydub import AudioSegment

-
-
-
-
-
-
-
-
- # from langchain.chat_models import ChatOpenAI
- # # llm2 = ChatOpenAI(model='gpt-3.5-turbo')
- # # llm3 = ChatOpenAI(model='gpt-3.5-turbo')
- # llm1 = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=2048)
- # # llm2 = ChatGroq(model='mixtral-8x7b-32768', temperature=0.6, max_tokens=2048, api_key='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')
- # llm2 = ChatGoogleGenerativeAI(model='gemini-pro', temperature=0.0)
- # # llm2 = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=2048, api_key='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')
- # # llm3 = ChatGoogleGenerativeAI(model='gemini-pro')
- # llm4 = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=2048, api_key='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')
- # groq_api_key=os.environ.get('GROQ_API_KEY')
- # llm = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=1024, api_key=groq_api_key)
-
- # pipe = StableDiffusionXLPipeline.from_pretrained("sd-community/sdxl-flash", torch_dtype=torch.float16).to('cuda')
- # pipe.scheduler = DPMSolverSinglestepScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
-
- # def quantize_model_to_4bit(model):
- #     replacements = []
-
- #     # Collect layers to be replaced
- #     for name, module in model.named_modules():
- #         if isinstance(module, nn.Linear):
- #             replacements.append((name, module))
-
- #     # Replace layers
- #     for name, module in replacements:
- #         # Split the name to navigate to the parent module
- #         *path, last = name.split('.')
- #         parent = model
- #         for part in path:
- #             parent = getattr(parent, part)
-
- #         # Create and assign the quantized layer
- #         quantized_layer = bnb.nn.Linear4bit(module.in_features, module.out_features, bias=module.bias is not None)
- #         quantized_layer.weight.data = module.weight.data
- #         if module.bias is not None:
- #             quantized_layer.bias.data = module.bias.data
- #         setattr(parent, last, quantized_layer)
-
- #     return model

-
-


-
-
-
-
- # engine = pyttsx3.init()

-
- #     voices = engine.getProperty('voices')
- #     if voice == 'default':
- #         voice_id = voices[1].id
- #     else:
- #         # Try to find the voice with the given name
- #         voice_id = None
- #         for v in voices:
- #             if voice in v.name:
- #                 voice_id = v.id
- #                 break
- #         if not voice_id:
- #             raise ValueError(f"Voice '{voice}' not found.")
-
- #     engine.setProperty('voice', voice_id)
- #     engine.setProperty('rate', speed)
- #     # os.remove(os.path.join(os.path.dirname(os.path.abspath(__file__)), speech_dir, f'speech_{num}.mp3')) if os.path.exists(os.path.join(speech_dir, f'speech_{num}.mp3')) else None
- #     engine.save_to_file(text, os.path.join(os.path.dirname(os.path.abspath(__file__)), speech_dir, f'speech_{num}.mp3'))
- #     engine.runAndWait()

-
      """
      Generates speech for the given script using gTTS and adjusts the speed.
-     """
-     # Ensure the speech directory exists
-     if not os.path.exists(speech_dir):
-         os.makedirs(speech_dir)
-
-     # Generate speech
-     tts = gTTS(text=text, lang=lang)
-
-     # Save the speech to an MP3 file
-     speech_path = os.path.join(speech_dir, f'speech_{num}.mp3')
-     temp_path = os.path.join(speech_dir, f'temp_speech_{num}.mp3')
-     if os.path.exists(speech_path):
-         os.remove(speech_path)  # Remove existing file if it exists
-
-     tts.save(temp_path)
-
-     # Adjust the speed of the speech
-     sound = AudioSegment.from_file(temp_path)
-     if speed != 1.0:
-         sound_with_altered_speed = sound._spawn(sound.raw_data, overrides={
-             "frame_rate": int(sound.frame_rate * speed)
-         }).set_frame_rate(sound.frame_rate)
-         sound_with_altered_speed.export(speech_path, format="mp3")
-     else:
-         sound.export(speech_path, format="mp3")
-
-     os.remove(temp_path)  # Remove the temporary file
-     # print(f"Speech saved to {speech_path}")
-
-     # Example usage
-     # generate_speech("Hello, this is a test speech.", speed=1.2, num=1)

-
-
-
-
-
- # def create_video_from_images_and_audio(images_dir, speeches_dir, zoom_factor=1.2):
- #     """Creates video using images and audios with zoom-in effect"""
- #     images_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), images_dir)
- #     speeches_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), speeches_dir)

-
-
-
-
-
- #     for i in range(min(len(images_paths), len(audio_paths))):
- #         # Load the image
- #         img_clip = ImageClip(os.path.join(images_dir, images_paths[i]))
-
- #         # Load the audio file
- #         audioclip = AudioFileClip(os.path.join(speeches_dir, audio_paths[i]))
-
- #         # Set the duration of the video clip to the duration of the audio file
- #         videoclip = img_clip.set_duration(audioclip.duration)
-
- #         # Apply zoom-in effect to the video clip
- #         zoomed_clip = apply_zoom_in_effect(videoclip, zoom_factor)
-
- #         # Add audio to the zoomed video clip
- #         zoomed_clip = zoomed_clip.set_audio(audioclip)
-
- #         clips.append(zoomed_clip)
-
- #     # Concatenate all video clips
- #     final_clip = concatenate_videoclips(clips)

- #
-

-
-
-
- #     width, height = clip.size
- #     duration = clip.duration
-
- #     def zoom_in_effect(get_frame, t):
- #         frame = get_frame(t)
- #         zoom = 1 + (zoom_factor - 1) * (t / duration)
- #         new_width, new_height = int(width * zoom), int(height * zoom)
- #         resized_frame = cv2.resize(frame, (new_width, new_height))

- #
-
-
-

-
-
-
-
- # Example usage
- # image_paths = "outputs/images"
- # audio_paths = "outputs/audio"
-
- # video_path = create_video_from_images_and_audio(image_paths, audio_paths)
- # print(f"Video created at: {video_path}")
-
-
- # class ImageGeneration(BaseModel):
- #     text : str = Field(description='description of sentence used for image generation')
- #     num : int = Field(description='sequence of description passed this tool. Used in image saving path. Example 1,2,3,4,5 and so on')
-
- # class SpeechGeneration(BaseModel):
- #     text : str = Field(description='description of sentence used for image generation')
- #     num : int = Field(description='sequence of description passed this tool. Used in image saving path. Example 1,2,3,4,5 and so on')
-
- import os
- import cv2
- from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips, VideoFileClip
- from PIL import Image, ImageDraw, ImageFont
- import numpy as np
- from groq import Groq
-
-
-
- class VideoGeneration(BaseModel):
-     images_dir: str = Field(description='Path to images directory, such as "outputs/images"')
-     speeches_dir: str = Field(description='Path to speeches directory, such as "outputs/speeches"')

  def split_text_into_chunks(text, chunk_size):
      words = text.split()
@@ -219,7 +116,7 @@ def split_text_into_chunks(text, chunk_size):

  def add_text_to_video(input_video, output_video, text, duration=1, fontsize=40, fontcolor=(255, 255, 255),
                        outline_thickness=2, outline_color=(0, 0, 0), delay_between_chunks=0.1,
-                       font_path=os.path.join(os.path.dirname(os.path.abspath(

      chunks = split_text_into_chunks(text, 3)  # Adjust chunk size as needed

@@ -250,7 +147,8 @@ def add_text_to_video(input_video, output_video, text, duration=1, fontsize=40,

          if current_frame % (chunk_duration_frames + delay_frames) < chunk_duration_frames and chunk_index < len(chunks):
              chunk = chunks[chunk_index]
-
              text_x = (width - text_width) // 2
              text_y = height - 400  # Position text at the bottom

@@ -310,37 +208,45 @@ def apply_zoom_in_effect(clip, zoom_factor=1.2):

      return clip.fl(zoom_in_effect, apply_to=['mask'])

- @tool
- def create_video_from_images_and_audio(
      """Creates video using images and audios.
      Args:
-
-
-
-     images_paths = sorted(os.listdir(os.path.join(os.path.dirname(os.path.abspath(__file__)),images_dir)))
-     audio_paths = sorted(os.listdir(os.path.join(os.path.dirname(os.path.abspath(__file__)),speeches_dir)))
      clips = []
      temp_files = []

-     for i in range(min(len(
-
-
          videoclip = img_clip.set_duration(audioclip.duration)
          zoomed_clip = apply_zoom_in_effect(videoclip, zoom_factor)

-
-
-             file=(audio_paths[i], file.read()),
-             model="whisper-large-v3",
-             response_format="verbose_json",
-         )
-         caption = transcription.text

-         temp_video_path =
          zoomed_clip.write_videofile(temp_video_path, codec='libx264', fps=24)
          temp_files.append(temp_video_path)

-         final_video_path =
          add_text_to_video(temp_video_path, final_video_path, caption, duration=1, fontsize=60)
          temp_files.append(final_video_path)
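Most of the removed captioning call was lost in this view; the surviving fragments (file=, model="whisper-large-v3", response_format="verbose_json") show each clip's audio being transcribed with Groq's Whisper endpoint. A hedged reconstruction of that pattern, with the client setup and file path assumed rather than taken from the diff:

from groq import Groq

client = Groq()  # picks up GROQ_API_KEY from the environment
with open("outputs/speeches/speech_0.mp3", "rb") as file:
    transcription = client.audio.transcriptions.create(
        file=("speech_0.mp3", file.read()),
        model="whisper-large-v3",
        response_format="verbose_json",
    )
caption = transcription.text  # the old code used this transcription as the on-screen caption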
@@ -350,116 +256,17 @@ def create_video_from_images_and_audio(images_dir, speeches_dir, zoom_factor=1.2
      clips.append(final_clip)

      final_clip = concatenate_videoclips(clips)
-     final_clip.write_videofile(

      # Close all video files properly
      for clip in clips:
          clip.close()

      # Remove all temporary files
-     for temp_file in temp_files:
-
-
-
-

-     return
-
- # Example usage
- # image_paths = "outputs/images"
- # audio_paths = "outputs/speeches"
-
- # video_path = create_video_from_images_and_audio(image_paths, audio_paths)
- # print(f"Video created at: {video_path}")
-
- class WikiInputs(BaseModel):
-     """Inputs to the wikipedia tool."""
-     query: str = Field(description="query to look up in Wikipedia, should be 3 or less words")
-
- api_wrapper = WikipediaAPIWrapper(top_k_results=3)#, doc_content_chars_max=100)
-
- wiki_tool = WikipediaQueryRun(
-     name="wiki-tool",
-     description="{query:'input here'}",
-     args_schema=WikiInputs,
-     api_wrapper=api_wrapper,
-     return_direct=True,
- )
-
- wiki = Tool(
-     name = 'wikipedia',
-     func = wiki_tool.run,
-     description= "{query:'input here'}"
- )
-
- # wiki_tool.run("latest news in India")
-
- # @tool
- def process_script(script):
-     """Used to process the script into dictionary format"""
-     dict = {}
-     dict['text_for_image_generation'] = re.findall(r'<image>(.*?)</?image>', script)
-     dict['text_for_speech_generation'] = re.findall(r'<narration>.*?</?narration>', script)
-     return dict
-
- @tool#(args_schema=ImageGeneration)
- def image_generator(script):
-     """Generates images for the given script.
-     Saves it to images_dir and return path
-     Args:
-     script: a complete script containing narrations and image descriptions"""
-     # images_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), './outputs/images')
-     images_dir = os.path.join('./outputs/images')
-
-     os.makedirs(images_dir, exist_ok=True)
-     # if num==1:
-     for filename in os.listdir(images_dir):
-         file_path = os.path.join(images_dir, filename)
-         if os.path.isfile(file_path):
-             os.remove(file_path)
-
-     dict = process_script(script)
-     for i, text in enumerate(dict['text_for_image_generation']):
-         # image = pipe(text, num_inference_steps=12, guidance_scale=2, width=720, height=1280, verbose=0).images[0]
-         # image.save(os.path.join(images_dir, f'image{i}.jpg'))
-         response = requests.post(
-             f"https://api.stability.ai/v2beta/stable-image/generate/core",
-             headers={
-                 "authorization": os.environ.get('STABILITY_AI_API_KEY'),
-                 "accept": "image/*"
-             },
-             files={"none": ''},
-             data={
-                 "prompt": text,
-                 "output_format": "png",
-                 'aspect_ratio': "9:16",
-             },
-         )
-
-         if response.status_code == 200:
-             with open(os.path.join(images_dir, f'image_{i}.png'), 'wb') as file:
-                 file.write(response.content)
-         else:
-             raise Exception(str(response.json()))
-     return f'images generated.'#f'image generated for "{text}" and saved to directory {images_dir} as image{num}.jpg'
-
- @tool
- def speech_generator(script):
-     """Generates speech for given text
-     Saves it to speech_dir and return path
-     Args:
-     script: a complete script containing narrations and image descriptions"""
-     speech_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), './outputs/speeches')
-     os.makedirs(speech_dir, exist_ok=True)
-
-     # if num==1:
-     for filename in os.listdir(speech_dir):
-         file_path = os.path.join(speech_dir, filename)
-         if os.path.isfile(file_path):
-             os.remove(file_path)
-
-     dict = process_script(script)
-     print(dict)
-     for i, text in enumerate(dict['text_for_speech_generation']):
-         generate_speech(text, speech_dir, num=i)
-     return f'speechs generated.'#f'speech generated for "{text}" and saved to directory {speech_dir} as speech{num}.mp3'

@@ -1,217 +1,114 @@
+ from crewai import Task, Agent, Crew, Process
  from langchain.tools import tool, Tool
  import re
  import os
  from langchain_groq import ChatGroq
+ # llm = ChatGroq(model='mixtral-8x7b-32768', temperature=0.6, max_tokens=2048)
+ llm = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=1024, api_key='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')
  from langchain_community.tools import WikipediaQueryRun
  from langchain_community.utilities import WikipediaAPIWrapper
+ from langchain_core.pydantic_v1 import BaseModel, Field
+ import requests
+ # import pyttsx3
+ import io
+ import tempfile
  from gtts import gTTS
  from pydub import AudioSegment
+ from groq import Groq
+ import cv2
+ import numpy as np
+ from PIL import Image, ImageDraw, ImageFont
+ from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips, ImageClip

+ def process_script(script):
+     """Used to process the script into dictionary format"""
+     dict = {}
+     text_for_image_generation = re.findall(r'<image>(.*?)</?image>', script, re.DOTALL)
+     text_for_speech_generation = re.findall(r'<narration>(.*?)</?narration>', script, re.DOTALL)
+     dict['text_for_image_generation'] = text_for_image_generation
+     dict['text_for_speech_generation'] = text_for_speech_generation
+     return dict
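A quick check of the new tag parser (illustrative only: the sample script string is invented, and the import assumes this file is importable as tools):

from tools import process_script

sample = "<image>A sunrise over mountains</image><narration>Every day is a new beginning.</narration>"
parsed = process_script(sample)
print(parsed['text_for_image_generation'])   # ['A sunrise over mountains']
print(parsed['text_for_speech_generation'])  # ['Every day is a new beginning.']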

+ @tool
+ def image_generator(script):
+     """Generates images for the given script.
+     Saves it to images_dir and return path
+     Args:
+     script: a complete script containing narrations and image descriptions
+     Returns:
+     A list of images in bytes format.
+     """
+     # images_dir = './outputs/images'
+     # for filename in os.listdir(images_dir):
+     #     file_path = os.path.join(images_dir, filename)
+     #     if os.path.isfile(file_path):
+     #         os.remove(file_path)

+     dict = process_script(script)
+     images_list = []
+     for i, text in enumerate(dict['text_for_image_generation']):
+         response = requests.post(
+             f"https://api.stability.ai/v2beta/stable-image/generate/core",
+             headers={
+                 "authorization": f'sk-2h9CmjC33uxc9W8fmx23oIicgqHk2jVtBF9KoEfdyTUIfODt',
+                 "accept": "image/*"
+             },
+             files={"none": ''},
+             data={
+                 "prompt": text,
+                 "output_format": "png",
+                 'aspect_ratio': "9:16",
+             },
+         )
+         print('image generated')

+         if response.status_code == 200:
+             images_list.append(response.content)
+         else:
+             raise Exception(str(response.json()))

+     return images_list
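Note that the ChatGroq api_key near the top of the new file and the Stability AI key above are hardcoded in this revision, while the previous code read the Stability key from the STABILITY_AI_API_KEY environment variable. A minimal sketch of the same request with the key taken from the environment (the helper name is invented):

import os
import requests

def stability_core_image(prompt):
    # Same endpoint and parameters as image_generator above; only the key source differs.
    response = requests.post(
        "https://api.stability.ai/v2beta/stable-image/generate/core",
        headers={"authorization": os.environ["STABILITY_AI_API_KEY"], "accept": "image/*"},
        files={"none": ""},
        data={"prompt": prompt, "output_format": "png", "aspect_ratio": "9:16"},
    )
    if response.status_code != 200:
        raise Exception(str(response.json()))
    return response.content  # PNG bytes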

+ @tool
+ def generate_speech(script, lang='en', speed=1.2, max_segments=2):
      """
      Generates speech for the given script using gTTS and adjusts the speed.

+     Args:
+         script (str): The script containing narration segments.
+         lang (str, optional): The language code (default is 'en' for English).
+         speed (float, optional): The speed factor of speech generation (default is 1.0).
+         max_segments (int, optional): Maximum number of speech segments to generate (default is 2).

+     Returns:
+         list: List of generated speech segments as bytes.
+     """
+     dict = process_script(script)
+     speeches_list = []

+     # Ensure we limit the number of segments processed
+     segments_to_process = min(max_segments, len(dict['text_for_speech_generation']))

+     for text in dict['text_for_speech_generation'][:segments_to_process]:
+         # Generate speech
+         tts = gTTS(text=text, lang=lang)

+         # Save speech to BytesIO
+         speech_data = io.BytesIO()
+         tts.write_to_fp(speech_data)
+         speech_data.seek(0)
+
+         # Adjust speed if necessary
+         if speed != 1.0:
+             audio_segment = AudioSegment.from_file(speech_data, format="mp3")
+             audio_segment = audio_segment.speedup(playback_speed=speed)
+             speech_data = io.BytesIO()
+             audio_segment.export(speech_data, format="mp3")
+             speech_data.seek(0)

+         speeches_list.append(speech_data.read())
+
+     return speeches_list
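The speed handling also changed: the removed generate_speech resampled raw data via _spawn (which speeds playback but also shifts pitch up), while this version uses pydub's speedup(), which keeps pitch. A small self-contained comparison on a synthetic tone, so no gTTS network call is needed:

from pydub import AudioSegment
from pydub.generators import Sine

tone = Sine(440).to_audio_segment(duration=1000)  # 1 s test tone

# Old approach: frame-rate override -- shorter *and* higher-pitched.
resampled = tone._spawn(tone.raw_data, overrides={
    "frame_rate": int(tone.frame_rate * 1.2)
}).set_frame_rate(tone.frame_rate)

# New approach: speedup() shortens duration while keeping pitch.
sped_up = tone.speedup(playback_speed=1.2)

print(len(tone), len(resampled), len(sped_up))  # durations in milliseconds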

  def split_text_into_chunks(text, chunk_size):
      words = text.split()

@@ -219,7 +116,7 @@ def split_text_into_chunks(text, chunk_size):

  def add_text_to_video(input_video, output_video, text, duration=1, fontsize=40, fontcolor=(255, 255, 255),
                        outline_thickness=2, outline_color=(0, 0, 0), delay_between_chunks=0.1,
+                       font_path=os.path.join(os.path.dirname(os.path.abspath(__name__)),'Montserrat-Bold.ttf')):

      chunks = split_text_into_chunks(text, 3)  # Adjust chunk size as needed

@@ -250,7 +147,8 @@ def add_text_to_video(input_video, output_video, text, duration=1, fontsize=40,

          if current_frame % (chunk_duration_frames + delay_frames) < chunk_duration_frames and chunk_index < len(chunks):
              chunk = chunks[chunk_index]
+             text_bbox = draw.textbbox((0, 0), chunk, font=font)
+             text_width, text_height = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
              text_x = (width - text_width) // 2
              text_y = height - 400  # Position text at the bottom
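The caption is now measured with Pillow's ImageDraw.textbbox; the older textsize API was removed in Pillow 10, so the bounding-box call is the current way to size text. Standalone illustration (the printed values depend on the font):

from PIL import Image, ImageDraw, ImageFont

img = Image.new("RGB", (720, 1280))
draw = ImageDraw.Draw(img)
font = ImageFont.load_default()
bbox = draw.textbbox((0, 0), "Hello world", font=font)  # (left, top, right, bottom)
text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]
print(text_width, text_height)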

@@ -310,37 +208,45 @@ def apply_zoom_in_effect(clip, zoom_factor=1.2):

      return clip.fl(zoom_in_effect, apply_to=['mask'])

+ @tool
+ def create_video_from_images_and_audio(images, speeches, zoom_factor=1.2):
      """Creates video using images and audios.
      Args:
+     images: list of images in bytes format
+     speeches: list of speeches in bytes format"""
+
      clips = []
      temp_files = []

+     for i in range(min(len(images), len(speeches))):
+         # Save image to a temporary file
+         img_path = f"./temp_image_{i}.png"
+         with open(img_path, 'wb') as img_file:
+             img_file.write(images[i])
+
+         # Create an ImageClip
+         img_clip = ImageClip(img_path)
+
+         # Save audio to a temporary file
+         audio_path = f"./temp_audio_{i}.mp3"
+         with open(audio_path, 'wb') as audio_file:
+             audio_file.write(speeches[i])
+
+         # Create an AudioClip
+         audioclip = AudioFileClip(audio_path)
+
+         # Set the duration of the video clip to match the audio duration
          videoclip = img_clip.set_duration(audioclip.duration)
          zoomed_clip = apply_zoom_in_effect(videoclip, zoom_factor)

+         # Generate captions using the text for speech generation
+         caption = process_script(script)['text_for_speech_generation'][i]

+         temp_video_path = f"./temp_zoomed_{i}.mp4"
          zoomed_clip.write_videofile(temp_video_path, codec='libx264', fps=24)
          temp_files.append(temp_video_path)

+         final_video_path = f"./temp_captioned_{i}.mp4"
          add_text_to_video(temp_video_path, final_video_path, caption, duration=1, fontsize=60)
          temp_files.append(final_video_path)

@@ -350,116 +256,17 @@ def create_video_from_images_and_audio(images_dir, speeches_dir, zoom_factor=1.2
          clips.append(final_clip)

      final_clip = concatenate_videoclips(clips)
+     final_clip.write_videofile("./final_video.mp4", codec='libx264', fps=24)

      # Close all video files properly
      for clip in clips:
          clip.close()

      # Remove all temporary files
+     # for temp_file in temp_files:
+     #     try:
+     #         os.remove(temp_file)
+     #     except Exception as e:
+     #         print(f"Error removing file {temp_file}: {e}")

+     return "./final_video.mp4"
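One thing to flag: the caption line above calls process_script(script), but script is not a parameter of create_video_from_images_and_audio, so it only resolves if a module-level script variable exists when the tool runs. A hedged end-to-end driver under that assumption; reaching the raw callables via .func is a detail of LangChain's @tool wrapper, and the sample script is invented:

import tools  # this module, assuming it is importable as tools.py

script = "<image>A sunrise over mountains</image><narration>Every day is a new beginning.</narration>"
tools.script = script  # satisfy the module-level lookup used for captions

images = tools.image_generator.func(script)    # list of PNG bytes
speeches = tools.generate_speech.func(script)  # list of MP3 bytes
video_path = tools.create_video_from_images_and_audio.func(images, speeches)
print(f"Video created at: {video_path}")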