Cache parallel processes

Changed files:
- app.py (+54 −27)
- cli.py (+1 −1)
- src/vadParallel.py (+83 −4)
- src/whisperContainer.py (+31 −3)
app.py CHANGED

```diff
@@ -1,3 +1,4 @@
+import math
 from typing import Iterator
 import argparse
 
@@ -5,12 +6,11 @@ from io import StringIO
 import os
 import pathlib
 import tempfile
-from src.vadParallel import ParallelTranscription
+from src.vadParallel import ParallelContext, ParallelTranscription
 
-from src.whisperContainer import WhisperContainer
+from src.whisperContainer import WhisperContainer, WhisperModelCache
 
 # External programs
-import whisper
 import ffmpeg
 
 # UI
@@ -50,13 +50,15 @@ LANGUAGES = [
 ]
 
 class WhisperTranscriber:
-    def __init__(self, inputAudioMaxDuration: float = DEFAULT_INPUT_AUDIO_MAX_DURATION, deleteUploadedFiles: bool = DELETE_UPLOADED_FILES):
-        self.model_cache = dict()
+    def __init__(self, input_audio_max_duration: float = DEFAULT_INPUT_AUDIO_MAX_DURATION, vad_process_timeout: float = None, delete_uploaded_files: bool = DELETE_UPLOADED_FILES):
+        self.model_cache = WhisperModelCache()
         self.parallel_device_list = None
+        self.parallel_context = None
+        self.vad_process_timeout = vad_process_timeout
 
         self.vad_model = None
-        self.inputAudioMaxDuration = inputAudioMaxDuration
-        self.deleteUploadedFiles = deleteUploadedFiles
+        self.inputAudioMaxDuration = input_audio_max_duration
+        self.deleteUploadedFiles = delete_uploaded_files
 
     def transcribe_webui(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
         try:
@@ -66,11 +68,7 @@ class WhisperTranscriber:
             selectedLanguage = languageName.lower() if len(languageName) > 0 else None
             selectedModel = modelName if modelName is not None else "base"
 
-            model = self.model_cache.get(selectedModel, None)
-
-            if not model:
-                model = WhisperContainer(selectedModel)
-                self.model_cache[selectedModel] = model
+            model = WhisperContainer(model_name=selectedModel, cache=self.model_cache)
 
             # Execute whisper
             result = self.transcribe_file(model, source, selectedLanguage, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
@@ -124,18 +122,34 @@ class WhisperTranscriber:
             result = self.process_vad(audio_path, whisperCallable, periodic_vad, period_config)
 
         else:
-            # Default VAD
-            result = whisperCallable(audio_path, 0, None, None)
+            if (self._has_parallel_devices()):
+                # Use a simple period transcription instead, as we need to use the parallel context
+                periodic_vad = VadPeriodicTranscription()
+                period_config = PeriodicTranscriptionConfig(periodic_duration=math.inf, max_prompt_window=1)
+
+                result = self.process_vad(audio_path, whisperCallable, periodic_vad, period_config)
+            else:
+                # Default VAD
+                result = whisperCallable(audio_path, 0, None, None)
 
         return result
 
     def process_vad(self, audio_path, whisperCallable, vadModel: AbstractTranscription, vadConfig: TranscriptionConfig):
-        if (self.parallel_device_list is None or len(self.parallel_device_list) == 0):
+        if (not self._has_parallel_devices()):
             # No parallel devices, so just run the VAD and Whisper in sequence
             return vadModel.transcribe(audio_path, whisperCallable, vadConfig)
 
-        parallel_vad = ParallelTranscription()
-        return parallel_vad.transcribe_parallel(transcription=vadModel, audio=audio_path, whisperCallable=whisperCallable, config=vadConfig, devices=self.parallel_device_list)
+        # Create parallel context if needed
+        if (self.parallel_context is None):
+            # Create a context with processes, and automatically clear the pool after a period of inactivity
+            self.parallel_context = ParallelContext(num_processes=len(self.parallel_device_list), auto_cleanup_timeout_seconds=self.vad_process_timeout)
+
+        parallel_vad = ParallelTranscription()
+        return parallel_vad.transcribe_parallel(transcription=vadModel, audio=audio_path, whisperCallable=whisperCallable,
+                                                config=vadConfig, devices=self.parallel_device_list, parallel_context=self.parallel_context)
+
+    def _has_parallel_devices(self):
+        return self.parallel_device_list is not None and len(self.parallel_device_list) > 0
 
     def _concat_prompt(self, prompt1, prompt2):
         if (prompt1 is None):
@@ -177,7 +191,7 @@ class WhisperTranscriber:
         return output_files, text, vtt
 
     def clear_cache(self):
-        self.model_cache = dict()
+        self.model_cache.clear()
        self.vad_model = None
 
    def __get_source(self, urlData, uploadFile, microphoneData):
@@ -229,9 +243,16 @@ class WhisperTranscriber:
 
         return file.name
 
+    def close(self):
+        self.clear_cache()
+
+        if (self.parallel_context is not None):
+            self.parallel_context.close()
+
 
-def create_ui(inputAudioMaxDuration, share=False, server_name: str = None, server_port: int = 7860, vad_parallel_devices: str = None):
-    ui = WhisperTranscriber(inputAudioMaxDuration)
+def create_ui(input_audio_max_duration, share=False, server_name: str = None, server_port: int = 7860,
+              default_model_name: str = "medium", default_vad: str = None, vad_parallel_devices: str = None, vad_process_timeout: float = None):
+    ui = WhisperTranscriber(input_audio_max_duration, vad_process_timeout)
 
     # Specify a list of devices to use for parallel processing
     ui.parallel_device_list = [ device.strip() for device in vad_parallel_devices.split(",") ] if vad_parallel_devices else None
@@ -242,19 +263,19 @@ def create_ui(inputAudioMaxDuration, share=False, server_name: str = None, serve
 
     ui_description += "\n\n\n\nFor longer audio files (>10 minutes) not in English, it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
 
-    if inputAudioMaxDuration > 0:
-        ui_description += "\n\n" + "Max audio file length: " + str(inputAudioMaxDuration) + " s"
+    if input_audio_max_duration > 0:
+        ui_description += "\n\n" + "Max audio file length: " + str(input_audio_max_duration) + " s"
 
     ui_article = "Read the [documentation here](https://huggingface.co/spaces/aadnk/whisper-webui/blob/main/docs/options.md)"
 
     demo = gr.Interface(fn=ui.transcribe_webui, description=ui_description, article=ui_article, inputs=[
-        gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
+        gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value=default_model_name, label="Model"),
         gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
         gr.Text(label="URL (YouTube, etc.)"),
         gr.Audio(source="upload", type="filepath", label="Upload Audio"),
         gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
         gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
-        gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], label="VAD"),
+        gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=default_vad, label="VAD"),
         gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
         gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
         gr.Number(label="VAD - Padding (s)", precision=None, value=1),
@@ -265,15 +286,21 @@ def create_ui(inputAudioMaxDuration, share=False, server_name: str = None, serve
         gr.Text(label="Segments")
     ])
 
     demo.launch(share=share, server_name=server_name, server_port=server_port)
+
+    # Clean up
+    ui.close()
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument("--inputAudioMaxDuration", type=int, default=600, help="Maximum audio file length in seconds, or -1 for no limit.")
+    parser.add_argument("--input_audio_max_duration", type=int, default=600, help="Maximum audio file length in seconds, or -1 for no limit.")
     parser.add_argument("--share", type=bool, default=False, help="True to share the app on HuggingFace.")
    parser.add_argument("--server_name", type=str, default=None, help="The host or IP to bind to. If None, bind to localhost.")
    parser.add_argument("--server_port", type=int, default=7860, help="The port to bind to.")
-    parser.add_argument("--vad_parallel_devices", type=str, default="", help="A comma delimited list of CUDA devices to use for parallel processing. If empty, disable parallel processing.")
+    parser.add_argument("--default_model_name", type=str, default="medium", help="The default model name.")
+    parser.add_argument("--default_vad", type=str, default="silero-vad", help="The default VAD.")
+    parser.add_argument("--vad_parallel_devices", type=str, default="", help="A comma delimited list of CUDA devices to use for parallel processing. If empty, disable parallel processing.")
+    parser.add_argument("--vad_process_timeout", type=float, default=1800, help="The number of seconds before inactive processes are terminated. Use 0 to close processes immediately, or None for no timeout.")
 
     args = parser.parse_args().__dict__
     create_ui(**args)
```
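The net effect in app.py: models are resolved through a shared WhisperModelCache, parallel VAD work reuses a long-lived ParallelContext that is reaped after `--vad_process_timeout` seconds of inactivity, and the new defaults are exposed as command-line flags. A minimal launch sketch against the new create_ui() signature; it assumes the Space's dependencies (gradio, whisper, ffmpeg) are installed, and the device list is illustrative:

```python
# Hypothetical programmatic launch, equivalent to running app.py with flags.
from app import create_ui

create_ui(
    input_audio_max_duration=600,   # reject uploads longer than 10 minutes
    default_model_name="medium",    # preselected entry in the Model dropdown
    default_vad="silero-vad",       # preselected entry in the VAD dropdown
    vad_parallel_devices="0,1",     # split VAD chunks across CUDA devices 0 and 1
    vad_process_timeout=1800,       # close the idle worker pool after 30 minutes
)
```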
cli.py CHANGED

```diff
@@ -74,7 +74,7 @@ def cli():
     vad_prompt_window = args.pop("vad_prompt_window")
 
     model = whisper.load_model(model_name, device=device, download_root=model_dir)
-    transcriber = WhisperTranscriber(deleteUploadedFiles=False)
+    transcriber = WhisperTranscriber(delete_uploaded_files=False)
     transcriber.parallel_device_list = args.pop("vad_parallel_devices")
 
     for audio_path in args.pop("audio"):
```
src/vadParallel.py CHANGED

```diff
@@ -1,4 +1,6 @@
 import multiprocessing
+import threading
+import time
 from src.vad import AbstractTranscription, TranscriptionConfig
 from src.whisperContainer import WhisperCallback
 
@@ -7,6 +9,68 @@ from multiprocessing import Pool
 from typing import List
 import os
 
+
+class ParallelContext:
+    def __init__(self, num_processes: int = None, auto_cleanup_timeout_seconds: float = None):
+        self.num_processes = num_processes
+        self.auto_cleanup_timeout_seconds = auto_cleanup_timeout_seconds
+        self.lock = threading.Lock()
+
+        self.ref_count = 0
+        self.pool = None
+        self.cleanup_timer = None
+
+    def get_pool(self):
+        # Initialize pool lazily
+        if (self.pool is None):
+            context = multiprocessing.get_context('spawn')
+            self.pool = context.Pool(self.num_processes)
+
+        self.ref_count = self.ref_count + 1
+
+        if (self.auto_cleanup_timeout_seconds is not None):
+            self._stop_auto_cleanup()
+
+        return self.pool
+
+    def return_pool(self, pool):
+        if (self.pool == pool and self.ref_count > 0):
+            self.ref_count = self.ref_count - 1
+
+            if (self.ref_count == 0):
+                if (self.auto_cleanup_timeout_seconds is not None):
+                    self._start_auto_cleanup()
+
+    def _start_auto_cleanup(self):
+        if (self.cleanup_timer is not None):
+            self.cleanup_timer.cancel()
+        self.cleanup_timer = threading.Timer(self.auto_cleanup_timeout_seconds, self._execute_cleanup)
+        self.cleanup_timer.start()
+
+        print("Started auto cleanup of pool in " + str(self.auto_cleanup_timeout_seconds) + " seconds")
+
+    def _stop_auto_cleanup(self):
+        if (self.cleanup_timer is not None):
+            self.cleanup_timer.cancel()
+            self.cleanup_timer = None
+
+            print("Stopped auto cleanup of pool")
+
+    def _execute_cleanup(self):
+        print("Executing cleanup of pool")
+
+        if (self.ref_count == 0):
+            self.close()
+
+    def close(self):
+        self._stop_auto_cleanup()
+
+        if (self.pool is not None):
+            print("Closing pool of " + str(self.num_processes) + " processes")
+            self.pool.close()
+            self.pool.join()
+            self.pool = None
+
 class ParallelTranscriptionConfig(TranscriptionConfig):
     def __init__(self, device_id: str, override_timestamps, initial_segment_index, copy: TranscriptionConfig = None):
         super().__init__(copy.non_speech_strategy, copy.segment_padding_left, copy.segment_padding_right, copy.max_silent_period, copy.max_merge_size, copy.max_prompt_window, initial_segment_index)
@@ -18,7 +82,7 @@ class ParallelTranscription(AbstractTranscription):
         super().__init__(sampling_rate=sampling_rate)
 
 
-    def transcribe_parallel(self, transcription: AbstractTranscription, audio: str, whisperCallable: WhisperCallback, config: TranscriptionConfig, devices: List[str]):
+    def transcribe_parallel(self, transcription: AbstractTranscription, audio: str, whisperCallable: WhisperCallback, config: TranscriptionConfig, devices: List[str], parallel_context: ParallelContext = None):
         # First, get the timestamps for the original audio
         merged = transcription.get_merged_timestamps(audio, config)
 
@@ -45,12 +109,19 @@ class ParallelTranscription(AbstractTranscription):
             'language': None
         }
 
+        created_context = False
+
         # Spawn a separate process for each device
-        context = multiprocessing.get_context('spawn')
+        try:
+            if (parallel_context is None):
+                parallel_context = ParallelContext(len(devices))
+                created_context = True
+
+            # Get a pool of processes
+            pool = parallel_context.get_pool()
 
-        with context.Pool(len(devices)) as p:
             # Run the transcription in parallel
-            results = p.starmap(self.transcribe, parameters)
+            results = pool.starmap(self.transcribe, parameters)
 
             for result in results:
                 # Merge the results
@@ -61,6 +132,14 @@ class ParallelTranscription(AbstractTranscription):
                 if (result['language'] is not None):
                     merged['language'] = result['language']
 
+        finally:
+            # Return the pool to the context
+            if (parallel_context is not None):
+                parallel_context.return_pool(pool)
+            # Always close the context if we created it
+            if (created_context):
+                parallel_context.close()
+
         return merged
 
     def get_transcribe_timestamps(self, audio: str, config: ParallelTranscriptionConfig):
```
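The core of the commit is ParallelContext: a reference-counted wrapper around a lazily created 'spawn' pool, with a threading.Timer that closes the pool after auto_cleanup_timeout_seconds of inactivity instead of tearing it down after every request. A usage sketch, assuming the repository root is on PYTHONPATH; the square worker is a hypothetical stand-in for the real transcription tasks:

```python
# Sketch of the ParallelContext pool lifecycle. The worker must live at
# module level: the pool uses the 'spawn' start method, so child processes
# locate the function by reference when unpickling tasks.
from src.vadParallel import ParallelContext

def square(x):
    return x * x

if __name__ == '__main__':
    context = ParallelContext(num_processes=2, auto_cleanup_timeout_seconds=5.0)

    pool = context.get_pool()      # creates the pool on first use, ref_count -> 1
    try:
        print(pool.map(square, [1, 2, 3]))
    finally:
        context.return_pool(pool)  # ref_count -> 0, arms the 5-second cleanup timer

    # A second get_pool() within 5 seconds would reuse the same worker
    # processes and cancel the timer; otherwise _execute_cleanup() closes the
    # pool. close() cancels the timer and tears the pool down immediately.
    context.close()
```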
src/whisperContainer.py CHANGED

```diff
@@ -1,18 +1,44 @@
 # External programs
 import whisper
 
+class WhisperModelCache:
+    def __init__(self):
+        self._cache = dict()
+
+    def get(self, model_name, device: str = None):
+        key = model_name + ":" + (device if device else '')
+
+        result = self._cache.get(key)
+
+        if result is None:
+            print("Loading whisper model " + model_name)
+            result = whisper.load_model(name=model_name, device=device)
+            self._cache[key] = result
+        return result
+
+    def clear(self):
+        self._cache.clear()
+
+# A global cache of models. This is mainly used by the daemon processes to avoid loading the same model multiple times.
+GLOBAL_WHISPER_MODEL_CACHE = WhisperModelCache()
+
 class WhisperContainer:
-    def __init__(self, model_name: str, device: str = None):
+    def __init__(self, model_name: str, device: str = None, cache: WhisperModelCache = None):
         self.model_name = model_name
         self.device = device
+        self.cache = cache
 
         # Will be created on demand
         self.model = None
 
     def get_model(self):
         if self.model is None:
-            print("Loading whisper model " + self.model_name)
-            self.model = whisper.load_model(self.model_name, device=self.device)
+
+            if (self.cache is None):
+                print("Loading whisper model " + self.model_name)
+                self.model = whisper.load_model(self.model_name, device=self.device)
+            else:
+                self.model = self.cache.get(self.model_name, device=self.device)
         return self.model
 
     def create_callback(self, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict):
@@ -44,6 +70,8 @@ class WhisperContainer:
         self.model_name = state["model_name"]
         self.device = state["device"]
         self.model = None
+        # Unpickled objects must use the global cache
+        self.cache = GLOBAL_WHISPER_MODEL_CACHE
 
 
 class WhisperCallback:
```
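WhisperContainer is pickled when it is shipped to a pool worker, and restoring its state rebinds the unpickled copy to GLOBAL_WHISPER_MODEL_CACHE, so each daemon process loads a given model at most once no matter how many containers it receives. A single-process sketch of the cache behavior; it assumes the whisper package is installed (the "tiny" weights download on first use):

```python
# Model reuse through WhisperModelCache. Cache keys have the form
# "model_name:device", so "tiny" with no device maps to the key "tiny:".
from src.whisperContainer import WhisperContainer, WhisperModelCache

cache = WhisperModelCache()

first = WhisperContainer(model_name="tiny", cache=cache)
second = WhisperContainer(model_name="tiny", cache=cache)

# Only the first call hits whisper.load_model(); the second resolves to a
# dictionary lookup on the shared key and returns the same model object.
assert first.get_model() is second.get_model()

cache.clear()  # drop cached models so they can be garbage-collected
```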