multi-meeting-QnA

Sleeping

App Files Files Community

tykiww committed on Jul 29, 2024

Commit

61cefee

•

1 Parent(s): 262ffbf

Delete utilities/transcript_loader.py

Browse files

Files changed (1) hide show

utilities/transcript_loader.py +0 -114

utilities/transcript_loader.py DELETED Viewed

@@ -1,114 +0,0 @@
-import os
-import webvtt
-import re
-from datetime import datetime
-from llama_index import Document
class VTTTranscriptLoader:
    """Load WebVTT meeting transcripts as llama_index ``Document`` objects.

    VTT files are not recognized by llamaindex, so this class reads and
    cleans them itself. The output should mirror that of any document
    loader from llamaindex or langchain: ``load()`` returns a list of
    documents, one per transcript file, each carrying a metadata dict.
    """

    # Speaker name inside a WebVTT voice tag, e.g. ``<v Alice>``.
    _SPEAKER_TAG = re.compile(r"<v (.*?)>")
    # Date embedded in a file path, e.g. ``2024-07-29`` or ``2024_07_29``.
    _DATE_IN_PATH = re.compile(r"\d{4}[-_]\d{2}[-_]\d{2}")
    # Format used to parse the caption ``start`` / ``end`` timestamps.
    _TIME_FORMAT = "%H:%M:%S.%f"

    def __init__(self, file_path):
        """Remember the path to load.

        Args:
            file_path: A single ``.vtt`` file, or a directory that is
                searched recursively for ``.vtt`` files by ``load()``.
        """
        self.fp = file_path
        # Not populated by any method of this class; kept so existing
        # callers that read the attribute keep working.
        self.data = None

    def open_vtt(self, file_path, plaintext=True):
        """Read a VTT file.

        Args:
            file_path: Path of the file to read.
            plaintext: If true, return the raw lines of the file; otherwise
                return the result of ``webvtt.read(file_path)``.

        Returns:
            A list of lines (line endings included) when ``plaintext`` is
            true, else the parsed captions object from ``webvtt``.
        """
        if plaintext:
            # Decode explicitly instead of relying on the platform default;
            # "utf-8-sig" also drops a leading byte-order mark if present.
            with open(file_path, "r", encoding="utf-8-sig") as f:
                return f.readlines()
        return webvtt.read(file_path)

    def extract_speaker_name(self, text):
        """Return the speaker named in a ``<v ...>`` tag of *text*, or None."""
        match = self._SPEAKER_TAG.search(text)
        return match.group(1) if match else None

    def extract_speaker_words(self, captions):
        """Return the ``text`` attribute of every caption, in order."""
        return [caption.text for caption in captions]

    def merge_speaker_words(self, words, speakers, split=True):
        """Join speaker names with their words.

        Args:
            words: Iterable of caption objects exposing a ``text`` attribute.
            speakers: Iterable of raw transcript lines; only lines that
                contain a ``<v ...>`` tag contribute a speaker name.
            split: If true, return the pairs as a list of
                ``(speaker, text)`` tuples; otherwise return one string
                with a ``speaker: 'text'`` line per pair.

        Returns:
            A tuple ``(combined, speaker_list)`` where ``speaker_list``
            holds every extracted speaker name in file order.
        """
        # Run the regex once per line and keep only lines that name a speaker.
        speaker_list = [
            name for name in map(self.extract_speaker_name, speakers) if name
        ]
        words_list = self.extract_speaker_words(words)
        # Pairing is positional: the i-th speaker tag goes with the i-th
        # caption. zip() stops at the shorter list, so unmatched trailing
        # entries are dropped.
        combined_list = list(zip(speaker_list, words_list))
        if not split:
            combined_list = '\n'.join(
                [f"{name}: '{text}'" for name, text in combined_list]
            )
        return combined_list, speaker_list

    def get_metadata(self, speaker_list, file_path):
        """Generate metadata for the transcript.

        Args:
            speaker_list: Speaker names found in the transcript (duplicates
                allowed).
            file_path: Path of the transcript; also searched for a
                ``YYYY-MM-DD`` / ``YYYY_MM_DD`` meeting date.

        Returns:
            A dict with keys ``title`` (the path as given), ``duration``
            (minutes between the first caption's start and the last
            caption's end, ``0.0`` for a transcript without captions),
            ``meeting_date`` (``"YYYY-MM-DD"`` string, or None when the path
            holds no valid date) and ``speakers`` (sorted unique names).
        """
        # Meeting length
        captions = list(self.open_vtt(file_path, plaintext=False))
        if captions:
            started = datetime.strptime(captions[0].start, self._TIME_FORMAT)
            ended = datetime.strptime(captions[-1].end, self._TIME_FORMAT)
            # total_seconds() keeps the whole interval; .seconds would drop
            # the days component of the timedelta.
            minutes = (ended - started).total_seconds() / 60
        else:
            minutes = 0.0

        # Meeting date: stays None when the path has no date or the digits
        # do not form a real calendar date.
        meeting_date = None
        match = self._DATE_IN_PATH.search(file_path)
        if match:
            date_str = match.group().replace('_', '-')
            try:
                parsed = datetime.strptime(date_str, "%Y-%m-%d").date()
            except ValueError:
                parsed = None
            if parsed is not None:
                meeting_date = parsed.strftime("%Y-%m-%d")

        return {
            'title': file_path,
            'duration': minutes,
            'meeting_date': meeting_date,
            # Sorted so the metadata is reproducible between runs.
            'speakers': sorted(set(speaker_list)),
        }

    def manual_document(self, output, metadata):
        """Create a ``Document`` from *output* and attach *metadata* to it."""
        document = Document(text=output)
        document.metadata = metadata
        return document

    def process_file(self, file_path):
        """Process a single VTT file and return it as a ``Document``.

        The document text is the ``speaker: 'words'`` rendering of the
        transcript and its metadata comes from ``get_metadata``.
        """
        # Get words as webvtt captions
        words = self.open_vtt(file_path, plaintext=False)
        # Get speaker lines as plaintext
        speaker = self.open_vtt(file_path, plaintext=True)
        # Combine speaker names and words
        output, speaker_list = self.merge_speaker_words(words, speaker, split=False)
        # Get session data as dictionary
        metadata = self.get_metadata(speaker_list, file_path)
        return self.manual_document(output, metadata)

    def load(self):
        """Process the configured path and return a list of documents.

        If ``self.fp`` is a directory it is walked recursively and every
        file whose name ends in ``.vtt`` is processed; otherwise ``self.fp``
        is processed as a single file.
        """
        if not os.path.isdir(self.fp):
            return [self.process_file(self.fp)]
        results = []
        for root, _, files in os.walk(self.fp):
            for file in files:
                if file.endswith('.vtt'):
                    results.append(self.process_file(os.path.join(root, file)))
        return results