multi-meeting-QnA

Sleeping

App Files Files Community

tykiww committed on Jul 29, 2024

Commit

a5f561a

verified ·

1 Parent(s): f9b358b

Create transcript_loader.py

Browse files

Files changed (1) hide show

utilities/transcript_loader.py +114 -0

utilities/transcript_loader.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import os
+import webvtt
+import re
+from datetime import datetime
+from llama_index import Document
class VTTTranscriptLoader:
    """
    Ingest and clean WebVTT (.vtt) transcripts.

    This loader exists because vtt files are not recognized by llamaindex.
    Its output is meant to mirror that of a llamaindex/langchain document
    loader: ``load()`` returns a list of ``Document`` objects, one per
    transcript, each carrying the "Speaker: 'words'" text plus metadata.

    Args:
        file_path: Path to a single .vtt file, or to a directory that is
            searched recursively for .vtt files.
    """

    # Voice tag as it appears in the raw cue text, e.g. "<v Jane Doe>".
    _SPEAKER_RE = re.compile(r"<v (.*?)>")
    # Date embedded in the file path, e.g. 2024-07-29 or 2024_07_29.
    _DATE_RE = re.compile(r"\d{4}[-_]\d{2}[-_]\d{2}")
    # Timestamp layout used when parsing caption start/end strings.
    _TIME_FORMAT = "%H:%M:%S.%f"

    def __init__(self, file_path):
        self.fp = file_path
        self.data = None

    def open_vtt(self, file_path, plaintext=True):
        """Read a VTT file.

        Args:
            file_path: Path of the file to read.
            plaintext: When True, return the raw lines of the file (voice
                tags included); when False, return the object produced by
                ``webvtt.read``.

        Returns:
            A list of lines (``plaintext=True``) or the parsed captions.
        """
        if plaintext:
            # WebVTT files are UTF-8; do not depend on the locale default.
            with open(file_path, "r", encoding="utf-8") as f:
                return f.readlines()
        return webvtt.read(file_path)

    def extract_speaker_name(self, text):
        """Extract the speaker name from a raw VTT line.

        Returns:
            The text inside the first ``<v ...>`` tag, or None when the
            line has no such tag.
        """
        match = self._SPEAKER_RE.search(text)
        return match.group(1) if match else None

    def extract_speaker_words(self, captions):
        """Return the ``text`` attribute of every caption, in order."""
        return [caption.text for caption in captions]

    def merge_speaker_words(self, words, speakers, split=True):
        """Join speaker names with their words.

        Args:
            words: Iterable of caption objects exposing ``.text``.
            speakers: Iterable of raw file lines to scan for voice tags.
            split: When True, return the pairs as a list of
                ``(name, text)`` tuples; when False, return them as one
                newline-joined string of ``name: 'text'`` entries.

        Returns:
            A ``(combined, speaker_list)`` tuple, where ``speaker_list``
            holds every non-empty speaker name found, in file order.
        """
        # Run the regex once per line and keep only non-empty names.
        names = (self.extract_speaker_name(line) for line in speakers)
        speaker_list = [name for name in names if name]
        words_list = self.extract_speaker_words(words)
        # NOTE(review): names and words are paired purely by position,
        # which assumes every caption carries exactly one voice tag --
        # confirm against real transcripts.
        combined_list = list(zip(speaker_list, words_list))
        if not split:
            combined_list = '\n'.join(
                [f"{name}: '{text}'" for name, text in combined_list]
            )
        return combined_list, speaker_list

    def get_metadata(self, speaker_list, file_path):
        """Generate metadata for the transcript.

        Args:
            speaker_list: Speaker names (duplicates allowed).
            file_path: Path of the transcript; also scanned for a
                ``YYYY-MM-DD`` / ``YYYY_MM_DD`` meeting date.

        Returns:
            A dict with keys ``title`` (the file path), ``duration``
            (minutes between the first caption's start and the last
            caption's end, 0.0 when there are no captions),
            ``meeting_date`` (``"YYYY-MM-DD"`` string, or None when the
            path holds no valid date) and ``speakers`` (unique names in
            order of first appearance).
        """
        # Meeting length
        sess = self.open_vtt(file_path, plaintext=False)
        try:
            first, last = sess[0], sess[-1]
        except IndexError:
            # No captions at all: nothing to measure.
            minutes = 0.0
        else:
            dt1 = datetime.strptime(first.start, self._TIME_FORMAT)
            dt2 = datetime.strptime(last.end, self._TIME_FORMAT)
            # total_seconds() keeps the fractional part that .seconds drops.
            minutes = (dt2 - dt1).total_seconds() / 60

        # Meeting date
        meeting_date = None
        match = self._DATE_RE.search(file_path)
        if match:
            date_str = match.group().replace('_', '-')
            try:
                meeting_date = (
                    datetime.strptime(date_str, "%Y-%m-%d")
                    .date()
                    .strftime("%Y-%m-%d")
                )
            except ValueError:
                # Date-shaped but not a real date (e.g. month 13).
                meeting_date = None

        return {
            'title': file_path,
            'duration': minutes,
            'meeting_date': meeting_date,
            # dict.fromkeys de-duplicates while keeping a stable order.
            'speakers': list(dict.fromkeys(speaker_list)),
        }

    def manual_document(self, output, metadata):
        """Create a ``Document`` from text and attach the metadata to it."""
        document = Document(text=output)
        document.metadata = metadata
        return document

    def process_file(self, file_path):
        """Process a single VTT file into a ``Document``.

        The file is read twice: once through ``webvtt`` for the caption
        text, and once as plain text for the lines carrying voice tags.
        """
        # Get words as webvtt captions
        words = self.open_vtt(file_path, plaintext=False)
        # Get speaker lines as plaintext
        speaker = self.open_vtt(file_path, plaintext=True)
        # Combine speaker names and words into one string
        output, speaker_list = self.merge_speaker_words(words, speaker, split=False)
        # Get session data as dictionary
        metadata = self.get_metadata(speaker_list, file_path)
        return self.manual_document(output, metadata)

    def load(self):
        """Process the configured path and return a list of documents.

        When ``self.fp`` is a directory it is walked recursively and every
        file whose name ends in ``.vtt`` is processed (names are sorted
        within each directory so the result order is stable); otherwise
        ``self.fp`` itself is processed as a single file.
        """
        if not os.path.isdir(self.fp):
            return [self.process_file(self.fp)]

        results = []
        for root, _, files in os.walk(self.fp):
            for file in sorted(files):
                if file.endswith('.vtt'):
                    results.append(self.process_file(os.path.join(root, file)))
        return results