Spaces:
Sleeping
Sleeping
import logging
import os
import re
from datetime import datetime

import webvtt
from llama_index import Document
class VTTTranscriptLoader:
    """Load WebVTT (.vtt) transcripts as llama_index ``Document`` objects.

    VTT files are not recognized by llamaindex, so this class ingests and
    cleans them itself. The output is meant to mirror that of any document
    loader from llamaindex or langchain: ``load()`` returns a list of
    documents, each carrying the transcript text plus a metadata dict.
    """

    # WebVTT voice tag, e.g. ``<v Alice>`` or the class-annotated form
    # ``<v.loud Alice>``; group 1 is the speaker name.
    _VOICE_TAG = re.compile(r"<v(?:\.[^\s>]+)*\s+([^>]+)>")
    # A YYYY-MM-DD or YYYY_MM_DD date embedded anywhere in a path.
    _DATE_IN_PATH = re.compile(r"\d{4}[-_]\d{2}[-_]\d{2}")

    def __init__(self, file_path):
        """Remember the file or directory that ``load()`` will process.

        Args:
            file_path: Path to a single .vtt file, or to a directory that is
                searched recursively for .vtt files.
        """
        self.fp = file_path
        self.data = None

    def open_vtt(self, file_path, plaintext=True):
        """Read a VTT file.

        Args:
            file_path: Path of the file to read.
            plaintext: When true, return the raw lines of the file (voice
                tags included). When false, parse the file with ``webvtt``.

        Returns:
            A list of lines, or whatever ``webvtt.read`` returns.
        """
        if plaintext:
            # WebVTT is UTF-8 by specification; do not depend on the locale.
            with open(file_path, "r", encoding="utf-8") as handle:
                return handle.readlines()
        return webvtt.read(file_path)

    def extract_speaker_name(self, text):
        """Extract the speaker name from a line containing a voice tag.

        Args:
            text: One raw line of a VTT file.

        Returns:
            The name inside the first ``<v ...>`` tag with surrounding
            whitespace removed, or ``None`` when the line has no voice tag
            or the tag carries no name.
        """
        match = self._VOICE_TAG.search(text)
        if match is None:
            return None
        return match.group(1).strip() or None

    def extract_speaker_words(self, captions):
        """Return the ``text`` attribute of every caption, in order."""
        return [caption.text for caption in captions]

    def merge_speaker_words(self, words, speakers, split=True):
        """Join speaker names with their words.

        Names and captions are paired by position: the n-th voice tag found
        in ``speakers`` goes with the n-th caption in ``words``.

        Args:
            words: Captions (objects with a ``text`` attribute).
            speakers: Raw lines of the same transcript.
            split: When true, return a list of ``(name, text)`` tuples.
                When false, return one string with a ``name: 'text'`` line
                per pair.

        Returns:
            A ``(combined, speaker_list)`` tuple, where ``speaker_list``
            holds every speaker name found, in order of appearance and with
            repeats.
        """
        speaker_list = []
        for line in speakers:
            name = self.extract_speaker_name(line)
            if name:
                speaker_list.append(name)

        words_list = self.extract_speaker_words(words)

        if len(speaker_list) != len(words_list):
            # zip() below stops at the shorter list; surface the mismatch
            # instead of silently dropping (and possibly misattributing) text.
            logging.getLogger(__name__).warning(
                "Found %d voice tags but %d captions; pairing is positional "
                "and the surplus is dropped, so speakers may be misattributed.",
                len(speaker_list),
                len(words_list),
            )

        combined = list(zip(speaker_list, words_list))
        if not split:
            combined = "\n".join(f"{name}: '{text}'" for name, text in combined)
        return combined, speaker_list

    @staticmethod
    def _timestamp_to_seconds(timestamp):
        """Convert ``HH:MM:SS.mmm`` (or ``MM:SS.mmm``) to seconds.

        Raises:
            ValueError: If a component is not numeric.
        """
        seconds = 0.0
        for part in timestamp.strip().split(":"):
            seconds = seconds * 60 + float(part)
        return seconds

    def _meeting_date(self, file_path):
        """Return the first valid date in ``file_path`` as ``YYYY-MM-DD``, else ``None``."""
        match = self._DATE_IN_PATH.search(file_path)
        if match is None:
            return None
        candidate = match.group().replace("_", "-")
        try:
            return datetime.strptime(candidate, "%Y-%m-%d").date().strftime("%Y-%m-%d")
        except ValueError:
            # Date-shaped but impossible (e.g. month 13): treat as "no date".
            return None

    def get_metadata(self, speaker_list, file_path, captions=None):
        """Generate metadata for the transcript.

        Args:
            speaker_list: Speaker names, repeats allowed.
            file_path: Path of the transcript; used as the title and searched
                for a ``YYYY-MM-DD`` / ``YYYY_MM_DD`` meeting date.
            captions: Already-parsed captions (objects with ``start`` and
                ``end`` timestamp strings). When omitted, the file is parsed
                with ``webvtt``.

        Returns:
            A dict with keys ``title``, ``duration`` (minutes between the
            first caption's start and the last caption's end, ``0.0`` for a
            transcript without captions), ``meeting_date`` (``YYYY-MM-DD``
            string or ``None``) and ``speakers`` (unique names in order of
            first appearance).
        """
        if captions is None:
            captions = self.open_vtt(file_path, plaintext=False)

        try:
            first, last = captions[0], captions[-1]
        except IndexError:
            minutes = 0.0
        else:
            started = self._timestamp_to_seconds(first.start)
            ended = self._timestamp_to_seconds(last.end)
            minutes = (ended - started) / 60

        return {
            'title': file_path,
            'duration': minutes,
            'meeting_date': self._meeting_date(file_path),
            # dict.fromkeys de-duplicates while keeping a stable order.
            'speakers': list(dict.fromkeys(speaker_list)),
        }

    def manual_document(self, output, metadata):
        """Create a ``Document`` from the text and attach the metadata to it."""
        document = Document(text=output)
        document.metadata = metadata
        return document

    def process_file(self, file_path):
        """Process a single VTT file.

        Args:
            file_path: Path of the .vtt file.

        Returns:
            A ``Document`` whose text has one ``name: 'text'`` line per
            caption and whose metadata comes from ``get_metadata``.
        """
        # Captions supply the cleaned words; the raw lines supply voice tags.
        captions = self.open_vtt(file_path, plaintext=False)
        raw_lines = self.open_vtt(file_path, plaintext=True)
        output, speaker_list = self.merge_speaker_words(captions, raw_lines, split=False)
        # Reuse the parsed captions so the file is not parsed a second time.
        metadata = self.get_metadata(speaker_list, file_path, captions=captions)
        return self.manual_document(output, metadata)

    def load(self):
        """Process the configured file, or every .vtt file under the directory.

        Returns:
            A list of documents. For a directory, files are visited in
            sorted order and the ``.vtt`` extension is matched
            case-insensitively.
        """
        if not os.path.isdir(self.fp):
            return [self.process_file(self.fp)]

        results = []
        for root, dirs, files in os.walk(self.fp):
            dirs.sort()  # in-place, so os.walk descends in a stable order
            for file_name in sorted(files):
                if file_name.lower().endswith('.vtt'):
                    results.append(self.process_file(os.path.join(root, file_name)))
        return results