tykiww committed on
Commit
be97349
1 Parent(s): 68a6e02

Delete utilities/transcripts.py

Files changed (1)
  1. utilities/transcripts.py +0 -261
utilities/transcripts.py DELETED
@@ -1,261 +0,0 @@
- # Imports for Transcript Loader
- import os
- import webvtt
- import re
- from datetime import datetime
- from llama_index import Document
-
-
- # Imports for Document Embedder
- import gc
-
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
- from pinecone.grpc import PineconeGRPC
- from pinecone import ServerlessSpec
-
- from llama_index.vector_stores import PineconeVectorStore
- from llama_index.node_parser import SemanticSplitterNodeParser
- from llama_index.ingestion import IngestionPipeline
-
-
- class VTTTranscriptLoader:
-     """
-     VTT file ingestion and cleaning. This exists because .vtt files are
-     not recognized by llamaindex; the output mirrors that of any document
-     loader from llamaindex or langchain.
-     """
-
-     def __init__(self, file_path):
-         self.fp = file_path
-         self.data = None
-
-     def open_vtt(self, file_path, plaintext=True):
-         """Read a VTT file, either as plain text lines or as parsed captions."""
-         if plaintext:
-             with open(file_path, "r") as f:
-                 data = f.readlines()
-         else:
-             data = webvtt.read(file_path)
-         return data
-
-     def extract_speaker_name(self, text):
-         """Extracts the speaker name from a VTT caption."""
-         match = re.search(r"<v (.*?)>", text)
-         if match:
-             return match.group(1)
-         else:
-             return None
-
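-     # For reference, a WebVTT voice tag looks like:
-     #     <v Alice Smith>Hi, everyone.</v>
-     # and the regex above captures "Alice Smith" (illustrative sample cue,
-     # not taken from a real transcript).
-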
-     def extract_speaker_words(self, captions):
-         """Extracts the spoken text from a list of VTT captions."""
-         return [caption.text for caption in captions]
-
-     def merge_speaker_words(self, words, speakers, split=True):
-         """Joins speaker names with their words."""
-         # Extract speaker names
-         speaker_list = [self.extract_speaker_name(line) for line in speakers if self.extract_speaker_name(line)]
-         # Extract words
-         words_list = self.extract_speaker_words(words)
-         # Combine speaker names and words
-         combined_list = list(zip(speaker_list, words_list))
-         # Return the combined list as a single string if split is False
-         if not split:
-             combined_list = '\n'.join([f"{name}: '{text}'" for name, text in combined_list])
-         return combined_list, speaker_list
-
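-     # With split=False the merged output is one newline-joined string, e.g.
-     # "Alice: 'Hi, everyone.'" followed by "Bob: 'Good morning.'" on the
-     # next line (illustrative names).
-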
-     def get_metadata(self, speaker_list, file_path):
-         """Generates metadata for the transcript."""
-         # Meeting length
-         time_format = "%H:%M:%S.%f"
-         sess = self.open_vtt(file_path, plaintext=False)
-
-         dt1 = datetime.strptime(sess[0].start, time_format)
-         dt2 = datetime.strptime(sess[-1].end, time_format)
-
-         minutes = (dt2 - dt1).seconds / 60
-         # Meeting date, parsed from a YYYY-MM-DD or YYYY_MM_DD stamp in the filename
-         match = re.search(r"\d{4}[-_]\d{2}[-_]\d{2}", file_path)
-         if match:
-             date_str = match.group().replace('_', '-')
-             date_obj = datetime.strptime(date_str, "%Y-%m-%d").date()
-         else:
-             date_obj = None
-
-         # Pull dictionary here; guard against filenames with no date stamp
-         output = {
-             'title': file_path,
-             'duration': minutes,  # duration in minutes
-             'meeting_date': date_obj.strftime("%Y-%m-%d") if date_obj else None,
-             'speakers': list(set(speaker_list)),
-         }
-
-         return output
-
-     def manual_document(self, output, metadata):
-         """Create a llamaindex Document manually."""
-         document = Document(text=output)
-         document.metadata = metadata
-         return document
-
-     def process_file(self, file_path):
-         """Processes a single VTT file and returns the combined speaker names and words."""
-         # Get words as webvtt captions
-         words = self.open_vtt(file_path, plaintext=False)
-         # Get speaker lines as plaintext
-         speaker = self.open_vtt(file_path, plaintext=True)
-         # Combine speaker names and words
-         output, speaker_list = self.merge_speaker_words(words, speaker, split=False)
-         # Get session data as dictionary
-         metadata = self.get_metadata(speaker_list, file_path)
-
-         return self.manual_document(output, metadata)
-
-     def load(self):
-         """Processes all VTT files in the given list of directories or files and returns a list of results."""
-         results = []
-         for path in self.fp:
-             if os.path.isdir(path):
-                 for root, _, files in os.walk(path):
-                     for file in files:
-                         if file.endswith('.vtt'):
-                             file_path = os.path.join(root, file)
-                             transcript = self.process_file(file_path)
-                             results.append(transcript)
-             else:
-                 if path.endswith('.vtt'):
-                     transcript = self.process_file(path)
-                     results.append(transcript)
-         return results
-
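- # Minimal usage sketch (hypothetical paths; the constructor takes a list of
- # files and/or directories to walk):
- #
- #     loader = VTTTranscriptLoader(["transcripts/", "standup_2024_06_01.vtt"])
- #     docs = loader.load()
- #     print(docs[0].metadata)
-
-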
- class DocumentEmbedder:
-     """
-     Takes a document and embeds it directly into a Pinecone data store.
-     The process retrieves, cleans, embeds, and sends the documents to the
-     vector store.
-
-     Currently supports Hugging Face embeddings only. Gotta keep things cheap.
-     """
-
-     def __init__(self, api_keys, files, embedding, index_name):
-         # api keys
-         self.pinecone_api_key = api_keys['pinecone']
-         self.openai_api_key = api_keys['openai']
-         self.huggingface_api_key = api_keys['huggingface']
-         # pinecone
-         self.embedding = embedding
-         self.vector_db = index_name
-         # basic items
-         self.files = files
-         #self.interactive = interactive
-
-     def clean_text(self, content: str) -> str:
-         """
-         Remove unwanted characters and patterns in text input.
-         :param content: Text input.
-         :return: Cleaned version of original text input.
-         """
-
-         # Fix hyphenated words broken by newline
-         content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)
-
-         # Remove specific unwanted patterns and characters
-         unwanted_patterns = [
-             "\\n", " —", "——————————", "—————————", "—————",
-             r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7'
-         ]
-         for pattern in unwanted_patterns:
-             content = re.sub(pattern, "", content)
-
-         # Fix improperly spaced hyphenated words and normalize whitespace
-         content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
-         content = re.sub(r'\s+', ' ', content)
-
-         return content
-
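-     # Illustrative before/after (made-up input): clean_text("sum-\nmary  —  next")
-     # returns "summary next".
-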
-     def create_embedder(self):
-         """Get the right embedding model"""
-         embedding = HuggingFaceEmbedding(model_name=self.embedding)
-         return embedding
-
-     def pinecone_pipeline(self, embedding):
-         """Initialize pinecone connection and vectorstore"""
-
-         # connect
-         pc = PineconeGRPC(api_key=self.pinecone_api_key)
-
-         # Create your index if index does not exist
-         indexes = [i.name for i in pc.list_indexes()]
-         index_exists = any([self.vector_db in i for i in indexes])
-
-         if index_exists:
-             print("Index already exists")
-         else:
-             print("Creating index")
-             pc.create_index(
-                 self.vector_db,
-                 dimension=768,  # must match the embedding model's output dimension
-                 metric="cosine",
-                 spec=ServerlessSpec(cloud="aws", region="us-east-1"),
-             )
-
-         # Initialize your index
-         pinecone_index = pc.Index(self.vector_db)
-
-         # Initialize VectorStore
-         vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
-
-         # create pipeline (abstracts away the need to adaptively process and batch)
-         pipeline = IngestionPipeline(
-             transformations=[
-                 # creating appropriate chunks and cutoffs (this needs to be worked on).
-                 SemanticSplitterNodeParser(
-                     buffer_size=10,  # 1 = each sentence is a node
-                     breakpoint_percentile_threshold=95,
-                     embed_model=embedding,
-                 ),
-                 embedding,
-             ],
-             vector_store=vector_store
-         )
-
-         return pipeline
-
-
-     def embed(self):
-         """Strings the process above together to embed and upsert directly to Pinecone."""
-
-         # read files
-         print("reading files")
-         results = self.files
-
-         # Call clean function
-         print("cleaning files")
-         for d in range(len(results)):
-             results[d].text = self.clean_text(results[d].text)
-
-         # set up embedder
-         print("retrieving embedder")
-         embedder = self.create_embedder()
-
-         # set up pinecone pipeline
-         print("initializing pinecone db")
-         pipeline = self.pinecone_pipeline(embedder)
-
-         # run pinecone in batches (of 1) to limit peak memory use.
-         print("reading into pinecone db")
-         batchsize = 1
-         for i in range(0, len(results), batchsize):
-             gc.collect()
-             pipeline.run(documents=results[i:i+batchsize])
-             print("completed batch %s" % ((i + batchsize) // batchsize))
-
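- # End-to-end sketch. The API keys, index name, and embedding model below are
- # hypothetical placeholders (all-mpnet-base-v2 is one 768-dim model that
- # matches the index dimension above):
- #
- #     docs = VTTTranscriptLoader(["transcripts/"]).load()
- #     DocumentEmbedder(
- #         api_keys={"pinecone": "...", "openai": "...", "huggingface": "..."},
- #         files=docs,
- #         embedding="sentence-transformers/all-mpnet-base-v2",
- #         index_name="meeting-transcripts",
- #     ).embed()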