Spaces:
Sleeping
Sleeping
Update services/embed_service/utils.py
Browse files
services/embed_service/utils.py
CHANGED
@@ -9,6 +9,30 @@ from llama_index import Document
|
|
9 |
import gc
|
10 |
import re
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
class VTTTranscriptLoader:
|
14 |
"""
|
@@ -171,7 +195,7 @@ class DocumentEmbedder:
|
|
171 |
# Call clean function
|
172 |
print("cleaning files")
|
173 |
for d in range(len(results)):
|
174 |
-
results[d].text =
|
175 |
|
176 |
# run pinecone in batches (of 1) for memory preservation.
|
177 |
print("reading into pinecone db")
|
|
|
9 |
import gc
|
10 |
import re
|
11 |
|
12 |
+
def clean_text(content: str) -> str:
    """
    Remove unwanted characters and patterns in text input.

    Steps, in order:
      1. Rejoin words that were hyphenated across a line break
         ("well-\\nknown" -> "wellknown").
      2. Turn every remaining line break into a space so that words on
         adjacent lines stay separated.
      3. Strip a fixed list of unwanted patterns.
      4. Tighten whitespace around hyphens that sit between word characters
         and collapse every whitespace run to a single space.

    :param content: Text input.
    :return: Cleaned version of original text input.
    """
    # Fix hyphenated words broken by newline.
    content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)

    # Replace (rather than delete) the remaining newlines: deleting them
    # glued the last word of one line to the first word of the next.
    content = content.replace("\n", " ")

    # Remove specific unwanted patterns and characters.
    # NOTE(review): the "β" runs look like mis-encoded separator characters;
    # confirm against the encoding of the source documents.
    unwanted_patterns = [
        " β", "ββββββββββ", "βββββββββ", "βββββ",
        # Literal backslash-u escapes left in the text, then two specific
        # code points (U+F075, U+F0B7) written as regex \uXXXX escapes.
        r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7'
    ]
    for pattern in unwanted_patterns:
        content = re.sub(pattern, "", content)

    # Fix improperly spaced hyphenated words. Lookarounds keep the
    # neighbouring word characters out of the match, so chains such as
    # "a - b - c" are fully normalised instead of only the first pair.
    content = re.sub(r'(?<=\w)\s*-\s*(?=\w)', '-', content)

    # Normalize whitespace.
    content = re.sub(r'\s+', ' ', content)

    return content
|
35 |
+
|
36 |
|
37 |
class VTTTranscriptLoader:
|
38 |
"""
|
|
|
195 |
# Call clean function
|
196 |
print("cleaning files")
|
197 |
for d in range(len(results)):
|
198 |
+
results[d].text = clean_text(results[d].text)
|
199 |
|
200 |
# run pinecone in batches (of 1) for memory preservation.
|
201 |
print("reading into pinecone db")
|