Spaces:
Sleeping
Sleeping
Update services/embed_service/utils.py
Browse files
services/embed_service/utils.py
CHANGED
@@ -160,30 +160,7 @@ class DocumentEmbedder:
|
|
160 |
# basic items
|
161 |
self.files = files
|
162 |
#self.interactive = interactive
|
163 |
-
|
164 |
-
def clean_text(self, content: str) -> str:
|
165 |
-
"""
|
166 |
-
Remove unwanted characters and patterns in text input.
|
167 |
-
:param content: Text input.
|
168 |
-
:return: Cleaned version of original text input.
|
169 |
-
"""
|
170 |
-
|
171 |
-
# Fix hyphenated words broken by newline
|
172 |
-
content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)
|
173 |
-
|
174 |
-
# Remove specific unwanted patterns and characters
|
175 |
-
unwanted_patterns = [
|
176 |
-
"\\n", " β", "ββββββββββ", "βββββββββ", "βββββ",
|
177 |
-
r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7'
|
178 |
-
]
|
179 |
-
for pattern in unwanted_patterns:
|
180 |
-
content = re.sub(pattern, "", content)
|
181 |
-
|
182 |
-
# Fix improperly spaced hyphenated words and normalize whitespace
|
183 |
-
content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
|
184 |
-
content = re.sub(r'\s+', ' ', content)
|
185 |
-
|
186 |
-
return content
|
187 |
|
188 |
def embed(self):
|
189 |
"""String together the steps above to embed and upsert directly to Pinecone."""
|
|
|
160 |
# basic items
|
161 |
self.files = files
|
162 |
#self.interactive = interactive
|
163 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
|
165 |
def embed(self):
|
166 |
"""String together the steps above to embed and upsert directly to Pinecone."""
|