tykiww committed on
Commit
c14654e
•
1 Parent(s): 38a034d

Update services/embed_service/utils.py

Browse files
Files changed (1) hide show
  1. services/embed_service/utils.py +1 -24
services/embed_service/utils.py CHANGED
@@ -160,30 +160,7 @@ class DocumentEmbedder:
160
  # basic items
161
  self.files = files
162
  #self.interactive = interactive
163
-
164
- def clean_text(self, content: str) -> str:
165
- """
166
- Remove unwanted characters and patterns in text input.
167
- :param content: Text input.
168
- :return: Cleaned version of original text input.
169
- """
170
-
171
- # Fix hyphenated words broken by newline
172
- content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)
173
-
174
- # Remove specific unwanted patterns and characters
175
- unwanted_patterns = [
176
- "\\n", " β€”", "β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”", "β€”β€”β€”β€”β€”β€”β€”β€”β€”", "β€”β€”β€”β€”β€”",
177
- r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7'
178
- ]
179
- for pattern in unwanted_patterns:
180
- content = re.sub(pattern, "", content)
181
-
182
- # Fix improperly spaced hyphenated words and normalize whitespace
183
- content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
184
- content = re.sub(r'\s+', ' ', content)
185
-
186
- return content
187
 
188
  def embed(self):
189
  """stringing process above to embed and upsert directly to pinecone"""
 
160
  # basic items
161
  self.files = files
162
  #self.interactive = interactive
163
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
  def embed(self):
166
  """stringing process above to embed and upsert directly to pinecone"""