tykiww committed on
Commit
38a034d
•
1 Parent(s): 4c992c2

Update services/embed_service/utils.py

Browse files
Files changed (1) hide show
  1. services/embed_service/utils.py +25 -1
services/embed_service/utils.py CHANGED
@@ -9,6 +9,30 @@ from llama_index import Document
9
  import gc
10
  import re
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  class VTTTranscriptLoader:
14
  """
@@ -171,7 +195,7 @@ class DocumentEmbedder:
171
  # Call clean function
172
  print("cleaning files")
173
  for d in range(len(results)):
174
- results[d].text = self.clean_text(results[d].text)
175
 
176
  # run pinecone in batches (of 1) for memory preservation.
177
  print("reading into pinecone db")
 
9
  import gc
10
  import re
11
 
12
def clean_text(content: str) -> str:
    """
    Remove unwanted characters and patterns in text input.

    Steps, in order: re-join words that were hyphenated across a line
    break, delete a fixed list of junk patterns, tighten the spacing around
    hyphens, and collapse every run of whitespace (newlines included) into
    a single space.

    :param content: Text input.
    :return: Cleaned version of original text input.
    """
    # Written as an escape so the literal survives any re-encoding of this file.
    em_dash = "\u2014"

    # Fix hyphenated words broken by newline
    content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)

    # Remove specific unwanted patterns and characters.
    # Newlines are deliberately NOT listed here: deleting them outright glues
    # the last word of one line to the first word of the next
    # ("hello\nworld" -> "helloworld"). The whitespace normalization at the
    # end turns them into single spaces instead.
    unwanted_patterns = (
        " " + em_dash,          # dash preceded by a space
        em_dash * 10,           # separator rules made of em dashes
        em_dash * 9,
        em_dash * 5,
        r'\\u[\dA-Fa-f]{4}',    # literal "\uXXXX" escape text left in the input
        r'\uf075',              # private-use code point U+F075
        r'\uf0b7',              # private-use code point U+F0B7
    )
    for pattern in unwanted_patterns:
        content = re.sub(pattern, "", content)

    # Fix improperly spaced hyphenated words and normalize whitespace
    content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
    content = re.sub(r'\s+', ' ', content)

    return content
35
+
36
 
37
  class VTTTranscriptLoader:
38
  """
 
195
  # Call clean function
196
  print("cleaning files")
197
  for d in range(len(results)):
198
+ results[d].text = clean_text(results[d].text)
199
 
200
  # run pinecone in batches (of 1) for memory preservation.
201
  print("reading into pinecone db")