pierreguillou committed on
Commit
fd53ed7
1 Parent(s): 723d054

Update files/functions.py

Browse files
Files changed (1) hide show
  1. files/functions.py +4 -0
files/functions.py CHANGED
@@ -383,6 +383,9 @@ def extraction_data_from_image(images):
383
 
384
  # OCR PyTesseract | get langs of page
385
  txt = pytesseract.image_to_string(img, config=custom_config)
 
 
 
386
  # txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
387
 
388
  try:
@@ -392,6 +395,7 @@ def extraction_data_from_image(images):
392
  except:
393
  langs_string = "eng"
394
  langs_string += '+osd'
 
395
  custom_config = f'--oem 3 --psm 3 -l {langs_string} tsv' # default config PyTesseract: --oem 3 --psm 3
396
 
397
  # OCR PyTesseract | get data
 
383
 
384
  # OCR PyTesseract | get langs of page
385
  txt = pytesseract.image_to_string(img, config=custom_config)
386
+ txt = txt.strip().lower()
387
+ txt = re.sub(r" +", " ", txt) # multiple space
388
+ txt = re.sub(r"(\n\s*)+\n+", "\n", txt) # multiple line
389
  # txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
390
 
391
  try:
 
395
  except:
396
  langs_string = "eng"
397
  langs_string += '+osd'
398
+ print("langs_string", langs_string)
399
  custom_config = f'--oem 3 --psm 3 -l {langs_string} tsv' # default config PyTesseract: --oem 3 --psm 3
400
 
401
  # OCR PyTesseract | get data