pierreguillou
committed on
Commit
•
fd53ed7
1
Parent(s):
723d054
Update files/functions.py
Browse files
- files/functions.py +4 -0
files/functions.py
CHANGED
@@ -383,6 +383,9 @@ def extraction_data_from_image(images):
|
|
383 |
|
384 |
# OCR PyTesseract | get langs of page
|
385 |
txt = pytesseract.image_to_string(img, config=custom_config)
|
|
|
|
|
|
|
386 |
# txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
|
387 |
|
388 |
try:
|
@@ -392,6 +395,7 @@ def extraction_data_from_image(images):
|
|
392 |
except:
|
393 |
langs_string = "eng"
|
394 |
langs_string += '+osd'
|
|
|
395 |
custom_config = f'--oem 3 --psm 3 -l {langs_string} tsv' # default config PyTesseract: --oem 3 --psm 3
|
396 |
|
397 |
# OCR PyTesseract | get data
|
|
|
383 |
|
384 |
# OCR PyTesseract | get langs of page
|
385 |
txt = pytesseract.image_to_string(img, config=custom_config)
|
386 |
+
txt = txt.strip().lower()
|
387 |
+
txt = re.sub(r" +", " ", txt) # multiple space
|
388 |
+
txt = re.sub(r"(\n\s*)+\n+", "\n", txt) # multiple line
|
389 |
# txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
|
390 |
|
391 |
try:
|
|
|
395 |
except:
|
396 |
langs_string = "eng"
|
397 |
langs_string += '+osd'
|
398 |
+
print("langs_string", langs_string)
|
399 |
custom_config = f'--oem 3 --psm 3 -l {langs_string} tsv' # default config PyTesseract: --oem 3 --psm 3
|
400 |
|
401 |
# OCR PyTesseract | get data
|