pierreguillou
committed on
Commit
•
723d054
1
Parent(s):
d159597
Update files/functions.py
Browse files - files/functions.py +3 -14
files/functions.py
CHANGED
@@ -80,7 +80,7 @@ else:
|
|
80 |
doc_stride = 128 # The authorized overlap between two parts of the context when splitting it is needed.
|
81 |
|
82 |
# max PDF page images that will be displayed
|
83 |
-
max_imgboxes =
|
84 |
examples_dir = 'files/'
|
85 |
image_wo_content = examples_dir + "wo_content.png" # image without content
|
86 |
pdf_blank = examples_dir + "blank.pdf" # blank PDF
|
@@ -366,8 +366,7 @@ def extraction_data_from_image(images):
|
|
366 |
if num_imgs > 0:
|
367 |
|
368 |
# https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
|
369 |
-
custom_config = r'--oem 3 --psm 3 -l eng
|
370 |
-
# custom_config = r'--oem 3 --psm 3 -l eng' # default config PyTesseract: --oem 3 --psm 3
|
371 |
results, lines, row_indexes, par_boxes, line_boxes = dict(), dict(), dict(), dict(), dict()
|
372 |
images_ids_list, lines_list, par_boxes_list, line_boxes_list, images_list, page_no_list, num_pages_list = list(), list(), list(), list(), list(), list(), list()
|
373 |
|
@@ -381,15 +380,11 @@ def extraction_data_from_image(images):
|
|
381 |
img = np.array(img, dtype='uint8') # convert PIL to cv2
|
382 |
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # gray scale image
|
383 |
ret,img = cv2.threshold(img,127,255,cv2.THRESH_BINARY)
|
384 |
-
# img_filepath = f"img{i}.png"
|
385 |
-
# img.save(img_filepath)
|
386 |
-
# cv2.imwrite(img_filepath, img)
|
387 |
|
388 |
# OCR PyTesseract | get langs of page
|
389 |
txt = pytesseract.image_to_string(img, config=custom_config)
|
390 |
-
|
391 |
# txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
|
392 |
-
|
393 |
try:
|
394 |
langs = detect_langs(txt)
|
395 |
langs = [langdetect2Tesseract[langs[i].lang] for i in range(len(langs))]
|
@@ -398,15 +393,11 @@ def extraction_data_from_image(images):
|
|
398 |
langs_string = "eng"
|
399 |
langs_string += '+osd'
|
400 |
custom_config = f'--oem 3 --psm 3 -l {langs_string} tsv' # default config PyTesseract: --oem 3 --psm 3
|
401 |
-
# print("langs", i, "-", langs_string)
|
402 |
|
403 |
# OCR PyTesseract | get data
|
404 |
results[i] = pytesseract.image_to_data(img, config=custom_config, output_type=pytesseract.Output.DICT)
|
405 |
-
|
406 |
# results[i] = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
|
407 |
-
# print("results[i].keys()", i, "-",results[i].keys())
|
408 |
|
409 |
-
# print("factor", factor)
|
410 |
lines[i], row_indexes[i], par_boxes[i], line_boxes[i] = get_data(results[i], factor, conf_min=0)
|
411 |
lines_list.append(lines[i])
|
412 |
par_boxes_list.append(par_boxes[i])
|
@@ -415,8 +406,6 @@ def extraction_data_from_image(images):
|
|
415 |
images_list.append(images[i])
|
416 |
page_no_list.append(i)
|
417 |
num_pages_list.append(num_imgs)
|
418 |
-
# print("i - lines[i], row_indexes[i], par_boxes[i], line_boxes[i]",i,"-",lines[i], row_indexes[i], par_boxes[i], line_boxes[i])
|
419 |
-
# print("***************************************************************")
|
420 |
|
421 |
except:
|
422 |
print(f"There was an error within the extraction of PDF text by the OCR!")
|
|
|
80 |
doc_stride = 128 # The authorized overlap between two parts of the context when splitting it is needed.
|
81 |
|
82 |
# max PDF page images that will be displayed
|
83 |
+
max_imgboxes = 2
|
84 |
examples_dir = 'files/'
|
85 |
image_wo_content = examples_dir + "wo_content.png" # image without content
|
86 |
pdf_blank = examples_dir + "blank.pdf" # blank PDF
|
|
|
366 |
if num_imgs > 0:
|
367 |
|
368 |
# https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
|
369 |
+
custom_config = r'--oem 3 --psm 3 -l eng' # default config PyTesseract: --oem 3 --psm 3 -l eng+deu+fra+jpn+por+spa+rus+hin+chi_sim
|
|
|
370 |
results, lines, row_indexes, par_boxes, line_boxes = dict(), dict(), dict(), dict(), dict()
|
371 |
images_ids_list, lines_list, par_boxes_list, line_boxes_list, images_list, page_no_list, num_pages_list = list(), list(), list(), list(), list(), list(), list()
|
372 |
|
|
|
380 |
img = np.array(img, dtype='uint8') # convert PIL to cv2
|
381 |
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # gray scale image
|
382 |
ret,img = cv2.threshold(img,127,255,cv2.THRESH_BINARY)
|
|
|
|
|
|
|
383 |
|
384 |
# OCR PyTesseract | get langs of page
|
385 |
txt = pytesseract.image_to_string(img, config=custom_config)
|
|
|
386 |
# txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
|
387 |
+
|
388 |
try:
|
389 |
langs = detect_langs(txt)
|
390 |
langs = [langdetect2Tesseract[langs[i].lang] for i in range(len(langs))]
|
|
|
393 |
langs_string = "eng"
|
394 |
langs_string += '+osd'
|
395 |
custom_config = f'--oem 3 --psm 3 -l {langs_string} tsv' # default config PyTesseract: --oem 3 --psm 3
|
|
|
396 |
|
397 |
# OCR PyTesseract | get data
|
398 |
results[i] = pytesseract.image_to_data(img, config=custom_config, output_type=pytesseract.Output.DICT)
|
|
|
399 |
# results[i] = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
|
|
|
400 |
|
|
|
401 |
lines[i], row_indexes[i], par_boxes[i], line_boxes[i] = get_data(results[i], factor, conf_min=0)
|
402 |
lines_list.append(lines[i])
|
403 |
par_boxes_list.append(par_boxes[i])
|
|
|
406 |
images_list.append(images[i])
|
407 |
page_no_list.append(i)
|
408 |
num_pages_list.append(num_imgs)
|
|
|
|
|
409 |
|
410 |
except:
|
411 |
print(f"There was an error within the extraction of PDF text by the OCR!")
|