pierreguillou committed on
Commit
723d054
1 Parent(s): d159597

Update files/functions.py

Browse files
Files changed (1) hide show
  1. files/functions.py +3 -14
files/functions.py CHANGED
@@ -80,7 +80,7 @@ else:
80
  doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.
81
 
82
  # max PDF page images that will be displayed
83
- max_imgboxes = 3
84
  examples_dir = 'files/'
85
  image_wo_content = examples_dir + "wo_content.png" # image without content
86
  pdf_blank = examples_dir + "blank.pdf" # blank PDF
@@ -366,8 +366,7 @@ def extraction_data_from_image(images):
366
  if num_imgs > 0:
367
 
368
  # https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
369
- custom_config = r'--oem 3 --psm 3 -l eng+por+spa' # default config PyTesseract: --oem 3 --psm 3 -l eng+deu+fra+jpn+por+spa+rus+hin+chi_sim
370
- # custom_config = r'--oem 3 --psm 3 -l eng' # default config PyTesseract: --oem 3 --psm 3
371
  results, lines, row_indexes, par_boxes, line_boxes = dict(), dict(), dict(), dict(), dict()
372
  images_ids_list, lines_list, par_boxes_list, line_boxes_list, images_list, page_no_list, num_pages_list = list(), list(), list(), list(), list(), list(), list()
373
 
@@ -381,15 +380,11 @@ def extraction_data_from_image(images):
381
  img = np.array(img, dtype='uint8') # convert PIL to cv2
382
  img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # gray scale image
383
  ret,img = cv2.threshold(img,127,255,cv2.THRESH_BINARY)
384
- # img_filepath = f"img{i}.png"
385
- # img.save(img_filepath)
386
- # cv2.imwrite(img_filepath, img)
387
 
388
  # OCR PyTesseract | get langs of page
389
  txt = pytesseract.image_to_string(img, config=custom_config)
390
-
391
  # txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
392
-
393
  try:
394
  langs = detect_langs(txt)
395
  langs = [langdetect2Tesseract[langs[i].lang] for i in range(len(langs))]
@@ -398,15 +393,11 @@ def extraction_data_from_image(images):
398
  langs_string = "eng"
399
  langs_string += '+osd'
400
  custom_config = f'--oem 3 --psm 3 -l {langs_string} tsv' # default config PyTesseract: --oem 3 --psm 3
401
- # print("langs", i, "-", langs_string)
402
 
403
  # OCR PyTesseract | get data
404
  results[i] = pytesseract.image_to_data(img, config=custom_config, output_type=pytesseract.Output.DICT)
405
-
406
  # results[i] = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
407
- # print("results[i].keys()", i, "-",results[i].keys())
408
 
409
- # print("factor", factor)
410
  lines[i], row_indexes[i], par_boxes[i], line_boxes[i] = get_data(results[i], factor, conf_min=0)
411
  lines_list.append(lines[i])
412
  par_boxes_list.append(par_boxes[i])
@@ -415,8 +406,6 @@ def extraction_data_from_image(images):
415
  images_list.append(images[i])
416
  page_no_list.append(i)
417
  num_pages_list.append(num_imgs)
418
- # print("i - lines[i], row_indexes[i], par_boxes[i], line_boxes[i]",i,"-",lines[i], row_indexes[i], par_boxes[i], line_boxes[i])
419
- # print("***************************************************************")
420
 
421
  except:
422
  print(f"There was an error within the extraction of PDF text by the OCR!")
 
80
  doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.
81
 
82
  # max PDF page images that will be displayed
83
+ max_imgboxes = 2
84
  examples_dir = 'files/'
85
  image_wo_content = examples_dir + "wo_content.png" # image without content
86
  pdf_blank = examples_dir + "blank.pdf" # blank PDF
 
366
  if num_imgs > 0:
367
 
368
  # https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
369
+ custom_config = r'--oem 3 --psm 3 -l eng' # default config PyTesseract: --oem 3 --psm 3 -l eng+deu+fra+jpn+por+spa+rus+hin+chi_sim
 
370
  results, lines, row_indexes, par_boxes, line_boxes = dict(), dict(), dict(), dict(), dict()
371
  images_ids_list, lines_list, par_boxes_list, line_boxes_list, images_list, page_no_list, num_pages_list = list(), list(), list(), list(), list(), list(), list()
372
 
 
380
  img = np.array(img, dtype='uint8') # convert PIL to cv2
381
  img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # gray scale image
382
  ret,img = cv2.threshold(img,127,255,cv2.THRESH_BINARY)
 
 
 
383
 
384
  # OCR PyTesseract | get langs of page
385
  txt = pytesseract.image_to_string(img, config=custom_config)
 
386
  # txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
387
+
388
  try:
389
  langs = detect_langs(txt)
390
  langs = [langdetect2Tesseract[langs[i].lang] for i in range(len(langs))]
 
393
  langs_string = "eng"
394
  langs_string += '+osd'
395
  custom_config = f'--oem 3 --psm 3 -l {langs_string} tsv' # default config PyTesseract: --oem 3 --psm 3
 
396
 
397
  # OCR PyTesseract | get data
398
  results[i] = pytesseract.image_to_data(img, config=custom_config, output_type=pytesseract.Output.DICT)
 
399
  # results[i] = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
 
400
 
 
401
  lines[i], row_indexes[i], par_boxes[i], line_boxes[i] = get_data(results[i], factor, conf_min=0)
402
  lines_list.append(lines[i])
403
  par_boxes_list.append(par_boxes[i])
 
406
  images_list.append(images[i])
407
  page_no_list.append(i)
408
  num_pages_list.append(num_imgs)
 
 
409
 
410
  except:
411
  print(f"There was an error within the extraction of PDF text by the OCR!")