Spaces:
Sleeping
Sleeping
File size: 5,773 Bytes
1f72938 9312707 1f72938 9312707 1f72938 9312707 1f72938 9312707 1f72938 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
################# cnocr ##################
from cnocr import CnOcr
def validate(text):
    """Return ``text`` with every space and comma removed.

    Used to normalise OCR'd amounts such as ``"1, 234.50"`` before they are
    stored or parsed.
    """
    # One pass over the string that deletes both unwanted characters.
    strip_table = str.maketrans('', '', ' ,')
    return text.translate(strip_table)
def check_bank(text):
    """Return the first known bank keyword contained in ``text``.

    Spaces are removed from ``text`` before matching, so ``"bank of china"``
    matches the keyword ``"bankofchina"``. Matching is case-sensitive and the
    keywords are lowercase, so callers should pass lowercased text.

    Args:
        text: A fragment of (lowercased) text to inspect.

    Returns:
        The matching keyword (``'bankofchina'``, ``'hangseng'``, ``'hsbc'`` or
        ``'sc'``), or ``False`` when none of them occurs in ``text``.
    """
    text = text.replace(' ', '')
    bank_list = ['bankofchina', 'hangseng', 'hsbc', 'sc']
    for bank in bank_list:
        if bank in text:
            return bank
    # Only give up after *every* candidate has been tried; returning False
    # from inside the loop would stop after the first keyword.
    return False
def check_bank_name(img_path):
    """Guess the issuing bank from a statement's file name or path.

    Each bank is recognised either by its short code or by the title its
    statements are exported with, for example:

    * BOCH               - "Consolidated Statement 2023-01-01"
    * HangSeng           - "Statement of Prestige Banking 2023-03-07" or
                           "Statement of Preferred Banking 2023-03-07"
    * HSBC               - "Statement - HSBC One Account 2023-02-10"
    * Standard Chartered - "statementOfAccount 2023-02-01"

    Returns the short code of the first bank that matches (checked in the
    order listed above), or ``None`` when nothing matches. Matching is
    case-sensitive.
    """
    path_text = str(img_path)
    # (short code, statement title) pairs, in priority order.
    markers = (
        ('boch', "Consolidated Statement"),
        ('hangseng', "Statement of"),
        ('hsbc', "Statement - HSBC One Account"),
        ('sc', "statementOfAccount"),
    )
    return next(
        (code for code, title in markers
         if code in path_text or title in path_text),
        None,
    )
def check_mr(text):
    """Strip a leading courtesy title (mr / ms / miss / mrs) from a name.

    When the first whitespace-separated word of ``text`` is one of the
    titles (compared case-insensitively), the remaining words are returned
    lowercased and concatenated with no separator. Otherwise ``text`` is
    returned exactly as it was passed in.
    """
    tokens = text.lower().split()
    if not tokens:
        return text
    title, *remainder = tokens
    if title not in ('mr', 'ms', 'miss', 'mrs'):
        return text
    return ''.join(remainder)
def _within_box(positions, left, top, right, bottom):
    """Return True when a detected text box lies inside a pixel rectangle.

    ``positions[0]`` is treated as the box's top-left ``(x, y)`` corner and
    ``positions[2]`` as its bottom-right corner.
    """
    return (
        positions[0][0] >= left
        and positions[0][1] >= top
        and positions[2][0] <= right
        and positions[2][1] <= bottom
    )


def get_info_from_bank(img_path, file_name):
    """Extract holder details and balances from a bank-statement image.

    Runs CnOcr on ``img_path`` and assigns every detected text fragment to a
    field according to the fixed pixel rectangle its bounding box falls in.
    NOTE(review): the rectangles look tuned to one specific statement layout
    and resolution - confirm before using on other banks' statements.

    Args:
        img_path: Image handed straight to ``CnOcr.ocr``.
        file_name: Name/path used by ``check_bank_name`` for the initial bank
            guess; text found on the page may override it.

    Returns:
        dict with the keys ``nameStatement``, ``address``, ``bank``, ``date``
        (strings, lowercased OCR text), ``asset`` (float, sum of the detected
        asset rows) and ``liabilities`` (string).

    Raises:
        ValueError: if a fragment inside the asset window is not a number
            once its commas are removed.
    """
    # Running the model (loaded anew on every call).
    ocr = CnOcr(rec_model_name='densenet_lite_136-gru')
    detections = ocr.ocr(img_path)

    bank_data = {
        "nameStatement": "",
        "address": "",
        "bank": check_bank_name(file_name),
        "date": "",
        "asset": 0.0,
        "liabilities": ""
    }

    # Vertical windows (y range) for the asset and liabilities rows. They
    # slide/grow downwards each time an asset row is consumed.
    asset_y = [722, 747]
    liabilities_y = [747, 800]
    # Rows are accepted while count <= asset_iterations, i.e. at most
    # asset_iterations + 1 asset rows are summed.
    asset_iterations = 2
    count = 0
    ignored_fragments = ('', ' ', ',')

    for item in detections:
        detected_text = item['text']
        # Guard *before* lowercasing: calling .lower() on None would raise.
        if detected_text is None:
            continue
        text = detected_text.lower()
        if text in ignored_fragments:
            continue
        positions = item['position']

        if _within_box(positions, 147, 265, 400, 295):
            # Account-holder name; the courtesy title is stripped as the
            # fragments accumulate.
            bank_data["nameStatement"] = check_mr(bank_data["nameStatement"] + text)
        elif _within_box(positions, 113, 291, 500, 381):
            # Postal address, fragments separated by a space.
            bank_data["address"] += text + ' '
        elif _within_box(positions, 996, 289, 1083, 314):
            # Statement date.
            bank_data["date"] += text
        elif _within_box(positions, 900, asset_y[0], 1120, asset_y[1]):
            # The text was lowercased above, so the debit marker must be
            # matched as 'dr' (an uppercase 'DR' test can never succeed and
            # would let "…dr" values crash float() below).
            if 'dr' in text:
                # NOTE(review): unlike the liabilities window below, the
                # 'dr' marker is kept in the stored value here - confirm
                # which form downstream consumers expect.
                bank_data["liabilities"] = validate(text)
            elif count <= asset_iterations:
                bank_data["asset"] += float(text.replace(',', ''))
                # Move on to the next row of the balance table.
                asset_y[0] += 21
                asset_y[1] += 27
                liabilities_y[1] += 27
                count += 1
        elif _within_box(positions, 900, liabilities_y[0], 1130, liabilities_y[1]):
            if 'dr' in text:
                bank_data["liabilities"] = validate(text.replace('dr', ''))
        else:
            # Anywhere else on the page: look for a bank name.
            detected_bank = check_bank(text)
            if detected_bank:
                bank_data["bank"] = detected_bank

    return bank_data
########## Posting data through API ############
import requests
import data_encryption
# POST /api/v1/users HTTP/1.1
def post_data(bank, name, address, asset, liabilities, date):
    """Build and encrypt the statement-verification payload.

    The extracted statement fields are wrapped in the request envelope
    expected by the ``store_statement_verif`` API and passed through
    ``data_encryption.encrypt``.

    NOTE: the HTTP POST itself is currently disabled (see the commented-out
    lines), so nothing is sent and the function returns ``None``.
    """
    # endpoint = 'http://ipygg-api-test-env.ap-east-1.elasticbeanstalk.com/SBT/api/v1/users'
    # Fixed request envelope.
    payload = {
        "endpoint": "/SBT",
        "apiType": "store_statement_verif",
        "requestId": 'request_1234',
        "userId": 'user1',
    }
    # Fields extracted from the statement; the asset total is sent as text.
    payload.update({
        "bank": bank,
        "nameStatement": name,
        "address": address,
        "asset": str(asset),
        "liability": liabilities,
        "statementDate": date,
    })
    encrypted_payload = data_encryption.encrypt(payload)
    # request = requests.post(url=endpoint, data=encrypted_payload)
# def extract_pdf_data(img_path='hangseng_page-0001.jpg'):
# page_number = 1
# images = f'hangseng_page-000{page_number}.jpg'
# get_info_from_bank(img_path)
|