Spaces:
Sleeping
Sleeping
################# cnocr ################## | |
from cnocr import CnOcr | |
def validate(text):
    """Return ``text`` with every space and comma removed."""
    # A single translate pass deletes both unwanted characters.
    unwanted = str.maketrans('', '', ' ,')
    return text.translate(unwanted)
def check_bank(text):
    """Return the first known bank keyword contained in ``text``.

    Spaces are removed from ``text`` before matching, so "hang seng"
    matches the keyword "hangseng". Matching is case-sensitive and the
    keywords are lowercase, so callers should pass lower-cased text.

    Args:
        text: The string to search.

    Returns:
        The matching keyword (one of 'bankofchina', 'hangseng', 'hsbc',
        'sc'), or ``False`` when none of them occurs in ``text``.
    """
    compact = text.replace(' ', '')
    # NOTE(review): 'sc' is a very short keyword and matches any text that
    # merely contains the letters "sc" -- confirm this is acceptable.
    for bank in ('bankofchina', 'hangseng', 'hsbc', 'sc'):
        if bank in compact:
            return bank
    # Report "no match" only after every keyword has been tried; returning
    # inside the loop would stop after testing the first keyword.
    return False
def check_bank_name(img_path):
    """Guess the bank code from a statement's file name or path.

    ``img_path`` is converted with ``str()`` and searched (case-sensitively)
    for either the bank code itself or the statement title that bank uses.
    Banks are tried in the order boch, hangseng, hsbc, sc and the first hit
    wins. Returns ``None`` when nothing matches.

    Statement file names look like:
        BOCH               - "Consolidated Statement <date>"
        HangSeng           - "Statement of Prestige Banking <date>" or
                             "Statement of Preferred Banking <date>"
        HSBC               - "Statement - HSBC One Account <date>"
        Standard Chartered - "statementOfAccount <date>"
    """
    path_text = str(img_path)
    standard_names = {
        'boch': "Consolidated Statement",
        'hangseng': "Statement of",
        'hsbc': "Statement - HSBC One Account",
        'sc': "statementOfAccount",
    }
    matches = (
        code
        for code, title in standard_names.items()
        if code in path_text or title in path_text
    )
    return next(matches, None)
def check_mr(text):
    """Drop a leading courtesy title (mr, ms, miss, mrs) from ``text``.

    When the first whitespace-separated word, compared case-insensitively,
    is one of the titles, the remaining words are returned lower-cased and
    concatenated without separators. Otherwise ``text`` is returned
    unchanged.
    """
    tokens = text.lower().split()
    starts_with_title = bool(tokens) and tokens[0] in ('mr', 'ms', 'miss', 'mrs')
    if not starts_with_title:
        return text
    # Remaining words are glued together with no spaces between them.
    return ''.join(tokens[1:])
def _in_box(positions, left, top, right, bottom):
    """Return True when a detection lies inside the given pixel rectangle."""
    # Points 0 and 2 of the detection are used as opposite corners
    # (presumably top-left and bottom-right -- confirm against CnOcr output).
    return bool(
        positions[0][0] >= left
        and positions[0][1] >= top
        and positions[2][0] <= right
        and positions[2][1] <= bottom
    )


def get_info_from_bank(img_path, file_name):
    """Run OCR on a bank-statement image and collect its key fields.

    Each OCR detection is lower-cased and assigned to a field according to
    the fixed pixel region its bounding box falls in (name, address,
    statement date, asset amounts, liabilities). Detections outside every
    region are checked for a bank keyword instead.

    Args:
        img_path: Image handed to ``CnOcr.ocr``.
        file_name: File name handed to ``check_bank_name`` to pre-fill the
            "bank" field before any OCR text is examined.

    Returns:
        dict with the keys "nameStatement", "address", "bank", "date",
        "asset" (float, sum of the accepted asset amounts) and
        "liabilities" (string; empty when no debit amount was found).
    """
    # NOTE(review): a new CnOcr instance is built on every call; consider
    # reusing one if this function is called repeatedly.
    ocr = CnOcr(rec_model_name='densenet_lite_136-gru')
    out = ocr.ocr(img_path)

    bank_data = {
        "nameStatement": "",
        "address": "",
        "bank": check_bank_name(file_name),
        "date": "",
        "asset": 0.0,
        "liabilities": ""
    }

    # Vertical windows (pixels) of the amount column. They slide down each
    # time an asset line is accepted.
    asset_y = [722, 747]
    asset_iterations = 2
    liabilities_y = [747, 800]
    count = 0
    invalid_list = ['', ' ', ',']

    for item in out:
        # Guard against a missing text *before* lower-casing it; the text is
        # lower-case from here on, so every marker below is lower-case too.
        raw_detected_text = (item['text'] or '').lower()
        if raw_detected_text in invalid_list:
            continue
        positions = item['position']

        if _in_box(positions, 147, 265, 400, 295):  # name
            bank_data["nameStatement"] += raw_detected_text
            bank_data["nameStatement"] = check_mr(bank_data["nameStatement"])
        elif _in_box(positions, 113, 291, 500, 381):  # address
            bank_data["address"] += raw_detected_text
            bank_data["address"] += ' '
        elif _in_box(positions, 996, 289, 1083, 314):  # statement date
            bank_data["date"] += raw_detected_text
        elif _in_box(positions, 900, asset_y[0], 1120, asset_y[1]):
            if 'dr' in raw_detected_text:
                # A debit amount inside the asset window is a liability.
                # The marker must be tested in lower case because the text
                # was lower-cased above; it is stripped the same way as in
                # the liabilities window below.
                bank_data["liabilities"] = validate(
                    raw_detected_text.replace('dr', ''))
            elif count <= asset_iterations:  # asset
                try:
                    asset_float = float(raw_detected_text.replace(',', ''))
                except ValueError:
                    # OCR noise that is not a number: skip it without
                    # moving the windows or using up an iteration.
                    continue
                bank_data["asset"] += asset_float
                # Slide the windows down to the next expected amount line.
                asset_y[0] += 21
                asset_y[1] += 27
                liabilities_y[1] += 27
                count += 1
        elif _in_box(positions, 900, liabilities_y[0], 1130, liabilities_y[1]):
            if 'dr' in raw_detected_text:  # liabilities
                bank_data["liabilities"] = validate(
                    raw_detected_text.replace('dr', ''))
        else:
            # Text outside every known region may still name the bank.
            bank = check_bank(raw_detected_text)
            if bank:
                bank_data["bank"] = bank

    # NOTE: sending the result with post_data(...) is currently disabled;
    # the collected data is only returned to the caller.
    return bank_data
########## Posting data through API ############ | |
import requests | |
import data_encryption | |
# POST /api/v1/users HTTP/1.1 | |
def post_data(bank, name, address, asset, liabilities, date):
    """Build and encrypt the statement-verification payload.

    The extracted statement fields are combined with a fixed request
    envelope and passed to ``data_encryption.encrypt``. The HTTP POST
    itself is currently disabled, so nothing is sent and the function
    returns ``None``.

    Args:
        bank: Bank identifier.
        name: Account holder name, sent as "nameStatement".
        address: Address text.
        asset: Asset total; converted with ``str()`` for the payload.
        liabilities: Liabilities value, sent as "liability".
        date: Statement date, sent as "statementDate".
    """
    # Fixed request envelope.
    data = {
        "endpoint": "/SBT",
        "apiType": "store_statement_verif",
        "requestId": 'request_1234',
        "userId": 'user1',
    }
    # Fields extracted from the statement.
    data.update({
        "bank": bank,
        "nameStatement": name,
        "address": address,
        "asset": str(asset),
        "liability": liabilities,
        "statementDate": date,
    })
    encrypted_data = data_encryption.encrypt(data)
    # Disabled: the encrypted payload used to be posted to
    # 'http://ipygg-api-test-env.ap-east-1.elasticbeanstalk.com/SBT/api/v1/users'
    # request = requests.post(url=endpoint, data=encrypted_data)
# def extract_pdf_data(img_path='hangseng_page-0001.jpg'): | |
# page_number = 1 | |
# images = f'hangseng_page-000{page_number}.jpg' | |
# get_info_from_bank(img_path) | |