Ahmedhisham committed
Commit 2ae875f
1 Parent(s): 19ab899
Upload 6 files
Browse files
- ner.py +53 -0
- ner_camel_MSA.py +57 -0
- pdf_to_imgs.py +15 -0
- regex_extract.py +102 -0
- regex_format_after_OCR.py +25 -0
- stamp_detection.py +26 -0
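
The six scripts share cache/ paths that imply a processing order. The driver below is an inference from those paths, not something stated in the commit, and it assumes an external OCR step (not included here) that writes cache/GB/ocr_output*.json:

    # Hypothetical driver illustrating the assumed order.
    import subprocess

    for script in [
        "pdf_to_imgs.py",             # PDF pages -> cache/GB/*.png
        # <external OCR step writes cache/GB/ocr_output*.json>
        "regex_format_after_OCR.py",  # OCR JSON -> cache/output/basic_info_frame.json
        "regex_extract.py",           # regex fields -> cache/output/outputregex.csv
        "ner.py",                     # person names (MAREFA)
        "stamp_detection.py",         # stamp flag -> cache/output/appended_data.csv
    ]:
        subprocess.run(["python", script], check=True)
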
ner.py
ADDED
@@ -0,0 +1,53 @@
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification


# Extract person names from OCR output using the MAREFA NER model.
def extract_arabic_names(json_data, model, tokenizer):
    arabic_names = set()

    for entry in json_data:
        # Key must match the output of regex_format_after_OCR.py ("Arabic text").
        if "Arabic text" in entry:
            text = entry["Arabic text"]
            tokenized_text = tokenizer.tokenize(text)
            inputs = tokenizer(text, return_tensors="pt")
            outputs = model(**inputs)
            predictions = outputs.logits.argmax(dim=-1)
            # Drop the [CLS]/[SEP] predictions so labels line up with
            # tokenizer.tokenize(), which emits no special tokens; .item()
            # converts the tensor id to the int key id2label expects.
            predicted_labels = [model.config.id2label[label_id.item()]
                                for label_id in predictions[0][1:-1]]

            current_name = ""
            for token, label in zip(tokenized_text, predicted_labels):
                if label == "B-person":
                    # A new name starts; flush any name in progress first.
                    if current_name:
                        arabic_names.add(current_name)
                    current_name = token
                elif label == "I-person":
                    current_name += " " + token
                elif current_name:
                    # Any non-person label ends the current name.
                    arabic_names.add(current_name)
                    current_name = ""

            if current_name:
                arabic_names.add(current_name)

    return arabic_names


# Load the MAREFA NER model and tokenizer.
model_name = "marefa-nlp/marefa-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Load the OCR text produced by regex_format_after_OCR.py.
basic = "cache/output/basic_info_frame.json"
with open(basic, "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Extract names from the JSON data using the MAREFA model.
arabic_names = extract_arabic_names(json_data, model, tokenizer)

# Print the extracted names.
if arabic_names:
    print("Arabic names extracted:")
    for name in arabic_names:
        print("Name:", name)
else:
    print("No Arabic names found.")

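A caveat on both NER scripts (an observation, not part of the commit): they concatenate raw subword tokens, so extracted names can carry tokenizer artifacts — WordPiece models mark continuations with "##", SentencePiece models mark word starts with "▁". A minimal WordPiece merge helper, as a sketch:

    def merge_subwords(tokens):
        # Glue '##' continuation pieces onto the previous piece (WordPiece convention).
        words = []
        for tok in tokens:
            if tok.startswith("##") and words:
                words[-1] += tok[2:]
            else:
                words.append(tok)
        return " ".join(words)

    # e.g. arabic_names.add(merge_subwords(current_name.split()))
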
ner_camel_MSA.py
ADDED
@@ -0,0 +1,57 @@
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification


# Extract person names from OCR output using the CAMeL-Lab CAMeLBERT-MSA NER model.
def extract_arabic_names(json_data, model, tokenizer):
    arabic_names = set()

    for entry in json_data:
        # Key must match the output of regex_format_after_OCR.py ("Arabic text").
        if "Arabic text" in entry:
            text = entry["Arabic text"]
            tokenized_text = tokenizer.tokenize(text)
            inputs = tokenizer(text, return_tensors="pt")
            outputs = model(**inputs)
            predictions = outputs.logits.argmax(dim=-1)
            # Drop the [CLS]/[SEP] predictions so labels line up with
            # tokenizer.tokenize(), which emits no special tokens.
            predicted_labels = [model.config.id2label[label_id.item()]
                                for label_id in predictions[0][1:-1]]

            current_name = ""
            for token, label in zip(tokenized_text, predicted_labels):
                # This checkpoint does not use MAREFA's "B-person"/"I-person" tags;
                # ANERcorp-trained models typically use "B-PERS"/"I-PERS". The
                # case-insensitive prefix check covers both schemes — verify
                # against model.config.id2label.
                if label.upper().startswith("B-PER"):
                    if current_name:
                        arabic_names.add(current_name)
                    current_name = token
                elif label.upper().startswith("I-PER"):
                    current_name += " " + token
                elif current_name:
                    # Any non-person label ends the current name.
                    arabic_names.add(current_name)
                    current_name = ""

            if current_name:
                arabic_names.add(current_name)

    return arabic_names


# Load the CAMeL NER model and tokenizer.
model_name = "CAMeL-Lab/bert-base-arabic-camelbert-msa-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Load the OCR text produced by regex_format_after_OCR.py.
basic = "cache/output/basic_info_frame.json"
with open(basic, "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Extract names from the JSON data.
arabic_names = extract_arabic_names(json_data, model, tokenizer)

# Print the extracted names.
if arabic_names:
    print("Arabic names extracted:")
    for name in arabic_names:
        print("Name:", name)
else:
    print("No Arabic names found.")

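Because the CAMeL checkpoint's exact tag names are assumed above rather than confirmed, a one-line sanity check (a standard transformers config attribute) is worth running before trusting the extraction:

    print(model.config.id2label)  # exact tags depend on the checkpoint, e.g. {0: 'O', 1: 'B-PERS', ...}
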
pdf_to_imgs.py
ADDED
@@ -0,0 +1,15 @@
import fitz  # PyMuPDF
import os

pdf_file_path = "sample/GB.pdf"
pdf = fitz.open(pdf_file_path)

# Render each page to a PNG in the cache directory.
save_dir = "cache/GB"
os.makedirs(save_dir, exist_ok=True)
for page_num in range(len(pdf)):
    page = pdf[page_num]
    pix = page.get_pixmap()
    image_filename = os.path.join(
        save_dir,
        f"{os.path.splitext(os.path.basename(pdf_file_path))[0]}_page_{page_num + 1}.png",
    )
    pix.save(image_filename)

pdf.close()

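One note on this step: get_pixmap() renders at PyMuPDF's default 72 dpi, which is often too coarse for OCR. Passing a scaling matrix (a standard PyMuPDF option) renders at higher resolution:

    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x scale ≈ 144 dpi, sharper OCR input
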
regex_extract.py
ADDED
@@ -0,0 +1,102 @@
import json
import csv
import re

from translate import Translator


def load_regex_pattern(filename):
    # Load the regex pattern config; return {} if the file is missing.
    try:
        with open(filename, "r", encoding="utf-8") as config_file:
            config_data = json.load(config_file)
        return config_data
    except FileNotFoundError:
        print("regex file not found")
        return {}


def translate_date(date_text):
    # Map Arabic month names and Arabic-Indic digits to their English forms.
    # (The original dict listed most months English->Arabic, which replace()
    # would never match in Arabic text; entries are flipped to Arabic->English,
    # consistent with the December entry and the digit entries.)
    translation_dict = {
        "يناير": "January",
        "فبراير": "February",
        "مارس": "March",
        "ابريل": "April",
        "مايو": "May",
        "يونيو": "June",
        "يوليو": "July",
        "أغسطس": "August",
        "سبتمبر": "September",
        "اكتوبر": "October",
        "نوفمبر": "November",
        "ديسمبر": "December",
        "٠": "0",
        "١": "1",
        "٢": "2",
        "٣": "3",
        "٤": "4",
        "٥": "5",
        "٦": "6",
        "٧": "7",
        "٨": "8",
        "٩": "9",
    }
    # Map months and digits into English in a single pass.
    for ar_token, en_token in translation_dict.items():
        date_text = date_text.replace(ar_token, en_token)
    return date_text


def translate_text(text):
    translator = Translator(to_lang="en", from_lang="ar")
    return translator.translate(text)


def extract_and_store_info(input_file, output_csv, regex_patterns):
    extracted_data = {pattern_name: "" for pattern_name in regex_patterns}
    with open(input_file, encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    # For each pattern, keep the first match found in the OCR lines.
    for pattern_name, pattern_data in regex_patterns.items():
        if not extracted_data.get(pattern_name):
            for entry in json_data:
                if "Arabic text" in entry:
                    text = entry.get("Arabic text", "")
                    match = re.search(pattern_data["pattern"], text, re.IGNORECASE)
                    if not extracted_data.get(pattern_name) and match:
                        extracted_data[pattern_name] = match.group()

                if extracted_data.get(pattern_name):
                    break

    # Translate the company name into English.
    if extracted_data.get("company_title"):
        extracted_data["company_title"] = translate_text(extracted_data["company_title"])

    # Normalize the report period.
    if "annual_pattern" in regex_patterns and "date_pattern" in regex_patterns:
        if re.search(regex_patterns["annual_pattern"]["pattern"],
                     extracted_data["annual_pattern"], re.IGNORECASE):
            extracted_data["annual_pattern"] = "Annual"
        elif "half_annual_pattern" in regex_patterns and re.search(
                regex_patterns["half_annual_pattern"]["pattern"],
                extracted_data["half_annual_pattern"], re.IGNORECASE):
            extracted_data["half_annual_pattern"] = "Half Annual"

    if "date_pattern" in regex_patterns:
        extracted_data["date_pattern"] = translate_date(extracted_data["date_pattern"])

    with open(output_csv, mode="w", encoding="utf-8", newline="") as csv_output_file:
        fieldnames = ["pattern_name", "extracted_data"]
        writer = csv.DictWriter(csv_output_file, fieldnames=fieldnames)
        writer.writeheader()
        for pattern_name, data in extracted_data.items():
            if data:
                writer.writerow({"pattern_name": pattern_name, "extracted_data": data})
    extracted_count = sum(1 for data in extracted_data.values() if data)
    print(f"{extracted_count} pieces of data extracted and stored in:", output_csv)


if __name__ == "__main__":
    input_file = "cache/output/basic_info_frame.json"
    output_csv_file = "cache/output/outputregex.csv"
    regex_patterns = load_regex_pattern("mainpipeline/models/regex_config.json")
    if regex_patterns:
        extract_and_store_info(input_file, output_csv_file, regex_patterns)
    else:
        print("failed to load regex")

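load_regex_pattern() expects each config entry to expose a "pattern" key. The contents of mainpipeline/models/regex_config.json are not part of this commit, so the shape below is inferred from the code; the pattern names come from the script, the regexes themselves are illustrative:

    # Illustrative only — real patterns live in mainpipeline/models/regex_config.json.
    example_config = {
        "company_title": {"pattern": r"شركة\s+\S+"},
        "date_pattern": {"pattern": r"[٠-٩0-9]{1,2}\s+\S+\s+[٠-٩0-9]{4}"},
        "annual_pattern": {"pattern": r"سنوي"},
        "half_annual_pattern": {"pattern": r"نصف\s+سنوي"},
    }
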
regex_format_after_OCR.py
ADDED
@@ -0,0 +1,25 @@
import json
import glob


def extract_arabic_data(input_files, output_file):
    arabic_data = []
    for input_file in input_files:
        # Load JSON data from the OCR output file.
        with open(input_file, encoding="utf-8") as json_file:
            json_data = json.load(json_file)

        # Keep only the recognized text of each line.
        for entry in json_data:
            if "lines" in entry:
                for line in entry["lines"]:
                    text = line.get("text", "")
                    arabic_data.append({"Arabic text": text})

    with open(output_file, mode="w", encoding="utf-8") as json_output_file:
        json.dump(arabic_data, json_output_file, ensure_ascii=False, indent=4)
    print("Arabic data from", len(input_files), "JSON files has been extracted to:", output_file)


if __name__ == "__main__":
    input_files = glob.glob("cache/GB/ocr_output*.json")
    output_json_file = "cache/output/basic_info_frame.json"
    extract_arabic_data(input_files, output_json_file)

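The OCR JSON shape this script accepts is inferable from its loop: entries carrying a "lines" list of objects with a "text" field. A minimal example (field values are illustrative):

    # Minimal input accepted by extract_arabic_data().
    example_ocr_page = [
        {
            "lines": [
                {"text": "شركة المثال القابضة"},
                {"text": "التقرير السنوي ٢٠٢٣"},
            ]
        }
    ]
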
stamp_detection.py
ADDED
@@ -0,0 +1,26 @@
import os
import csv

from ultralytics import YOLO

model = YOLO("mainpipeline/models/stamp_detection_model.pt")
img_dir = "cache/GB"
output_csv = "cache/output/appended_data.csv"
revision_status = "Unrevised"

# Mark the document as revised as soon as a stamp is detected on any page.
for img_name in os.listdir(img_dir):
    img_path = os.path.join(img_dir, img_name)
    if os.path.isfile(img_path):
        results = model.predict(img_path, conf=0.25, save=False)
        # predict() always returns one Results object per image, so len(results) > 0
        # is always true; check the detected boxes instead.
        if len(results[0].boxes) > 0:
            revision_status = "Revised"
            break

with open(output_csv, mode="a", encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    # Write the header only if the file is still empty, then append the value
    # as a single-column row so it matches the header.
    file.seek(0, os.SEEK_END)
    if file.tell() == 0:
        writer.writerow(["Revision_status"])
    writer.writerow([revision_status])

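A design note on the loop above: ultralytics predict() also accepts a directory path as its source, so the per-file loop could be collapsed (a sketch, same confidence threshold):

    # Alternative: let YOLO iterate the directory itself.
    results = model.predict("cache/GB", conf=0.25, save=False)
    revision_status = "Revised" if any(len(r.boxes) > 0 for r in results) else "Unrevised"
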