# Hugging Face Space page residue (author avatar text, commit message, commit
# hash) — kept as a comment so the file remains valid Python:
#   MrFeelgoood's picture — Update app.py — 846b634
import spacy
from spacy.language import Language
from spacy.lang.it import Italian
import re
from transformers import pipeline
from gradio.inputs import File
import gradio as gr
from pdf2image import convert_from_path
import pytesseract
import tempfile
import os
from gradio.inputs import Dropdown
import gradio as gr
import tempfile
import os
from pdf2image import convert_from_path
import pytesseract
import fitz
from pdf2image import convert_from_bytes
def preprocess_punctuation(text):
    """Collect the unique abbreviation-like dotted tokens found in *text*.

    Matches short tokens of letters/dots ending in a period that is NOT
    followed by an uppercase letter (so genuine sentence-final periods
    before a capitalised word are left alone). The result is later fed to
    the spaCy tokenizer as special cases so abbreviations do not split
    sentences.
    """
    pattern = r'(?<![a-z])[a-zA-Z\.]{1,4}(?:\.[a-zA-Z\.]{1,4})*\.(?!\s*[A-Z])'
    return list(set(re.findall(pattern, text)))
def preprocess_text(text):
    """Collapse blank lines so the text contains single newlines only."""
    # First squash newlines separated only by whitespace, then any leftover runs.
    collapsed = re.sub(r'\n\s*\n', '\n', text)
    return re.sub(r'\n{2,}', '\n', collapsed)
@Language.component('custom_tokenizer')
def custom_tokenizer(doc):
    """spaCy pipeline component: a colon never marks a sentence boundary.

    For every colon token, forcibly clear the sentence-start flag on the
    token that follows it, then hand the doc on unchanged otherwise.
    """
    for idx in range(len(doc) - 1):
        if doc[idx].text == ":":
            doc[idx + 1].is_sent_start = False
    return doc
def get_sentences(text, dictionary = None):
    """Split *text* into cleaned sentences via the Italian spaCy pipeline.

    Loads the `it_core_news_lg` model, installs the colon-aware
    `custom_tokenizer` component, registers every abbreviation-like token
    from preprocess_punctuation() as a single-token special case, then
    returns each sentence stripped of surrounding spaces/newlines with
    internal multi-space runs collapsed. Empty sentences are dropped.
    (`dictionary` is accepted for interface compatibility but unused.)
    """
    nlp = spacy.load("it_core_news_lg")  # Italian model
    nlp.add_pipe("custom_tokenizer", before="parser")
    # Keep abbreviations such as "dott." as one token so they don't end sentences.
    for abbreviation in preprocess_punctuation(text):
        nlp.tokenizer.add_special_case(
            abbreviation,
            [{spacy.symbols.ORTH: abbreviation, spacy.symbols.NORM: abbreviation}],
        )
    strip_chars = ' \n'
    cleaned = []
    for span in nlp(text).sents:
        # Trim edges, then collapse repeated spaces inside the sentence.
        collapsed = ' '.join(filter(None, span.text.strip(strip_chars).split(' ')))
        if collapsed:
            cleaned.append(collapsed)
    return cleaned
def extract_numbers(text, given_strings):
    """Return the numbers appearing within one word of any keyword in *text*.

    For every word containing one of *given_strings*, a three-word window
    (previous word, keyword word, next word) is inspected. Windows that
    contain an arithmetic operator at the start of a word are skipped.
    Each window word is mapped to a float (decimal comma normalised), an
    int (digits only), or a ``None`` placeholder, and all mapped values —
    including the placeholders — are appended in order.
    """
    words = text.split()

    def _coerce(word):
        # Keep digits/dots/commas, normalising the decimal comma to a dot.
        decimal_form = re.sub('[^0-9\.,]+', '', word).replace(',', '.')
        if decimal_form.replace('.', '', 1).isdigit():
            return float(decimal_form)
        digits = re.sub('[^0-9]+', '', word)
        if digits.isdigit():
            return int(digits)
        return None

    numbers = []
    for position, word in enumerate(words):
        if not any(key in word for key in given_strings):
            continue
        window = words[max(position - 1, 0):min(position + 2, len(words))]
        # Skip contexts that look like arithmetic expressions.
        if any(re.match(r'[+\*/]', w) for w in window):
            continue
        numbers.extend(_coerce(w) for w in window)
    return numbers
def get_text_and_values(text, key_list):
    """Map each sentence of *text* to the numbers found near *key_list* words.

    Splits the text into sentences with get_sentences(), runs
    extract_numbers() on each, and keeps only the sentences that produced
    a non-empty result (the lists may still contain ``None`` placeholders
    for non-numeric context words).

    Note: the original also accumulated every per-sentence result into a
    `total_numbers` list that was never read; that dead accumulator has
    been removed.
    """
    info = {}
    for sentence in get_sentences(text):
        numbers = extract_numbers(text=sentence, given_strings=key_list)
        if numbers:
            info[sentence] = numbers
    return info
def get_useful_text(dictionary):
    """Join the dictionary's sentences (its keys) with a separator line."""
    return '\n------------------------\n'.join(dictionary.keys())
def get_values(dictionary):
    """Return the dictionary's values as a plain list."""
    return list(dictionary.values())
def initialize_qa_transformer(model):
    """Build and return a text2text-generation pipeline for *model*."""
    return pipeline("text2text-generation", model=model)
def get_answers_unfiltered(dictionary, question, qa_pipeline):
    """Ask *question* against every sentence (dictionary key) via the pipeline.

    Each sentence is prefixed to the question in the Italian prompt format
    "<sentence> Domanda: <question>"; the raw pipeline outputs are returned
    in key order, unfiltered.
    """
    return [qa_pipeline(f'{sentence} Domanda: {question}')
            for sentence in dictionary.keys()]
def get_total(answered_values, text, keywords, raw_values, unique_values=False):
    """Validate QA answers against the raw values and compute a total area.

    Parameters
    ----------
    answered_values : list[list[dict]]
        Pipeline output; each inner dict maps a key (e.g. 'generated_text')
        to an answer string.
    text : str
        Full document text, searched for '<keyword> ... <number>' totals.
    keywords : list[str]
        Words such as 'totale' that may introduce an explicit total.
    raw_values : list[list]
        Per-sentence numbers (may contain ``None`` placeholders).
    unique_values : bool
        If True, de-duplicate the validated numbers.

    Returns
    -------
    tuple(list[float], int | float)
        The validated numbers and their grand total.
    """
    # Flatten the raw values, dropping the None placeholders.
    numeric_list = [num for sublist in raw_values
                    for num in sublist if isinstance(num, (int, float))]
    # Keep only answer numbers >= 5 that also occur in the raw data.
    answer_pattern = r'\d+(?:[.,]\d+)?'
    numbers = []
    for sub_lst in answered_values:
        for answer in sub_lst:
            for v in answer.values():
                v = v.replace(',', '.')  # normalise the decimal comma
                numbers += [float(m) for m in re.findall(answer_pattern, v)
                            if float(m) >= 5.0 and float(m) in numeric_list]
    if unique_values:
        numbers = list(set(numbers))
    total_list = []
    found = False
    # Look for an explicit total, e.g. 'superficie totale ... 120':
    # keyword followed by up to three words, then a number.
    for keyword in keywords:
        keyword_pattern = f'{keyword}(\\s+\\w+){{0,3}}\\s+(\\d+)'
        match = re.search(keyword_pattern, text, re.IGNORECASE)
        if match:
            value = float(match.group(2))
            # BUG FIX: the original compared the matched *string* against a
            # list of floats, so this branch could never append anything.
            if value in numbers and value in numeric_list:
                total_list.append(int(value))
                print(f"Found a value ({match.group(2)}) for keyword '{keyword}'.")
                found = True
    # No explicit total found: sum all validated numbers instead.
    if not found:
        total_list.append(sum(v for v in numbers if v in numeric_list))
    # Several totals mean several lots; the grand total is their sum.
    return numbers, sum(total_list)
def extractor_clean(text, k_words, transformer, question, total_kwords, return_text=False):
    """Run the full extraction pipeline over *text*.

    Builds the sentence->numbers dictionary, queries the QA transformer for
    each candidate sentence, and cross-checks the answers with get_total().

    Returns ``(values, return_text, reference_text)`` when *return_text* is
    truthy, otherwise ``(values, return_text)``.

    Fixes vs. the original: a dead local ``keywords`` list (get_total was
    already driven by *total_kwords*) and an unused ``tex`` initialiser were
    removed, and the trailing ``elif return_text == False`` — which silently
    returned ``None`` for non-bool falsy values — is now a plain fallback.
    """
    dictionary = get_text_and_values(text, k_words)
    raw = get_values(dictionary)
    qa = initialize_qa_transformer(transformer)
    val = get_answers_unfiltered(dictionary, question=question, qa_pipeline=qa)
    values = get_total(answered_values=val, raw_values=raw, text=text,
                       keywords=total_kwords, unique_values=True)
    if return_text:
        return values, return_text, get_useful_text(dictionary)
    return values, return_text
def pdf_ocr(file, model_t, question):
    """Extract square-metre values from the PDF at path *file*.

    Reads the PDF's native text layer with PyMuPDF; if it is empty (scanned
    document), falls back to Tesseract OCR (Italian) on rendered page
    images. The text is then run through extractor_clean() with the
    square-metre keyword set.

    Returns
    -------
    tuple(str, str, str)
        (numbered per-property list, total as 'N Mq', reference text).

    Fixes vs. the original: an unused ``tempfile.TemporaryDirectory``
    wrapper, an unused ``num_pages`` local, an unused ``quest`` constant and
    an unused enumerate index were removed.
    """
    with open(file, "rb") as f:
        content = f.read()
    # Extract the native text layer, page by page.
    with fitz.open(stream=content, filetype="pdf") as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    # OCR fallback for scanned documents (empty text layer).
    if not text:
        images = convert_from_bytes(content)
        for img in images:
            text += pytesseract.image_to_string(img, lang='ita')
        del images  # free the page images promptly
    ks = ('mq', 'MQ', 'Mq', 'metri quadri', 'm2')
    totalK = ['totale', 'complessivo', 'complessiva']
    extracted_values = extractor_clean(text=text, k_words=ks, transformer=model_t,
                                       question=question, total_kwords=totalK,
                                       return_text=True)
    values_output = extracted_values[0][0]
    sor_values = sorted(values_output)
    total_output = f'{extracted_values[0][1]} Mq'
    text_output = extracted_values[2]
    immobile_values = [f'{i + 1}. Immobile : {value} Mq\n' for i, value in enumerate(sor_values)]
    immobile_values = '\n'.join(immobile_values)
    return immobile_values, total_output, text_output
def ocr_interface(pdf_file, model_t='it5/it5-base-question-answering', question="Quanti metri quadri misura l'immobile?"):
    """Gradio callback: run pdf_ocr on the uploaded file's temp path.

    Returns the (values, total, reference text) triple unchanged.
    """
    return pdf_ocr(pdf_file.name, model_t, question)
# Start the UI
# Layout: a single "Extractor" tab with a PDF upload, model/question
# dropdowns, three read-only output boxes and an "Extract" button wired to
# ocr_interface(). Component creation order defines the on-screen layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
    '''
    # PDF Mq Extractor
    Demo for ITAL-IA
    ''')
    with gr.Tab("Extractor"):
        with gr.Row():
            # Input PDF to analyse.
            pdf_input = gr.components.File(label="PDF File")
        with gr.Row():
            # NOTE(review): `gr.components.*` with these kwargs targets the
            # Gradio 3.x API — confirm against the pinned gradio version.
            model_input = gr.components.Dropdown(['it5/it5-base-question-answering', 'it5/it5-small-question-answering'],
                                                 value='it5/it5-base-question-answering', label = 'Select model')
            question_input = gr.components.Dropdown(["Quanti metri quadri misura l'immobile?"],
                                                    value = "Quanti metri quadri misura l'immobile?", label = 'Question')
        with gr.Column():
            gr.Markdown(
            '''
            # Output values
            Values extracted from the pdf document
            ''')
        with gr.Row():
            # Outputs: reference sentences, sorted area values, grand total.
            text_output = gr.components.Textbox(label="Ref. Text")
            values_output = gr.components.Textbox(label="Area Values - sorted by value")
            total_output = gr.components.Textbox(label="Total")
        with gr.Row():
            extract_button = gr.Button("Extract")
        # Run the OCR/QA pipeline when the button is pressed.
        extract_button.click(fn = ocr_interface,
                             inputs=[pdf_input, model_input, question_input], outputs=[values_output, total_output, text_output])
        # Example PDFs shipped with the Space; outputs are pre-cached.
        gr.Examples(['Example1(scannedDoc).pdf', 'Example2.pdf', 'Example3Large.pdf'], inputs = pdf_input,
                    cache_examples = True, fn = ocr_interface, outputs = [values_output, total_output, text_output])
demo.launch()