"""Gradio app: OCR a PDF, extract structured data with Gemini, build a DOCX report."""

import hmac
import json
import logging
import os
import shutil
import zipfile
from typing import Dict, Any, List, Union  # typing imports required by the app

import cv2
import fitz
import google.generativeai as genai
import gradio as gr
import numpy as np
import pytesseract
from PIL import Image
from docx import Document
from docx.enum.section import WD_SECTION
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Pt, RGBColor, Inches
from pdf2image import convert_from_path
from pytesseract import Output

# helpers functions
# NOTE(review): the wildcard imports are kept because process_image,
# save_extracted_text and extract_data_with_gemini are resolved through them;
# prefer explicit imports once the helpers' public names are confirmed.
from helpers.rapport_generator import RapportGenerator
from helpers.text_extraction import *
from helpers.gemini_functions import *

logger = logging.getLogger(__name__)


def _constant_time_equals(supplied, expected):
    """Return True only when both values are strings with identical content.

    Uses ``hmac.compare_digest`` so the comparison time does not depend on
    how many leading characters match. Non-string values (including an unset
    environment variable, i.e. ``None``) never match.
    """
    if not isinstance(supplied, str) or not isinstance(expected, str):
        return False
    return hmac.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8"))


def authenticate(username, password):
    """Check login credentials against HF_USERNAME / HF_PASSWORD.

    Args:
        username: User name submitted through the Gradio login form.
        password: Password submitted through the Gradio login form.

    Returns:
        True when both values match the corresponding environment variables,
        False otherwise (including when either variable is not set).
    """
    # Evaluate both comparisons before combining them so that a wrong user
    # name does not short-circuit the password check.
    user_ok = _constant_time_equals(username, os.getenv("HF_USERNAME"))
    password_ok = _constant_time_equals(password, os.getenv("HF_PASSWORD"))
    return user_ok and password_ok


def _resolve_upload_path(pdf_file):
    """Return the filesystem path of the uploaded PDF.

    The File input is declared with ``type="filepath"``, which hands the
    callback a plain string path; objects exposing a ``.name`` attribute
    (tempfile-style uploads) are still accepted for backward compatibility.

    Raises:
        gr.Error: If no file was provided.
    """
    if pdf_file is None:
        raise gr.Error("Error processing PDF: no file provided")
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


# Main Processing Function
def process_pdf(pdf_file):
    """Run the full extraction pipeline on an uploaded PDF.

    Steps: rasterize each page, run block detection/OCR on every page image,
    zip the annotated images, send the extracted text to Gemini, dump the
    result to JSON and render a DOCX report from that JSON.

    Args:
        pdf_file: Path of the uploaded PDF (str), or an object with a
            ``.name`` attribute holding that path.

    Returns:
        A tuple ``(text_file_path, zip_path, json_path, docx_path)`` matching
        the four File outputs of the interface.

    Raises:
        gr.Error: If no file was provided or any pipeline step fails.
    """
    pdf_path = _resolve_upload_path(pdf_file)

    template_dir = os.path.join(os.getcwd(), "templates")
    temp_dir = os.path.join(os.getcwd(), "temp_processing")
    output_dir = os.path.join(temp_dir, 'output_images')

    # Start every run from an empty working directory.
    # NOTE(review): this directory is shared by all requests, so two
    # concurrent runs would clobber each other's files — confirm whether the
    # app is expected to serve more than one user at a time.
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(output_dir, exist_ok=True)

    path_to_data_to_extract = os.path.join(template_dir, "data_to_extract.json")
    # Presumably written by save_extracted_text() below — verify in helpers.
    text_file_path = os.path.join(output_dir, 'extracted_text.txt')

    try:
        # Convert PDF to images and process
        images = convert_from_path(pdf_path)
        annotated_images = []

        # Process each page
        for i, img in enumerate(images):
            temp_img_path = os.path.join(temp_dir, f'temp_page_{i}.png')
            img.save(temp_img_path)
            blocks, annotated_image_path = process_image(temp_img_path, output_dir, i)
            annotated_images.append(annotated_image_path)
            # Page numbers passed to the helper are 1-based.
            save_extracted_text(blocks, i + 1, output_dir)

        # Create ZIP file
        zip_path = os.path.join(temp_dir, "annotated_images.zip")
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for img_path in annotated_images:
                # Store each image flat, under its base name only.
                zipf.write(img_path, os.path.basename(img_path))

        # Process with Gemini
        extracted_data = extract_data_with_gemini(text_file_path, path_to_data_to_extract)

        # Save extracted data to JSON file
        json_path = os.path.join(temp_dir, "extracted_data.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(extracted_data, f, ensure_ascii=False, indent=2)

        # Generate DOCX report
        docx_path = os.path.join(temp_dir, "rapport_extraction.docx")
        generator = RapportGenerator(json_path, docx_path)
        generator.generate_report()

        return text_file_path, zip_path, json_path, docx_path

    except Exception as e:
        # Top-level boundary: keep the traceback in the logs and surface a
        # readable message in the UI, chaining the original cause.
        logger.exception("PDF processing failed")
        raise gr.Error(f"Error processing PDF: {str(e)}") from e


# Gradio Interface
css = """ .gradio-container { font-family: 'IBM Plex Sans', sans-serif; } .gr-button { color: white; border-radius: 8px; background: linear-gradient(45deg, #7928CA, #FF0080); border: none; } """

demo = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(
            label="Télécharger un document PDF",
            file_types=[".pdf"],
            type="filepath"
        )
    ],
    outputs=[
        gr.File(label="Texte extrait (TXT)"),
        gr.File(label="Images annotées (ZIP)"),
        gr.File(label="Données extraites (JSON)"),
        gr.File(label="Rapport généré (DOCX)")  # new output
    ],
    title="Extraction de texte PDF et création d'un rapport DOCX",
    description=""" Téléchargez un document PDF pour : 1. Extraire le contenu textuel 2. Obtenir des images annotées montrant les blocs de texte détectés 3. Extraire des données structurées grâce à une analyse IA 4. Générer un rapport formaté au format DOCX Prend en charge les documents multi-pages et les documents juridiques français. """,
    css=css,
    examples=[],
    cache_examples=False,
    theme=gr.themes.Soft()
)

# Launch the app
if __name__ == "__main__":
    # launch() must be called exactly once on the Interface; chaining a second
    # .launch() on its return value raises AttributeError.
    demo.launch(
        debug=False,
        auth=authenticate
    )