tldw / App_Function_Libraries /Plaintext /XML_Ingestion_Lib.py
oceansweep's picture
Upload 169 files
c5b0bb7 verified
# XML_Ingestion_Lib.py
# Description: This file contains functions for reading XML files, extracting their text, and importing them into the media database.
# Imports
import logging
import xml.etree.ElementTree as ET
#
# External Imports
#
# Local Imports
from App_Function_Libraries.Gradio_UI.Import_Functionality import import_data
from App_Function_Libraries.Summarization.Summarization_General_Lib import perform_summarization
from App_Function_Libraries.Chunk_Lib import chunk_xml
from App_Function_Libraries.DB.DB_Manager import add_media_to_database
#
#######################################################################################################################
#
# Functions:
def xml_to_text(xml_file):
    """Return the text content of an XML document, one fragment per line.

    Every non-blank piece of character data is collected in document order,
    stripped of surrounding whitespace, and joined with newlines. This
    includes both the text at the start of an element and the "tail" text
    that follows a child element, so mixed content such as
    ``<p>Hello <b>world</b> again</p>`` yields all three fragments.

    Args:
        xml_file: A filename or file object, as accepted by
            ``xml.etree.ElementTree.parse``.

    Returns:
        The newline-joined text (an empty string when the document holds no
        non-blank text), or ``None`` if the document is not well-formed XML.

    Errors other than ``ET.ParseError`` (for example a missing file) are not
    handled here and propagate to the caller.
    """
    # Keep the try body minimal: only parsing can raise ParseError.
    try:
        root = ET.parse(xml_file).getroot()
    except ET.ParseError as e:
        logging.error(f"Error parsing XML file: {str(e)}")
        return None

    # itertext() walks the tree in document order and yields tail text as
    # well as element text; iterating elements and reading only `.text`
    # would silently drop anything that follows a child element.
    fragments = (fragment.strip() for fragment in root.itertext())
    return '\n'.join(fragment for fragment in fragments if fragment)
def import_xml_handler(import_file, title, author, keywords, system_prompt,
                       custom_prompt, auto_summarize, api_name, api_key):
    """Import an uploaded XML file: chunk it, optionally summarize, and store it.

    The file at ``import_file.name`` is parsed, serialized back to a string
    and handed to ``chunk_xml``; the resulting chunks are stored through
    ``add_media_to_database`` with media type ``"xml_document"``.

    Args:
        import_file: Uploaded-file object exposing the path as ``.name``.
            A falsy value short-circuits with a prompt to upload a file.
        title: Document title; falls back to ``'Untitled XML Document'``.
        author: Stored as the uploader; falls back to ``'Unknown'``.
        keywords: Comma-separated keyword string (may be empty/None).
        system_prompt: Accepted for interface compatibility; not used in
            this function body.
            NOTE(review): possibly meant to be forwarded to
            perform_summarization - confirm against that function's signature.
        custom_prompt: Prompt passed to the summarizer and stored with the
            database entry.
        auto_summarize: When truthy (and both ``api_name`` and ``api_key``
            are truthy), the joined chunk text is summarized.
        api_name: Summarization API identifier.
        api_key: Credential passed to ``perform_summarization``.

    Returns:
        A human-readable status string: a success message containing the
        database result, or an error message. Exceptions are logged and
        reported in the returned string rather than raised.
    """
    # Guard clause: nothing was uploaded.
    if not import_file:
        return "Please upload an XML file"

    try:
        # Parse the document; the root element is reused for chunking and
        # for recording the document's top-level tag.
        source_path = import_file.name
        root = ET.parse(source_path).getroot()

        # Chunking parameters (sizes are fixed here; adjust as needed).
        chunk_options = dict(
            method='xml',
            max_size=1000,
            overlap=200,
            language='english',
        )

        # chunk_xml receives the serialized document, not the parsed tree.
        xml_string = ET.tostring(root, encoding='unicode')
        chunks = chunk_xml(xml_string, chunk_options)

        # Reshape chunks into the segment dicts passed to
        # add_media_to_database, keeping the per-chunk metadata.
        segments = [
            {'Text': piece['text'], 'metadata': piece['metadata']}
            for piece in chunks
        ]

        info_dict = {
            'title': title if title else 'Untitled XML Document',
            'uploader': author if author else 'Unknown',
            'file_type': 'xml',
            'structure': root.tag,  # tag of the root element
        }

        # Split the comma-separated keywords, dropping blank entries.
        keyword_list = []
        if keywords:
            for raw_keyword in keywords.split(','):
                cleaned = raw_keyword.strip()
                if cleaned:
                    keyword_list.append(cleaned)

        # Summarize only when requested and API details are present.
        summary = "No summary provided"
        if auto_summarize and api_name and api_key:
            full_text = '\n'.join(piece['text'] for piece in chunks)
            summary = perform_summarization(api_name, full_text, custom_prompt, api_key)

        # Persist; the file name stands in for the URL.
        result = add_media_to_database(
            url=source_path,
            info_dict=info_dict,
            segments=segments,
            summary=summary,
            keywords=keyword_list,
            custom_prompt_input=custom_prompt,
            whisper_model="XML Import",
            media_type="xml_document",
            overwrite=False,
        )
        return f"XML file '{source_path}' import complete. Database result: {result}"
    except ET.ParseError as e:
        detail = str(e)
        logging.error(f"XML parsing error: {detail}")
        return f"Error parsing XML file: {detail}"
    except Exception as e:
        # Top-level handler boundary: report the failure as a status string.
        detail = str(e)
        logging.error(f"Error processing XML file: {detail}")
        return f"Error processing XML file: {detail}"
#
# End of XML_Ingestion_Lib.py
#######################################################################################################################