# Media_Wiki.py
# Description: This file contains the functions to import MediaWiki dumps into the media_db and Chroma databases.
#######################################################################################################################
#
# Imports
import json
import logging
import os
import re
import traceback
from typing import List, Dict, Any, Iterator, Optional
# 3rd-Party Imports
import mwparserfromhell
import mwxml
import yaml
#
# Local Imports
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content
#
#######################################################################################################################
#
# Functions:
# Load configuration
def load_mediawiki_import_config():
    config_path = os.path.join(os.path.dirname(__file__), '..', '..', 'Config_Files', 'mediawiki_import_config.yaml')
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)

config = load_mediawiki_import_config()
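
# A minimal sketch of the expected YAML shape (illustrative only; the real
# settings live in Config_Files/mediawiki_import_config.yaml). The one key
# this module reads directly is config['chunking'], whose 'max_size' drives
# optimized_chunking below:
#
#   chunking:
#     max_size: 1000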


def setup_logger(name: str, level: int = logging.INFO, log_file: Optional[str] = None) -> logging.Logger:
    """Set up and return a logger with the given name and level."""
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # Guard against duplicate handlers when this is called more than once
    # for the same logger name (e.g. on module re-import).
    if not logger.handlers:
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

        if log_file:
            file_handler = logging.FileHandler(log_file)
            file_handler.setFormatter(formatter)
            logger.addHandler(file_handler)

        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    return logger

# Usage
logger = setup_logger('mediawiki_import', log_file='mediawiki_import.log')

# End of setup
#######################################################################################################################
#
# Functions:


def parse_mediawiki_dump(file_path: str, namespaces: Optional[List[int]] = None,
                         skip_redirects: bool = False) -> Iterator[Dict[str, Any]]:
    """Stream pages from a MediaWiki XML dump, yielding one dict per revision."""
    with open(file_path, encoding='utf-8') as f:
        dump = mwxml.Dump.from_file(f)
        for page in dump.pages:
            if skip_redirects and page.redirect:
                continue
            if namespaces and page.namespace not in namespaces:
                continue

            for revision in page:
                # revision.text can be None (e.g. for deleted revisions); skip those.
                if revision.text is None:
                    continue
                wikicode = mwparserfromhell.parse(revision.text)
                plain_text = wikicode.strip_code()
                yield {
                    "title": page.title,
                    "content": plain_text,
                    "namespace": page.namespace,
                    "page_id": page.id,
                    "revision_id": revision.id,
                    "timestamp": revision.timestamp
                }
            logger.debug(f"Yielded page: {page.title}")
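
# Example usage (illustrative; the dump path is an assumption, and namespace 0
# is MediaWiki's main article namespace):
#
#   for page in parse_mediawiki_dump("dumps/mywiki-pages-articles.xml",
#                                    namespaces=[0], skip_redirects=True):
#       print(page["title"], page["revision_id"], len(page["content"]))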


def optimized_chunking(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
    # re.split with a capturing group returns:
    #   [intro_text, title_1, body_1, title_2, body_2, ...]
    # so titles sit at odd indices, each followed by its section body.
    sections = re.split(r'\n==\s*(.*?)\s*==\n', text)
    chunks = []
    current_chunk = sections[0]
    current_size = len(sections[0])
    current_section = "Introduction"

    logging.debug(f"optimized_chunking: Processing text with {(len(sections) - 1) // 2} sections")
    for i in range(1, len(sections) - 1, 2):
        section_title = sections[i]
        section_content = sections[i + 1]

        if current_chunk and current_size + len(section_content) > chunk_options['max_size']:
            chunks.append({"text": current_chunk, "metadata": {"section": current_section}})
            current_chunk = f"\n== {section_title} ==\n" + section_content
            current_size = len(section_content)
            current_section = section_title
        else:
            current_chunk += f"\n== {section_title} ==\n" + section_content
            current_size += len(section_content)

    if current_chunk:
        chunks.append({"text": current_chunk, "metadata": {"section": current_section}})

    return chunks
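
# Example (illustrative): split article wikitext into section-aligned chunks.
# 'max_size' is the only key this function reads from chunk_options; the value
# 1000 below is an assumed example, not a project default:
#
#   chunks = optimized_chunking(article_text, {"max_size": 1000})
#   for chunk in chunks:
#       print(chunk["metadata"]["section"], len(chunk["text"]))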


def process_single_item(content: str, title: str, wiki_name: str, chunk_options: Dict[str, Any],
                        is_combined: bool = False, item: Optional[Dict[str, Any]] = None,
                        api_name: Optional[str] = None):
    try:
        logging.debug(f"process_single_item: Processing item: {title}")

        # Create a unique URL using the wiki name and article title
        encoded_title = title.replace(" ", "_")
        url = f"mediawiki:{wiki_name}:{encoded_title}"
        logging.debug(f"Generated URL: {url}")

        result = add_media_with_keywords(
            url=url,  # Use the generated URL here
            title=title,
            media_type="mediawiki_dump" if is_combined else "mediawiki_article",
            content=content,
            keywords=f"mediawiki,{wiki_name}" + (",full_dump" if is_combined else ",article"),
            prompt="",
            summary="",
            transcription_model="",
            author="MediaWiki",
            ingestion_date=item['timestamp'].strftime('%Y-%m-%d') if item else None
        )
        logging.debug(f"Result from add_media_with_keywords: {result}")

        # Unpack the result
        media_id, message = result
        logging.info(f"Media item result: {message}")
        logging.debug(f"Final media_id: {media_id}")

        chunks = optimized_chunking(content, chunk_options)
        for i, chunk in enumerate(chunks):
            logging.debug(f"Processing chunk {i + 1}/{len(chunks)} for item: {title}")

            # Signature for reference:
            # process_and_store_content(content, collection_name, media_id, file_name,
            #                           create_embeddings=False, create_summary=False, api_name=None)
            if api_name:
                process_and_store_content(chunk['text'], f"mediawiki_{wiki_name}", media_id, title,
                                          create_embeddings=True, create_summary=True, api_name=api_name)
            else:
                process_and_store_content(chunk['text'], f"mediawiki_{wiki_name}", media_id, title)
        logging.info(f"Successfully processed item: {title}")
    except Exception as e:
        logging.error(f"Error processing item {title}: {str(e)}")
        logging.error(f"Exception details: {traceback.format_exc()}")


def load_checkpoint(file_path: str) -> int:
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r') as f:
                return json.load(f)['last_processed_id']
        except (json.JSONDecodeError, KeyError):
            logger.warning(f"Invalid checkpoint file {file_path}; restarting from the beginning")
    return 0


def save_checkpoint(file_path: str, last_processed_id: int):
    with open(file_path, 'w') as f:
        json.dump({'last_processed_id': last_processed_id}, f)
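
# Checkpoint files are plain JSON of the form {"last_processed_id": 12345},
# so an interrupted import can resume from the last completed page_id.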


def import_mediawiki_dump(
        file_path: str,
        wiki_name: str,
        namespaces: Optional[List[int]] = None,
        skip_redirects: bool = False,
        chunk_options: Optional[Dict[str, Any]] = None,
        single_item: bool = False,
        progress_callback: Any = None,
        api_name: Optional[str] = None,
        api_key: Optional[str] = None
) -> Iterator[str]:
    try:
        logging.info(f"Importing MediaWiki dump: {file_path}")
        if chunk_options is None:
            chunk_options = config['chunking']

        checkpoint_file = f"{wiki_name}_import_checkpoint.json"
        last_processed_id = load_checkpoint(checkpoint_file)

        total_pages = count_pages(file_path, namespaces, skip_redirects)
        processed_pages = 0

        yield f"Found {total_pages} pages to process."

        for item in parse_mediawiki_dump(file_path, namespaces, skip_redirects):
            if item['page_id'] <= last_processed_id:
                continue
            # FIXME - api_key is accepted by this function but not yet forwarded.
            if api_name is not None:
                process_single_item(item['content'], item['title'], wiki_name, chunk_options, False, item, api_name)
            else:
                process_single_item(item['content'], item['title'], wiki_name, chunk_options, False, item)
            save_checkpoint(checkpoint_file, item['page_id'])
            processed_pages += 1
            if progress_callback is not None:
                progress_callback(processed_pages / total_pages, f"Processed page: {item['title']}")
            yield f"Processed page {processed_pages}/{total_pages}: {item['title']}"

        if os.path.exists(checkpoint_file):
            os.remove(checkpoint_file)  # Remove checkpoint file after successful import
        yield f"Successfully imported and indexed MediaWiki dump: {wiki_name}"
    except FileNotFoundError:
        logger.error(f"MediaWiki dump file not found: {file_path}")
        yield f"Error: File not found - {file_path}"
    except PermissionError:
        logger.error(f"Permission denied when trying to read: {file_path}")
        yield f"Error: Permission denied - {file_path}"
    except Exception as e:
        logger.exception(f"Error during MediaWiki import: {str(e)}")
        yield f"Error during import: {str(e)}"
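
# Example usage (illustrative; the dump path and wiki name are assumptions).
# import_mediawiki_dump is a generator, so it only does work while iterated:
#
#   for status in import_mediawiki_dump("dumps/mywiki.xml", "mywiki",
#                                       namespaces=[0], skip_redirects=True):
#       print(status)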

def count_pages(file_path: str, namespaces: Optional[List[int]] = None, skip_redirects: bool = False) -> int:
    """
    Count the number of pages in a MediaWiki XML dump file.

    Args:
        file_path (str): Path to the MediaWiki XML dump file.
        namespaces (List[int], optional): Namespace IDs to include. If None, include all namespaces.
        skip_redirects (bool, optional): Whether to skip redirect pages.

    Returns:
        int: The number of pages in the dump file.
    """
    try:
        with open(file_path, encoding='utf-8') as f:
            dump = mwxml.Dump.from_file(f)
            count = 0
            for page in dump.pages:
                if skip_redirects and page.redirect:
                    continue
                if namespaces and page.namespace not in namespaces:
                    continue
                count += 1
            return count
    except Exception as e:
        logger.error(f"Error counting pages in MediaWiki dump: {str(e)}")
        return 0

#
# End of Media_Wiki.py
#######################################################################################################################