File size: 2,749 Bytes
9a2b6d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# import re
# import os
# from xml.etree import ElementTree as ET
# from xml.dom import minidom

# NOTE(review): this function is commented out, so nothing below runs when the
# file is imported or executed.
#
# If re-enabled, process_dorian_grey() would:
#   - create the 'texts/processed' directory when it is missing,
#   - read 'texts/dorian_grey.txt' as UTF-8,
#   - collect every span that starts at a "CHAPTER <roman-or-digit numeral>."
#     heading and runs up to the next such heading (or the end of the text);
#     text before the first heading is not captured by the pattern,
#   - store each span, stripped, as the text of a <chapter> element under a
#     <book> root, and
#   - write the pretty-printed XML to 'texts/processed/dorian_grey.xml'.
# It takes no arguments, returns nothing, and prints the output path.
#
# NOTE(review): enumerate() starts at 0, so the first chapter would get
# id "chapter_0" AND title "Chapter 0". process_time_machine in this file
# keeps 0-based ids but titles chapters from 1 -- confirm which numbering is
# intended before re-enabling.
# def process_dorian_grey():
#     # Create processed directory if it doesn't exist
#     os.makedirs('texts/processed', exist_ok=True)
    
#     # Read the file
#     with open('texts/dorian_grey.txt', 'r', encoding='utf-8') as f:
#         text = f.read()
    
#     # Create root XML element
#     root = ET.Element("book")
#     root.set("title", "The Picture of Dorian Gray")
    
#     # Split into chapters using regex
#     # Look for chapter markers and keep them with the content
#     # (the lazy body plus the lookahead stops each match right before the
#     # next heading; re.DOTALL lets '.' span newlines)
#     chapter_pattern = r'(CHAPTER [IVXLC\d]+\..*?)(?=CHAPTER [IVXLC\d]+\.|$)'
#     chapters = re.findall(chapter_pattern, text, re.DOTALL)
    
#     # Process chapters
#     for i, content in enumerate(chapters):
#         # Create chapter element
#         chapter = ET.SubElement(root, "chapter")
#         chapter.set("id", f"chapter_{i}")
#         chapter.set("title", f"Chapter {i}")
#         chapter.text = content.strip()
    
#     # Pretty print XML
#     # (serialize with ElementTree, then re-parse with minidom only to get
#     # indented output)
#     xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent="  ")
    
#     # Save as XML
#     output_path = 'texts/processed/dorian_grey.xml'
#     with open(output_path, 'w', encoding='utf-8') as f:
#         f.write(xml_str)
    
#     print(f"Processed and saved to {output_path}")

# NOTE(review): this function is commented out, so nothing below runs when the
# file is imported or executed.
#
# If re-enabled, process_time_machine() would:
#   - create the 'texts/processed' directory when it is missing,
#   - read 'texts/time_machine.txt' as UTF-8,
#   - split the text wherever four or more consecutive newlines occur,
#   - turn every piece that is non-empty after stripping into a <chapter>
#     element under a <book> root (ids are 0-based, titles are 1-based), and
#   - write the pretty-printed XML to 'texts/processed/time_machine.xml'.
# It takes no arguments, returns nothing, and prints the output path.
#
# NOTE(review): the split has no notion of chapter headings, so any front or
# back matter separated by 4+ newlines would also be emitted as a "chapter" --
# verify against the actual input file.
# def process_time_machine():
#     # Create processed directory if it doesn't exist
#     os.makedirs('texts/processed', exist_ok=True)
    
#     # Read the file
#     with open('texts/time_machine.txt', 'r', encoding='utf-8') as f:
#         text = f.read()
    
#     # Create root XML element
#     root = ET.Element("book")
#     root.set("title", "The Time Machine")
    
#     # Split into chapters using 4 or more newlines as separator
#     chapters = re.split(r'\n{4,}', text)
    
#     # Track actual chapter number (no skipping)
#     # (a separate counter is used instead of the loop index so that skipped
#     # empty pieces do not leave gaps in the numbering)
#     chapter_num = 1
    
#     # Process chapters
#     for content in chapters:
#         if content.strip():  # Only process non-empty chapters
#             # Create chapter element
#             chapter = ET.SubElement(root, "chapter")
#             chapter.set("id", f"chapter_{chapter_num-1}")  # Keep 0-based ids
#             chapter.set("title", f"Chapter {chapter_num}")
#             chapter.text = content.strip()
#             chapter_num += 1
    
#     # Pretty print XML
#     # (serialize with ElementTree, then re-parse with minidom only to get
#     # indented output)
#     xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent="  ")
    
#     # Save as XML
#     output_path = 'texts/processed/time_machine.xml'
#     with open(output_path, 'w', encoding='utf-8') as f:
#         f.write(xml_str)
    
#     print(f"Processed and saved to {output_path}")

# NOTE(review): disabled script entry point. If re-enabled, running this file
# directly would call only process_time_machine(); process_dorian_grey() is
# not invoked from here.
# if __name__ == "__main__":
#     process_time_machine()