DevBM committed on
Commit
b4c86a6
•
1 Parent(s): 61e6e50

Add get_text_from_document function for extracting text from any document type

Browse files
Files changed (1) hide show
  1. text_processing.py +103 -41
text_processing.py CHANGED
@@ -1,41 +1,103 @@
1
- import re
2
- import pymupdf
3
- from nltk.tokenize import sent_tokenize
4
-
5
- def get_pdf_text(pdf_file):
6
- doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
7
- text = ""
8
- for page_num in range(doc.page_count):
9
- page = doc.load_page(page_num)
10
- text += page.get_text()
11
- return text
12
-
13
- def clean_text(text):
14
- text = re.sub(r"[^\x00-\x7F]", " ", text)
15
- text = re.sub(r"[\n]", " ", text)
16
- text = re.sub(r'\s+', ' ', text).strip()
17
- text = re.sub(r'[β€œβ€]', '"', text)
18
- text = re.sub(r"[β€˜β€™]", "'", text)
19
- text = text.replace('\xad', '')
20
- text = re.sub(r'[‒–—―]', '-', text)
21
- return text
22
-
23
- # Function to create text chunks
24
- def segment_text(text, max_segment_length=700, batch_size=7):
25
- sentences = sent_tokenize(text)
26
- segments = []
27
- current_segment = ""
28
-
29
- for sentence in sentences:
30
- if len(current_segment) + len(sentence) <= max_segment_length:
31
- current_segment += sentence + " "
32
- else:
33
- segments.append(current_segment.strip())
34
- current_segment = sentence + " "
35
-
36
- if current_segment:
37
- segments.append(current_segment.strip())
38
-
39
- # Create batches
40
- batches = [segments[i:i + batch_size] for i in range(0, len(segments), batch_size)]
41
- return batches
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pymupdf
3
+ from nltk.tokenize import sent_tokenize
4
+ from docx.api import Document
5
+ from pptx import Presentation
6
+ from bs4 import BeautifulSoup
7
+ import pypandoc
8
+
9
def clean_text(text):
    """Normalise extracted text to single-spaced plain ASCII.

    Typographic quotes and dashes are mapped to their ASCII equivalents and
    soft hyphens are removed *before* the remaining non-ASCII characters are
    blanked out; doing the ASCII strip first would turn those characters into
    spaces and make the replacements unreachable.

    Args:
        text: Raw text, possibly containing newlines and non-ASCII characters.

    Returns:
        The text with typographic punctuation normalised, all other non-ASCII
        characters replaced by a space, runs of whitespace (including
        newlines) collapsed to one space, and leading/trailing whitespace
        removed.
    """
    typographic_map = str.maketrans({
        "\u201c": '"',   # left double quotation mark
        "\u201d": '"',   # right double quotation mark
        "\u2018": "'",   # left single quotation mark
        "\u2019": "'",   # right single quotation mark
        "\xad": None,    # soft hyphen: drop so hyphenated words rejoin
        "\u2012": "-",   # figure dash
        "\u2013": "-",   # en dash
        "\u2014": "-",   # em dash
        "\u2015": "-",   # horizontal bar
    })
    text = text.translate(typographic_map)
    # Anything still outside ASCII has no agreed plain-text stand-in.
    text = re.sub(r"[^\x00-\x7F]", " ", text)
    # \s+ also covers newlines, so no separate newline pass is needed.
    return re.sub(r'\s+', ' ', text).strip()
18
+
19
+ # Function to create text chunks
20
def segment_text(text, max_segment_length=700, batch_size=7):
    """Split text into sentence-aligned segments and group them into batches.

    Sentences (as produced by ``sent_tokenize``) are packed greedily into a
    segment until adding the next sentence would exceed
    ``max_segment_length``. A single sentence longer than the limit becomes
    its own segment; it is never split.

    Args:
        text: The text to segment.
        max_segment_length: Upper bound on the pending segment's length,
            counted with one trailing space per sentence already packed.
        batch_size: Number of segments per batch.

    Returns:
        A list of batches, each a list of at most ``batch_size`` segment
        strings. Empty text yields an empty list.
    """
    segments = []
    pending = []          # sentences of the segment being built
    pending_length = 0    # len of the pending segment incl. trailing spaces

    for sentence in sent_tokenize(text):
        if pending_length + len(sentence) > max_segment_length:
            # Flush only when something is pending; otherwise an over-long
            # first sentence would emit a spurious empty segment.
            if pending:
                segments.append(" ".join(pending).strip())
            pending = []
            pending_length = 0
        pending.append(sentence)
        pending_length += len(sentence) + 1

    if pending:
        segments.append(" ".join(pending).strip())

    # Create batches
    return [segments[i:i + batch_size] for i in range(0, len(segments), batch_size)]
38
+
39
def get_pdf_text(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: A binary file-like object; its ``read()`` result is handed
            to PyMuPDF as an in-memory PDF stream.

    Returns:
        The concatenated text of all pages, in page order.
    """
    # Open as a context manager so the document is closed once the text has
    # been collected, even if a page fails to load.
    with pymupdf.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(
            doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        )
46
+
47
+ # Function to get text from a DOCX file
48
def get_doc_text(doc_files):
    """Return the paragraph text of a document opened with ``Document``.

    Args:
        doc_files: Whatever ``Document`` accepts as its source; it is passed
            through unchanged.

    Returns:
        The text of each paragraph in order, each followed by a newline.
    """
    document = Document(doc_files)
    return "".join(para.text + "\n" for para in document.paragraphs)
54
+
55
+ # Function to get text from a PPTX file
56
def get_ppt_text(ppt_files):
    """Return the text of every text-bearing shape in a presentation.

    Args:
        ppt_files: Whatever ``Presentation`` accepts as its source; it is
            passed through unchanged.

    Returns:
        The ``text`` of each shape that has such an attribute, slide by slide
        and shape by shape, each followed by a newline.
    """
    presentation = Presentation(ppt_files)
    collected = []
    for slide in presentation.slides:
        # Shapes without a ``text`` attribute are skipped.
        collected.extend(
            shape.text + "\n" for shape in slide.shapes if hasattr(shape, "text")
        )
    return "".join(collected)
64
+
65
+ # Function to get text from HTML files
66
def get_html_text(html_files):
    """Extract the text content of one or more HTML documents.

    Args:
        html_files: A single filesystem path, a single open file-like object
            (anything with a ``read`` method, such as an uploaded file), or
            an iterable of either. Paths are opened as UTF-8 text.

    Returns:
        The concatenated ``get_text()`` output of every document, in input
        order. An empty iterable yields an empty string.
    """
    def _is_single_source(candidate):
        # A lone path or stream must not be iterated: iterating a string
        # yields characters and iterating a stream yields raw lines.
        return (
            isinstance(candidate, (str, bytes))
            or hasattr(candidate, "__fspath__")
            or hasattr(candidate, "read")
        )

    sources = [html_files] if _is_single_source(html_files) else html_files

    parts = []
    for source in sources:
        if hasattr(source, "read"):
            # Already-open stream: parse it directly, there is no path to open.
            soup = BeautifulSoup(source, 'html.parser')
        else:
            with open(source, 'r', encoding='utf-8') as f:
                soup = BeautifulSoup(f, 'html.parser')
        parts.append(soup.get_text())
    return "".join(parts)
73
+
74
+ # Function to get text from LaTeX files
75
def get_latex_text(latex_files):
    """Convert one or more LaTeX documents to plain text via pandoc.

    Args:
        latex_files: A single filesystem path, a single open file-like object
            (anything with a ``read`` method, such as an uploaded file), or
            an iterable of either.

    Returns:
        The concatenated plain-text conversion of every document, in input
        order. An empty iterable yields an empty string.
    """
    def _is_single_source(candidate):
        # A lone path or stream must not be iterated: iterating a string
        # yields characters and iterating a stream yields raw lines.
        return (
            isinstance(candidate, (str, bytes))
            or hasattr(candidate, "__fspath__")
            or hasattr(candidate, "read")
        )

    sources = [latex_files] if _is_single_source(latex_files) else latex_files

    parts = []
    for source in sources:
        if hasattr(source, "read"):
            # In-memory stream: there is no path for convert_file, so convert
            # the content itself and state the input format explicitly.
            raw = source.read()
            if isinstance(raw, bytes):
                raw = raw.decode("utf-8")
            parts.append(pypandoc.convert_text(raw, 'plain', format='latex'))
        else:
            parts.append(pypandoc.convert_file(source, 'plain'))
    return "".join(parts)
81
+
82
+ # Function to parse text from a file
83
def parse_text(file):
    """Decode the full byte content of an in-memory file as UTF-8.

    Args:
        file: An object whose ``getvalue()`` returns bytes.

    Returns:
        The decoded string.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    return file.getvalue().decode("utf-8")
86
+
87
+ # Function to get text from uploaded documents
88
def get_text_from_document(file):
    """Extract text from an uploaded document, dispatching on its extension.

    The extension is matched case-insensitively, so ``REPORT.PDF`` is handled
    the same as ``report.pdf``.

    Args:
        file: An uploaded file object exposing a ``name`` attribute, or
            ``None`` when nothing was uploaded.

    Returns:
        The text produced by the matching extractor, or an empty string when
        ``file`` is ``None`` or the extension is not recognised.
    """
    if file is None:
        return ""

    filename = file.name.lower()
    if filename.endswith('.pdf'):
        return get_pdf_text(file)
    # NOTE(review): legacy binary .doc/.ppt files are routed to the same
    # readers as .docx/.pptx; confirm those readers actually support them.
    if filename.endswith(('.docx', '.doc')):
        return get_doc_text(file)
    if filename.endswith(('.pptx', '.ppt')):
        return get_ppt_text(file)
    if filename.endswith('.html'):
        return get_html_text(file)
    if filename.endswith('.tex'):
        return get_latex_text(file)
    if filename.endswith('.txt'):
        return parse_text(file)
    return ""