ShebMichel committed on
Commit
aeba1c4
1 Parent(s): e251c7d

Update exam_data_scrapper.py

Browse files
Files changed (1) hide show
  1. exam_data_scrapper.py +91 -89
exam_data_scrapper.py CHANGED
@@ -1,90 +1,92 @@
1
- #!pip install python-docx
2
- #!pip install PyPDF2 --upgrade
3
-
4
-
5
- import os
6
- import json
7
- from PyPDF2 import PdfReader
8
- from docx import Document
9
-
10
- def extract_from_pdf(pdf_path):
11
- """Extract text from a PDF file."""
12
- pdf_data = ""
13
- with open(pdf_path, "rb") as pdf_file:
14
- reader = PdfReader(pdf_file)
15
- for page_num in range(len(reader.pages)):
16
- page = reader.pages[page_num]
17
- pdf_data += page.extract_text()
18
- return pdf_data
19
-
20
- def extract_from_json(json_path):
21
- """Extract data from a JSON file."""
22
- with open(json_path, "r") as json_file:
23
- json_data = json.load(json_file)
24
- return json_data
25
-
26
- def extract_from_word(word_path):
27
- """Extract text from a Word (.docx) file."""
28
- doc = Document(word_path)
29
- word_data = ""
30
- for para in doc.paragraphs:
31
- word_data += para.text + "\n"
32
- return word_data
33
-
34
- def extract_data(file_path):
35
- """Extract data from a file based on its extension."""
36
- _, file_extension = os.path.splitext(file_path)
37
-
38
- if file_extension == ".pdf":
39
- return extract_from_pdf(file_path)
40
- elif file_extension == ".json":
41
- return extract_from_json(file_path)
42
- elif file_extension == ".docx":
43
- return extract_from_word(file_path)
44
- else:
45
- raise ValueError("Unsupported file extension: " + file_extension)
46
-
47
- def create_data_dictionary(files):
48
- """Create a dictionary containing data from files based on their extension."""
49
- data_dict = {}
50
- for file_path in files:
51
- try:
52
- file_data = extract_data(file_path)
53
- data_dict[file_path] = file_data
54
- except ValueError as e:
55
- print(e)
56
- return data_dict
57
-
58
- # Usage example
59
- path = r'C:\Users\00110138\OneDrive - The University of Western Australia\Project\KaggleX FellowshipProgram\code\Exam_Data'
60
- # Usage example
61
- files = [str(path)+"/Geology_Geophysics_Exam.pdf", str(path)+"/Geology_Geophysics_Exam.json", str(path)+"/Geology_Geophysics_Exam.docx"]
62
- exam_data = [files[1]]
63
- data_dict = create_data_dictionary(exam_data)
64
- ##
65
- school_data = ['university','department','course_code','course_title','date','duration','instructor']
66
- qcm_data = ['question','options', 'answer']
67
- short_data = ['question','answer']
68
- #print(data_dict[str(exam_data[0])]['multiple_choice_questions'])
69
- multiple_choice_questions = data_dict[str(exam_data[0])]['multiple_choice_questions']
70
- short_answer_questions = data_dict[str(exam_data[0])]['short_answer_questions']
71
- long_answer_questions = data_dict[str(exam_data[0])]['long_answer_questions']
72
-
73
- for s_data in school_data:
74
- print(f" {s_data}: {data_dict[str(exam_data[0])]['header'][str(s_data)]}")
75
- print(f"***************'school data'************************")
76
-
77
- for idx,qcm in enumerate(multiple_choice_questions):
78
- print(f" Index is: {idx} and 'Question': {qcm['question']}")
79
- print(f" Index is: {idx} and 'Options': {qcm['options']}")
80
- print(f" Index is: {idx} and 'Answer': {qcm['answer']}")
81
- print(f"***************'multiple_choice_questions'************************")
82
- for idx,qcm in enumerate(short_answer_questions):
83
- print(f" Index is: {idx} and 'Question': {qcm['question']}")
84
- print(f" Index is: {idx} and 'Answer': {qcm['answer']}")
85
- print(f"***************' END short_answer_questions'************************")
86
- print(f"***************' START long_answer_questions'************************")
87
- for idx,qcm in enumerate(long_answer_questions):
88
- print(f" Index is: {idx} and 'Question': {qcm['question']}")
89
- print(f" Index is: {idx} and 'Answer': {qcm['answer']}")
 
 
90
  print(f"***************' END long_answer_questions'************************")
 
1
+ #!pip install python-docx
2
+ #!pip install PyPDF2 --upgrade
3
+
4
+
5
+ import os
6
+ import json
7
+ from PyPDF2 import PdfReader
8
+ from docx import Document
9
+
10
def extract_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        # Build the result inside the `with` block so the file is still open
        # while the pages are read. `or ""` keeps the join safe should a page
        # yield no text object.
        return "".join(page.extract_text() or "" for page in reader.pages)
19
+
20
def extract_from_json(json_path):
    """Load and return the parsed content of a JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file content.

    Raises:
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform default encoding.
    # "utf-8-sig" also strips a leading byte-order mark, which json.load
    # would otherwise reject.
    with open(json_path, "r", encoding="utf-8-sig") as json_file:
        return json.load(json_file)
25
+
26
def extract_from_word(word_path):
    """Return the text of a Word (.docx) document.

    Args:
        word_path: Path of the .docx file to read.

    Returns:
        The text of every paragraph in document order, each followed by a
        newline character.
    """
    document = Document(word_path)
    return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)
33
+
34
def extract_data(file_path):
    """Extract data from a file, choosing the reader from its extension.

    The extension is matched case-insensitively, so ``.PDF`` and ``.pdf``
    are handled alike.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text of a ``.pdf`` or ``.docx`` file, or the parsed content of a
        ``.json`` file.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.json`` or ``.docx``.
    """
    _, file_extension = os.path.splitext(file_path)
    normalized = file_extension.lower()

    if normalized == ".pdf":
        return extract_from_pdf(file_path)
    if normalized == ".json":
        return extract_from_json(file_path)
    if normalized == ".docx":
        return extract_from_word(file_path)
    # Report the extension exactly as it appeared in the path.
    raise ValueError("Unsupported file extension: " + file_extension)
46
+
47
def create_data_dictionary(files):
    """Build a dictionary mapping each file path to its extracted data.

    Args:
        files: An iterable of file paths. A single path given as a string or
            ``os.PathLike`` is treated as a one-element list rather than
            being iterated character by character.

    Returns:
        A dict keyed by file path. Files whose extraction raises
        ``ValueError`` are reported with ``print`` and left out of the
        result.
    """
    if isinstance(files, (str, os.PathLike)):
        files = [files]

    data_dict = {}
    for file_path in files:
        try:
            data_dict[file_path] = extract_data(file_path)
        except ValueError as e:
            # Best effort: report the problem and carry on with the rest.
            print(e)
    return data_dict
57
+
58
# Usage example: load one exam file and print its header and questions.
path = ''
# Directory (relative to `path`) that holds the exam files.
exam_files = 'data'
# NOTE(review): the exam file name is an assumption taken from the earlier
# hard-coded path -- confirm it matches the file stored in `exam_files`.
exam_data = [os.path.join(path, exam_files, 'Geology_Geophysics_Exam.json')]
print(exam_files)
data_dict = create_data_dictionary(exam_data)

# create_data_dictionary skips files it could not read, so fail with a clear
# message here instead of a KeyError further down.
exam_record = data_dict.get(exam_data[0])
if exam_record is None:
    raise SystemExit(f"Could not load exam data from {exam_data[0]!r}")

# Keys expected in the exam JSON.
school_data = ['university','department','course_code','course_title','date','duration','instructor']
qcm_data = ['question','options', 'answer']
short_data = ['question','answer']
multiple_choice_questions = exam_record['multiple_choice_questions']
short_answer_questions = exam_record['short_answer_questions']
long_answer_questions = exam_record['long_answer_questions']

for s_data in school_data:
    print(f" {s_data}: {exam_record['header'][str(s_data)]}")
print("***************'school data'************************")

for idx, qcm in enumerate(multiple_choice_questions):
    print(f" Index is: {idx} and 'Question': {qcm['question']}")
    print(f" Index is: {idx} and 'Options': {qcm['options']}")
    print(f" Index is: {idx} and 'Answer': {qcm['answer']}")
print("***************'multiple_choice_questions'************************")
for idx, qcm in enumerate(short_answer_questions):
    print(f" Index is: {idx} and 'Question': {qcm['question']}")
    print(f" Index is: {idx} and 'Answer': {qcm['answer']}")
print("***************' END short_answer_questions'************************")
print("***************' START long_answer_questions'************************")
for idx, qcm in enumerate(long_answer_questions):
    print(f" Index is: {idx} and 'Question': {qcm['question']}")
    print(f" Index is: {idx} and 'Answer': {qcm['answer']}")
print("***************' END long_answer_questions'************************")