"""Extract data from PDF, JSON and Word files and print exam contents.

The extraction helpers return plain text for ``.pdf`` / ``.docx`` files and
the parsed object for ``.json`` files.  When run as a script, every file in
the ``data`` directory is loaded and each JSON exam is printed section by
section.
"""
# Third-party dependencies (install once, e.g. from a notebook cell):
#   pip install python-docx
#   pip install PyPDF2 --upgrade

import json
import os

# The third-party readers are optional at import time so that JSON-only use
# works without them; a clear ImportError is raised when they are needed.
try:
    from PyPDF2 import PdfReader
except ImportError:
    PdfReader = None

try:
    from docx import Document
except ImportError:
    Document = None

# Field names printed for the exam header and for each kind of question.
school_data = ['university', 'department', 'course_code', 'course_title',
               'date', 'duration', 'instructor']
qcm_data = ['question', 'options', 'answer']
short_data = ['question', 'answer']


def extract_from_pdf(pdf_path):
    """Extract text from a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order.

    Raises:
        ImportError: If PyPDF2 is not installed.
    """
    if PdfReader is None:
        raise ImportError("PyPDF2 is required to read PDF files")
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        # Extraction happens while the file is still open.  ``or ""`` guards
        # against a page yielding None instead of a string.
        return "".join(page.extract_text() or "" for page in reader.pages)


def extract_from_json(json_path):
    """Extract data from a JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content (dict, list, ...).

    Raises:
        json.JSONDecodeError: If the file is not valid JSON (a ValueError).
    """
    # Decode explicitly as UTF-8 instead of the platform default encoding.
    with open(json_path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)


def extract_from_word(word_path):
    """Extract text from a Word (.docx) file.

    Args:
        word_path: Path of the .docx file to read.

    Returns:
        The text of every paragraph, each followed by a newline.

    Raises:
        ImportError: If python-docx is not installed.
    """
    if Document is None:
        raise ImportError("python-docx is required to read Word files")
    doc = Document(word_path)
    return "".join(para.text + "\n" for para in doc.paragraphs)


# Lower-cased file extension -> function that reads such a file.
_EXTRACTORS = {
    ".pdf": extract_from_pdf,
    ".json": extract_from_json,
    ".docx": extract_from_word,
}


def extract_data(file_path):
    """Extract data from a file based on its extension.

    The extension is matched case-insensitively, so ``exam.JSON`` is handled
    like ``exam.json``.

    Args:
        file_path: Path of the file to read.

    Returns:
        Text for ``.pdf`` / ``.docx`` files, parsed content for ``.json``.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.json`` or ``.docx``.
    """
    _, file_extension = os.path.splitext(file_path)
    extractor = _EXTRACTORS.get(file_extension.lower())
    if extractor is None:
        raise ValueError("Unsupported file extension: " + file_extension)
    return extractor(file_path)


def create_data_dictionary(files):
    """Create a dictionary containing data from files based on their extension.

    Args:
        files: An iterable of file paths.  A single path given as a string
            (or ``os.PathLike``) is treated as a one-element list rather than
            being iterated character by character.

    Returns:
        A dict mapping each successfully read path to its extracted data.
        Files that raise ``ValueError`` (unsupported extension, or invalid
        JSON since ``json.JSONDecodeError`` is a ``ValueError``) are reported
        with ``print`` and left out of the result.
    """
    if isinstance(files, (str, os.PathLike)):
        files = [files]
    data_dict = {}
    for file_path in files:
        try:
            data_dict[file_path] = extract_data(file_path)
        except ValueError as e:
            print(e)
    return data_dict


def _print_questions(questions, fields):
    """Print the given fields of every question, prefixed with its index."""
    for idx, question in enumerate(questions):
        for field in fields:
            print(f" Index is: {idx} and '{field.capitalize()}': {question[field]}")


def print_exam(exam):
    """Print the header and the three question sections of one exam.

    Args:
        exam: A dict with the keys ``header``, ``multiple_choice_questions``,
            ``short_answer_questions`` and ``long_answer_questions``.

    Raises:
        KeyError: If a section or one of its expected fields is missing.
    """
    # Look up every section first so a malformed exam fails before any output.
    header = exam['header']
    multiple_choice_questions = exam['multiple_choice_questions']
    short_answer_questions = exam['short_answer_questions']
    long_answer_questions = exam['long_answer_questions']

    for s_data in school_data:
        print(f" {s_data}: {header[s_data]}")
    print("***************'school data'************************")

    _print_questions(multiple_choice_questions, qcm_data)
    print("***************'multiple_choice_questions'************************")

    _print_questions(short_answer_questions, short_data)
    print("***************' END short_answer_questions'************************")

    print("***************' START long_answer_questions'************************")
    _print_questions(long_answer_questions, short_data)
    print("***************' END long_answer_questions'************************")


def main(exam_dir='data'):
    """Load every file in *exam_dir* and print each JSON exam found.

    The original usage example passed the bare string ``'data'`` as the file
    list and read an undefined ``exam_data`` variable; here ``'data'`` is
    treated as the directory holding the exam files.

    Args:
        exam_dir: Directory containing the exam files.

    Raises:
        OSError: If *exam_dir* cannot be listed.
        KeyError: If a JSON object in the directory is not exam-shaped.
    """
    exam_files = sorted(
        os.path.join(exam_dir, name)
        for name in os.listdir(exam_dir)
        if os.path.isfile(os.path.join(exam_dir, name))
    )
    print(exam_files)
    data_dict = create_data_dictionary(exam_files)
    for file_data in data_dict.values():
        # Only parsed JSON objects carry the exam structure; PDF and Word
        # files yield plain text and are skipped here.
        if isinstance(file_data, dict):
            print_exam(file_data)


if __name__ == "__main__":
    main()