import os
import mimetypes

from fastapi import UploadFile

from app.parser.parsers import *
from app.schemas.document import Document


async def get_document_from_file(file: UploadFile, temp_file_path="/tmp/temp_file"):
    """Build a ``Document`` from the text content of an uploaded file.

    The upload is written to ``temp_file_path``, handed to
    ``extract_text_with_mimetype`` together with the upload's declared
    content type, and the temporary file is removed afterwards whether or
    not extraction succeeded.

    Args:
        file: The uploaded file; its ``content_type`` selects the parser.
        temp_file_path: Where the upload is staged on disk while parsing.
            NOTE(review): the default is a fixed path, so concurrent calls
            relying on it would overwrite each other's data -- callers that
            may run concurrently should pass a unique path.

    Returns:
        A ``Document`` whose ``text`` is the extracted text.

    Raises:
        Exception: "Couldn't get document from file" if text extraction
            fails for any reason; the underlying error is attached as
            ``__cause__``.
    """
    mimetype = file.content_type
    stream = await file.read()

    # Use a distinct name for the handle so the ``file`` parameter is not
    # shadowed by the temporary file object.
    with open(temp_file_path, "wb") as temp_file:
        temp_file.write(stream)

    try:
        parsed_text = await extract_text_with_mimetype(temp_file_path, mimetype)
    except Exception as e:
        # Keep the original exception type/message for callers, but chain the
        # cause so the real failure is visible in tracebacks.
        raise Exception("Couldn't get document from file") from e
    finally:
        # Single cleanup point for both the success and the failure path.
        os.remove(temp_file_path)

    return Document(
        text=parsed_text,
    )


async def extract_text_with_mimetype(file_path, mimetype):
    """Extract text from ``file_path`` using the parser matching ``mimetype``.

    Supported types are PDF (``application/pdf``), plain text
    (``text/plain``) and DOCX
    (``application/vnd.openxmlformats-officedocument.wordprocessingml.document``).

    Args:
        file_path: Path of the file to parse.
        mimetype: Declared media type of the file. When missing or empty it
            is guessed from ``file_path`` via ``mimetypes.guess_type``. Any
            parameters (e.g. ``; charset=utf-8``) and letter case are
            ignored when matching.

    Returns:
        Whatever the selected parser's ``parse`` returns (presumably the
        extracted text -- confirm against the parser implementations).

    Raises:
        Exception: "Unsupported file type" if the type cannot be determined
            or no parser handles it.
    """
    if not mimetype:
        mimetype, _ = mimetypes.guess_type(file_path)

    if not mimetype:
        raise Exception("Unsupported file type")

    # Content-Type values may carry parameters and are case-insensitive;
    # reduce to the bare lowercase "type/subtype" before comparing.
    mimetype = mimetype.split(";", 1)[0].strip().lower()

    if mimetype == "application/pdf":
        parsed_text = PdfParser.parse(file_path)
    elif mimetype == "text/plain":
        parsed_text = TxtParser.parse(file_path)
    elif (
        mimetype
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        parsed_text = DocxParser.parse(file_path)
    else:
        # Previously this fell through to ``return parsed_text`` with the name
        # unbound, surfacing as an UnboundLocalError.
        raise Exception("Unsupported file type")

    return parsed_text