XThomasBU committed on
Commit dbc26b1
1 Parent(s): 849b2ae

commit to add lecture pdfs in context

code/modules/data_loader.py CHANGED
@@ -48,6 +48,27 @@ class DataLoader:
         self.splitter = None
         logger.info("InfoLoader instance created")
 
+    def extract_text_from_pdf(self, pdf_path):
+        text = ""
+        with open(pdf_path, "rb") as file:
+            reader = PyPDF2.PdfReader(file)
+            num_pages = len(reader.pages)
+            for page_num in range(num_pages):
+                page = reader.pages[page_num]
+                text += page.extract_text()
+        return text
+
+    def download_pdf_from_url(self, pdf_url):
+        response = requests.get(pdf_url)
+        if response.status_code == 200:
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+                temp_file.write(response.content)
+                temp_file_path = temp_file.name
+            return temp_file_path
+        else:
+            print("Failed to download PDF from URL:", pdf_url)
+            return None
+
     def get_chunks(self, uploaded_files, weblinks):
         # Main list of all documents
         self.document_chunks_full = []
@@ -78,6 +99,13 @@ class DataLoader:
             logger.info(f"\tNumber of pages after skipping: {len(document_chunks)}")
             return document_chunks
 
+        def get_pdf_from_url(pdf_url: str):
+            temp_pdf_path = self.download_pdf_from_url(pdf_url)
+            if temp_pdf_path:
+                title, document_chunks = get_pdf(temp_pdf_path, pdf_url)
+                os.remove(temp_pdf_path)
+            return title, document_chunks
+
         def get_pdf(temp_file_path: str, title: str):
             """
             Function to process PDF files
@@ -201,7 +229,10 @@ class DataLoader:
 
             # Handle different file types
             if file_type == "pdf":
-                title, document_chunks = get_pdf(file_path, file_name)
+                try:
+                    title, document_chunks = get_pdf(file_path, file_name)
+                except:
+                    title, document_chunks = get_pdf_from_url(file_path)
             elif file_type == "txt":
                 title, document_chunks = get_txt(file_path, file_name)
             elif file_type == "docx":
@@ -215,7 +246,7 @@ class DataLoader:
             if self.config["splitter_options"]["remove_chunks"]:
                 document_chunks = remove_chunks(document_chunks)
 
-            logger.info(f"\t\tExtracted no. of chunks: {len(document_chunks)}")
+            logger.info(f"\t\tExtracted no. of chunks: {len(document_chunks)} from {file_name}")
             self.document_names.append(title)
             self.document_chunks_full.extend(document_chunks)
 
@@ -243,6 +274,7 @@ class DataLoader:
                 self.document_chunks_full.extend(document_chunks)
             except:
                 logger.info(f"\t\tError splitting link {link_index+1} : {link}")
+                exit()
 
         logger.info(
             f"\tNumber of document chunks extracted in total: {len(self.document_chunks_full)}\n\n"
code/modules/helpers.py CHANGED
@@ -36,6 +36,10 @@ class WebpageCrawler:
         soup = BeautifulSoup(html_data, "html.parser")
         list_links = []
         for link in soup.find_all("a", href=True):
+
+            # clean the link
+            # remove empty spaces
+            link["href"] = link["href"].strip()
             # Append to list if new link contains original link
             if str(link["href"]).startswith((str(website_link))):
                 list_links.append(link["href"])
@@ -56,14 +60,19 @@
 
     def get_subpage_links(self, l, base_url):
         for link in tqdm(l):
-            # If not crawled through this page start crawling and get links
-            if l[link] == "Not-checked":
-                dict_links_subpages = self.get_links(link, base_url)
-                # Change the dictionary value of the link to "Checked"
+            print('checking link:', link)
+            if not link.endswith("/"):
                 l[link] = "Checked"
-            else:
-                # Create an empty dictionary in case every link is checked
                 dict_links_subpages = {}
+            else:
+                # If not crawled through this page start crawling and get links
+                if l[link] == "Not-checked":
+                    dict_links_subpages = self.get_links(link, base_url)
+                    # Change the dictionary value of the link to "Checked"
+                    l[link] = "Checked"
+                else:
+                    # Create an empty dictionary in case every link is checked
+                    dict_links_subpages = {}
             # Add new dictionary to old dictionary
             l = {**dict_links_subpages, **l}
         return l
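To illustrate the new guard in `get_subpage_links`: links that do not end in `/` (for example direct PDF links) are now marked `"Checked"` without being crawled into, while directory-style links are still expanded. A minimal sketch with made-up URLs and the `self.get_links(...)` call stubbed out:

```python
# Sketch of the new endswith("/") guard; URLs are made up, get_links is stubbed.
links = {
    "https://site.example/course/": "Not-checked",
    "https://site.example/course/lectures/Lecture01.pdf": "Not-checked",
}

for link in list(links):
    if not link.endswith("/"):
        links[link] = "Checked"   # file-like link: do not crawl into it
    elif links[link] == "Not-checked":
        links[link] = "Checked"   # directory link: here the crawler would call get_links()

print(links)  # both end up "Checked"; only the "/" link would have been expanded
```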
code/modules/vector_db.py CHANGED
@@ -60,6 +60,14 @@ class VectorDB:
             urls = all_urls
         return files, urls
 
+    def clean_url_list(self, urls):
+        # get lecture pdf links
+        lecture_pdfs = [link for link in urls if link.endswith(".pdf")]
+        lecture_pdfs = [link for link in lecture_pdfs if "lecture" in link.lower()]
+        urls = [link for link in urls if link.endswith("/")]  # only keep links that end with a '/'. Extract Files Seperately
+
+        return urls, lecture_pdfs
+
     def create_embedding_model(self):
         self.logger.info("Creating embedding function")
         self.embedding_model_loader = EmbeddingModelLoader(self.config)
@@ -79,6 +87,9 @@
         data_loader = DataLoader(self.config)
         self.logger.info("Loading data")
         files, urls = self.load_files()
+        urls, lecture_pdfs = self.clean_url_list(urls)
+        files += lecture_pdfs
+        files.remove('storage/data/urls.txt')
         document_chunks, document_names = data_loader.get_chunks(files, urls)
         self.logger.info("Completed loading data")
 
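A small sketch (made-up URLs, not part of the commit) of what `clean_url_list` does to the crawled link list: PDF links whose name contains "lecture" are routed to the file loader, and only directory-style URLs ending in `/` are kept as web links.

```python
# Sketch of the clean_url_list filtering on a made-up crawl result.
urls = [
    "https://site.example/course/",
    "https://site.example/course/schedule/",
    "https://site.example/course/lectures/Lecture01.pdf",
    "https://site.example/course/hw1.pdf",   # no "lecture" in the name -> dropped
]

lecture_pdfs = [u for u in urls if u.endswith(".pdf") and "lecture" in u.lower()]
urls = [u for u in urls if u.endswith("/")]

print(lecture_pdfs)  # ['https://site.example/course/lectures/Lecture01.pdf']
print(urls)          # ['https://site.example/course/', 'https://site.example/course/schedule/']
```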
 
requirements.txt CHANGED
@@ -16,3 +16,4 @@ beautifulsoup4==4.12.2
 fake-useragent==1.4.0
 git+https://github.com/huggingface/accelerate.git
 llama-cpp-python
+PyPDF2==3.0.1