XThomasBU committed
Commit 4fc2bf8
1 Parent(s): 1ef2150

added timeout

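The change is mechanical but useful: a single `TIMEOUT` constant (60 seconds) is defined in `modules/config/constants.py` and passed to every outgoing `requests` call in the data-loading modules, so an unreachable or stalled host fails fast instead of blocking a loader indefinitely. A minimal sketch of the behavior the diff relies on (the URL below is only an illustration, not from the repo):

import requests

TIMEOUT = 60  # seconds, mirroring modules/config/constants.py

try:
    # requests applies the timeout to connecting and to each read separately;
    # if either step exceeds TIMEOUT seconds, a Timeout exception is raised.
    response = requests.get("https://example.com/lectures.html", timeout=TIMEOUT)
    print(response.status_code)
except requests.exceptions.Timeout:
    print("request timed out after", TIMEOUT, "seconds")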
code/modules/config/constants.py CHANGED
@@ -3,6 +3,8 @@ import os
 
 load_dotenv()
 
+TIMEOUT = 60
+
 # API Keys - Loaded from the .env file
 
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
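One detail worth knowing about `requests`: the `timeout` argument also accepts a `(connect, read)` tuple, so a strict connect limit can be combined with a longer read limit. A hedged variant, not what this commit does:

import requests

CONNECT_TIMEOUT = 10   # illustrative values, not from the repo
READ_TIMEOUT = 60

# Fail quickly if the host is unreachable, but allow slower responses once connected.
response = requests.get("https://example.com", timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))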
code/modules/dataloader/data_loader.py CHANGED
@@ -22,6 +22,7 @@ from modules.dataloader.pdf_readers.base import PDFReader
 from modules.dataloader.pdf_readers.llama import LlamaParser
 from modules.dataloader.pdf_readers.gpt import GPTParser
 from modules.dataloader.helpers import get_metadata
+from modules.config.constants import TIMEOUT
 
 logger = logging.getLogger(__name__)
 BASE_DIR = os.getcwd()
@@ -32,7 +33,7 @@ class HTMLReader:
         pass
 
     def read_url(self, url):
-        response = requests.get(url)
+        response = requests.get(url, timeout=TIMEOUT)
         if response.status_code == 200:
             return response.text
         else:
@@ -52,7 +53,7 @@ class HTMLReader:
             absolute_url = urljoin(base_url, href)
             link["href"] = absolute_url
 
-            resp = requests.head(absolute_url)
+            resp = requests.head(absolute_url, timeout=TIMEOUT)
             if resp.status_code != 200:
                 logger.warning(
                     f"Link {absolute_url} is broken. Status code: {resp.status_code}"
@@ -127,7 +128,7 @@ class FileReader:
         return [Document(page_content=self.web_reader.read_html(url))]
 
     def read_tex_from_url(self, tex_url):
-        response = requests.get(tex_url)
+        response = requests.get(tex_url, timeout=TIMEOUT)
        if response.status_code == 200:
            return [Document(page_content=response.text)]
        else:
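Note that `read_url`, `read_tex_from_url`, and the `requests.head` link check only branch on `status_code`; a timeout does not fall into the `else` branch, it raises `requests.exceptions.Timeout` out of the method. A minimal sketch of how a caller might guard against that (the wrapper name is hypothetical, not part of the repo):

import requests

def read_url_safely(reader, url):
    """Hypothetical helper: return page text, or None if the fetch times out."""
    try:
        return reader.read_url(url)
    except requests.exceptions.Timeout:
        return None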
code/modules/dataloader/helpers.py CHANGED
@@ -2,6 +2,7 @@ import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 import tempfile
+from modules.config.constants import TIMEOUT
 
 
 def get_urls_from_file(file_path: str):
@@ -27,11 +28,11 @@ def get_metadata(lectures_url, schedule_url):
     lecture_metadata = {}
 
     # Get the main lectures page content
-    r_lectures = requests.get(lectures_url)
+    r_lectures = requests.get(lectures_url, timeout=TIMEOUT)
     soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")
 
     # Get the main schedule page content
-    r_schedule = requests.get(schedule_url)
+    r_schedule = requests.get(schedule_url, timeout=TIMEOUT)
     soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")
 
     # Find all lecture blocks
@@ -119,7 +120,7 @@ def download_pdf_from_url(pdf_url):
     Returns:
         str: The local file path of the downloaded PDF file.
     """
-    response = requests.get(pdf_url)
+    response = requests.get(pdf_url, timeout=TIMEOUT)
     if response.status_code == 200:
         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
             temp_file.write(response.content)
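`download_pdf_from_url` still loads the whole PDF into memory via `response.content` before writing it out. For large files, a streamed download with the same timeout would bound memory use; a sketch under that assumption, not the repo's current implementation:

import tempfile
import requests

TIMEOUT = 60  # mirrors modules/config/constants.py

def download_pdf_streaming(pdf_url):
    """Hypothetical variant: stream the PDF to a temp file in chunks."""
    with requests.get(pdf_url, timeout=TIMEOUT, stream=True) as response:
        if response.status_code != 200:
            return None
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            for chunk in response.iter_content(chunk_size=8192):
                temp_file.write(chunk)
            return temp_file.name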
code/modules/dataloader/pdf_readers/gpt.py CHANGED
@@ -6,6 +6,7 @@ from io import BytesIO
 from openai import OpenAI
 from pdf2image import convert_from_path
 from langchain.schema import Document
+from modules.config.constants import TIMEOUT
 
 
 class GPTParser:
@@ -59,6 +60,7 @@ class GPTParser:
             "https://api.openai.com/v1/chat/completions",
             headers=headers,
             json=payload,
+            timeout=TIMEOUT,
         )
 
         resp = response.json()
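For the chat-completions POST, a timeout now raises before `response.json()` is reached. Since parsing a single scanned page is retriable, a caller could wrap the request in a small retry loop; a hedged sketch, not something the commit adds:

import time
import requests

TIMEOUT = 60  # mirrors modules/config/constants.py

def post_with_retries(url, headers, payload, retries=2):
    """Hypothetical helper: retry the POST a couple of times on timeout."""
    for attempt in range(retries + 1):
        try:
            return requests.post(url, headers=headers, json=payload, timeout=TIMEOUT)
        except requests.exceptions.Timeout:
            if attempt == retries:
                raise
            time.sleep(2 ** attempt)  # simple exponential backoff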
code/modules/dataloader/pdf_readers/llama.py CHANGED
@@ -2,7 +2,7 @@ import os
 import requests
 from llama_parse import LlamaParse
 from langchain.schema import Document
-from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
+from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY, TIMEOUT
 from modules.dataloader.helpers import download_pdf_from_url
 
 
@@ -52,7 +52,11 @@ class LlamaParser:
         files = [
             (
                 "file",
-                ("file", requests.get(pdf_url).content, "application/octet-stream"),
+                (
+                    "file",
+                    requests.get(pdf_url, timeout=TIMEOUT).content,
+                    "application/octet-stream",
+                ),
             )
         ]
 
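The `requests.get(pdf_url, timeout=TIMEOUT).content` call inside the `files` tuple downloads the whole PDF inline with no error handling around it; a timeout or non-200 response propagates out of the parser. Splitting the fetch out makes that explicit; a sketch only, with the helper name invented here:

import requests

TIMEOUT = 60  # mirrors modules/config/constants.py

def fetch_pdf_bytes(pdf_url):
    """Hypothetical helper: download the PDF and fail loudly on bad responses."""
    response = requests.get(pdf_url, timeout=TIMEOUT)
    response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
    return response.content

# files = [("file", ("file", fetch_pdf_bytes(pdf_url), "application/octet-stream"))]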
code/modules/dataloader/webpage_crawler.py CHANGED
@@ -4,6 +4,7 @@ import asyncio
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urldefrag
+from modules.config.constants import TIMEOUT
 
 
 class WebpageCrawler:
@@ -19,7 +20,7 @@ class WebpageCrawler:
 
     def url_exists(self, url: str) -> bool:
         try:
-            response = requests.head(url)
+            response = requests.head(url, timeout=TIMEOUT)
             return response.status_code == 200
         except requests.ConnectionError:
             return False
@@ -89,7 +90,7 @@ class WebpageCrawler:
 
     def is_webpage(self, url: str) -> bool:
         try:
-            response = requests.head(url, allow_redirects=True)
+            response = requests.head(url, allow_redirects=True, timeout=TIMEOUT)
             content_type = response.headers.get("Content-Type", "").lower()
             return "text/html" in content_type
         except requests.RequestException:
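One interaction worth flagging: `url_exists` catches only `requests.ConnectionError`. In `requests`, `ConnectTimeout` is a subclass of `ConnectionError`, but `ReadTimeout` is not, so a host that accepts the connection and then stalls would now raise out of `url_exists` instead of returning `False`. Catching the broader `RequestException` (as `is_webpage` already does) would cover both cases; a hedged sketch:

import requests

TIMEOUT = 60  # mirrors modules/config/constants.py

def url_exists(url: str) -> bool:
    """Sketch: treat timeouts and connection errors alike as 'not reachable'."""
    try:
        response = requests.head(url, timeout=TIMEOUT)
        return response.status_code == 200
    except requests.RequestException:
        return False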