XThomasBU committed on
Commit
4308a1a
1 Parent(s): e17a5d0

remove hard coded values

Browse files
code/modules/config/{user_config.yml → project_config.yml} RENAMED
@@ -1,3 +1,7 @@
1
  retriever:
2
  retriever_hf_paths:
3
  RAGatouille: "XThomasBU/Colbert_Index"
 
 
 
 
 
1
  retriever:
2
  retriever_hf_paths:
3
  RAGatouille: "XThomasBU/Colbert_Index"
4
+
5
+ metadata:
6
+ metada_links: ["https://dl4ds.github.io/sp2024/lectures/", "https://dl4ds.github.io/sp2024/schedule/"]
7
+ slide_base_link: "https://dl4ds.github.io"
code/modules/dataloader/data_loader.py CHANGED
@@ -222,8 +222,7 @@ class ChunkProcessor:
222
 
223
  def chunk_docs(self, file_reader, uploaded_files, weblinks):
224
  addl_metadata = get_metadata(
225
- "https://dl4ds.github.io/sp2024/lectures/",
226
- "https://dl4ds.github.io/sp2024/schedule/",
227
  ) # For any additional metadata
228
 
229
  # remove already processed files if reparse_files is False
@@ -426,6 +425,12 @@ if __name__ == "__main__":
426
  with open("../code/modules/config/config.yml", "r") as f:
427
  config = yaml.safe_load(f)
428
 
 
 
 
 
 
 
429
  STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
430
  uploaded_files = [
431
  os.path.join(STORAGE_DIR, file)
@@ -434,6 +439,7 @@ if __name__ == "__main__":
434
  ]
435
 
436
  data_loader = DataLoader(config, logger=logger)
 
437
  document_chunks, document_names, documents, document_metadata = (
438
  data_loader.get_chunks(
439
  [
 
222
 
223
  def chunk_docs(self, file_reader, uploaded_files, weblinks):
224
  addl_metadata = get_metadata(
225
+ *self.config["metadata"]["metada_links"], self.config
 
226
  ) # For any additional metadata
227
 
228
  # remove already processed files if reparse_files is False
 
425
  with open("../code/modules/config/config.yml", "r") as f:
426
  config = yaml.safe_load(f)
427
 
428
+ with open("../code/modules/config/project_config.yml", "r") as f:
429
+ project_config = yaml.safe_load(f)
430
+
431
+ # Combine project config with the main config
432
+ config.update(project_config)
433
+
434
  STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
435
  uploaded_files = [
436
  os.path.join(STORAGE_DIR, file)
 
439
  ]
440
 
441
  data_loader = DataLoader(config, logger=logger)
442
+ # Just for testing
443
  document_chunks, document_names, documents, document_metadata = (
444
  data_loader.get_chunks(
445
  [
code/modules/dataloader/helpers.py CHANGED
@@ -21,7 +21,8 @@ def get_base_url(url):
21
  return base_url
22
 
23
 
24
- def get_metadata(lectures_url, schedule_url):
 
25
  """
26
  Function to get the lecture metadata from the lectures and schedule URLs.
27
  """
@@ -50,7 +51,9 @@ def get_metadata(lectures_url, schedule_url):
50
  slides_link_tag = description_div.find("a", title="Download slides")
51
  slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
52
  slides_link = (
53
- f"https://dl4ds.github.io{slides_link}" if slides_link else None
 
 
54
  )
55
  if slides_link:
56
  date_mapping[slides_link] = date
@@ -70,7 +73,9 @@ def get_metadata(lectures_url, schedule_url):
70
  slides_link_tag = block.find("a", title="Download slides")
71
  slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
72
  slides_link = (
73
- f"https://dl4ds.github.io{slides_link}" if slides_link else None
 
 
74
  )
75
 
76
  # Extract the link to the lecture recording
 
21
  return base_url
22
 
23
 
24
+ ### THIS FUNCTION IS NOT GENERALIZABLE; IT IS SPECIFIC TO THE COURSE WEBSITE ###
25
+ def get_metadata(lectures_url, schedule_url, config):
26
  """
27
  Function to get the lecture metadata from the lectures and schedule URLs.
28
  """
 
51
  slides_link_tag = description_div.find("a", title="Download slides")
52
  slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
53
  slides_link = (
54
+ f"{config['metadata']['slide_base_link']}{slides_link}"
55
+ if slides_link
56
+ else None
57
  )
58
  if slides_link:
59
  date_mapping[slides_link] = date
 
73
  slides_link_tag = block.find("a", title="Download slides")
74
  slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
75
  slides_link = (
76
+ f"{config['metadata']['slide_base_link']}{slides_link}"
77
+ if slides_link
78
+ else None
79
  )
80
 
81
  # Extract the link to the lecture recording
code/modules/vectorstore/store_manager.py CHANGED
@@ -168,19 +168,21 @@ if __name__ == "__main__":
168
 
169
  with open("modules/config/config.yml", "r") as f:
170
  config = yaml.safe_load(f)
171
- with open("modules/config/user_config.yml", "r") as f:
172
- user_config = yaml.safe_load(f)
 
 
 
173
  print(config)
174
- print(user_config)
175
  print(f"Trying to create database with config: {config}")
176
  vector_db = VectorStoreManager(config)
177
  if config["vectorstore"]["load_from_HF"]:
178
  if (
179
  config["vectorstore"]["db_option"]
180
- in user_config["retriever"]["retriever_hf_paths"]
181
  ):
182
  vector_db.load_from_HF(
183
- HF_PATH=user_config["retriever"]["retriever_hf_paths"][
184
  config["vectorstore"]["db_option"]
185
  ]
186
  )
 
168
 
169
  with open("modules/config/config.yml", "r") as f:
170
  config = yaml.safe_load(f)
171
+ with open("modules/config/project_config.yml", "r") as f:
172
+ project_config = yaml.safe_load(f)
173
+
174
+ # combine the two configs
175
+ config.update(project_config)
176
  print(config)
 
177
  print(f"Trying to create database with config: {config}")
178
  vector_db = VectorStoreManager(config)
179
  if config["vectorstore"]["load_from_HF"]:
180
  if (
181
  config["vectorstore"]["db_option"]
182
+ in config["retriever"]["retriever_hf_paths"]
183
  ):
184
  vector_db.load_from_HF(
185
+ HF_PATH=config["retriever"]["retriever_hf_paths"][
186
  config["vectorstore"]["db_option"]
187
  ]
188
  )
docs/setup.md CHANGED
@@ -124,4 +124,4 @@ CHAINLIT_URL=<your_chainlit_url>
124
  # Configuration
125
 
126
  The configuration file `code/modules/config.yaml` contains the parameters that control the behaviour of your app.
127
- The configuration file `code/modules/user_config.yaml` contains user-defined parameters.
 
124
  # Configuration
125
 
126
  The configuration file `code/modules/config.yaml` contains the parameters that control the behaviour of your app.
127
+ The configuration file `code/modules/config/project_config.yml` contains project-specific parameters.