dh-mc committed
Commit: 25ef847
Parent: 036b9f7

added Faiss support

.env.example CHANGED
@@ -57,7 +57,8 @@ LLAMACPP_MODEL_PATH="./models/wizardLM-7B.ggmlv3.q4_1.bin"
 LLAMACPP_DOWNLOAD_LINK=https://huggingface.co/TheBloke/wizardLM-7B-GGML/resolve/main/wizardLM-7B.ggmlv3.q4_1.bin
 
 # Index for AI Books PDF files - chunk_size=1024 chunk_overlap=512
-CHROMADB_INDEX_PATH="./data/chromadb_1024_512/"
+# CHROMADB_INDEX_PATH="./data/chromadb_1024_512/"
+FAISS_INDEX_PATH="./data/faiss_1024_512/"
 
 QUESTIONS_FILE_PATH="./data/questions.txt"
 
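Not part of the commit, but for context: a minimal sketch of how these .env values could be consumed, assuming the app loads them with python-dotenv; it mirrors the selection logic added to ingest.py below.

import os

from dotenv import load_dotenv  # assumption: python-dotenv is used to load .env

load_dotenv()  # read .env into the process environment

# FAISS is used when FAISS_INDEX_PATH is set; otherwise fall back to the Chroma path.
index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
print(f"index_path={index_path} using_faiss={using_faiss}")
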
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/faiss_1024_512/index.faiss filter=lfs diff=lfs merge=lfs -text
+data/faiss_1024_512/index.pkl filter=lfs diff=lfs merge=lfs -text
data/faiss_1024_512/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcb86f8f32c953c7d5c99662a27e43d6261da7b7b4342bac638e6d19bf7ee530
+size 78975021
data/faiss_1024_512/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:313a047fb82ef5c43661a12b2424aeae88688d7631e4bdaf7de283a0b0763dc9
+size 26672894
ingest.py CHANGED
@@ -6,7 +6,9 @@ from typing import List
 from langchain.document_loaders import PyPDFDirectoryLoader
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores.base import VectorStore
 from langchain.vectorstores.chroma import Chroma
+from langchain.vectorstores.faiss import FAISS
 
 from app_modules.utils import *
 
@@ -24,13 +26,23 @@ def split_chunks(documents: List, chunk_size, chunk_overlap) -> List:
     return text_splitter.split_documents(documents)
 
 
-def generate_index(chunks: List, embeddings: HuggingFaceInstructEmbeddings) -> Chroma:
-    chromadb_instructor_embeddings = Chroma.from_documents(
-        documents=chunks, embedding=embeddings, persist_directory=index_path
-    )
+def generate_index(
+    chunks: List, embeddings: HuggingFaceInstructEmbeddings
+) -> VectorStore:
+    if using_faiss:
+        faiss_instructor_embeddings = FAISS.from_documents(
+            documents=chunks, embedding=embeddings
+        )
+
+        faiss_instructor_embeddings.save_local(index_path)
+        return faiss_instructor_embeddings
+    else:
+        chromadb_instructor_embeddings = Chroma.from_documents(
+            documents=chunks, embedding=embeddings, persist_directory=index_path
+        )
 
-    chromadb_instructor_embeddings.persist()
-    return chromadb_instructor_embeddings
+        chromadb_instructor_embeddings.persist()
+        return chromadb_instructor_embeddings
 
 
 # Constants
@@ -40,7 +52,8 @@ device_type, hf_pipeline_device_type = get_device_types()
 hf_embeddings_model_name = (
     os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
 )
-index_path = os.environ.get("CHROMADB_INDEX_PATH")
+index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
+using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
 source_pdfs_path = os.environ.get("SOURCE_PDFS_PATH")
 chunk_size = os.environ.get("CHUNCK_SIZE")
 chunk_overlap = os.environ.get("CHUNK_OVERLAP")
@@ -71,7 +84,11 @@ if not os.path.isdir(index_path):
     index = generate_index(chunks, embeddings)
 else:
     print("The index persist directory is present. Loading index ...")
-    index = Chroma(embedding_function=embeddings, persist_directory=index_path)
+    index = (
+        FAISS.load_local(index_path, embeddings)
+        if using_faiss
+        else Chroma(embedding_function=embeddings, persist_directory=index_path)
+    )
 
 end = timer()
 
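Not part of the commit, but for context: a minimal sketch of querying the FAISS index that ingest.py now saves with save_local(); the query string and k value below are made up for illustration.

import os

from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores.faiss import FAISS

# Defaults mirror .env.example; adjust to your environment.
index_path = os.environ.get("FAISS_INDEX_PATH") or "./data/faiss_1024_512/"
embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")

# Load the persisted index and run a similarity search over the ingested PDF chunks.
index = FAISS.load_local(index_path, embeddings)
for doc in index.similarity_search("What is attention in transformers?", k=4):
    print(doc.metadata.get("source"), doc.page_content[:80])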