angry-meow committed on
Commit
d523035
1 Parent(s): eeebf9d

loaded more docs

Browse files
__pycache__/constants.cpython-311.pyc ADDED
Binary file (733 Bytes). View file
 
__pycache__/models.cpython-311.pyc ADDED
Binary file (2.68 kB). View file
 
load_existing_docs.py CHANGED
@@ -1,9 +1,11 @@
1
  import models
2
- import constants
3
- from langchain_experimental.text_splitter import SemanticChunker
4
  from langchain_qdrant import QdrantVectorStore, Qdrant
5
- from langchain_community.document_loaders import PyPDFLoader
6
  from qdrant_client.http.models import VectorParams
 
 
7
 
8
  #qdrant = QdrantVectorStore.from_existing_collection(
9
  # embedding=models.basic_embeddings,
@@ -11,26 +13,91 @@ from qdrant_client.http.models import VectorParams
11
  # url=constants.QDRANT_ENDPOINT,
12
  #)
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  #gather kai's docs
16
- filepaths = ["./test_docs/Employee Statistics FINAL.pdf","./test_docs/Employer Statistics FINAL.pdf"]
17
- all_documents = []
18
- for file in filepaths:
19
- loader = PyPDFLoader(file)
20
- documents = loader.load()
21
- for doc in documents:
22
- doc.metadata = {
23
- "source": file,
24
- "tag": "employee" if "employee" in file.lower() else "employer"
25
- }
26
- all_documents.extend(documents)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  #chunk them
29
- semantic_split_docs = models.semanticChunker.split_documents(all_documents)
30
 
31
 
32
  #add them to the existing qdrant client
33
- collection_name = "kai_test_docs"
34
 
35
  collections = models.qdrant_client.get_collections()
36
  collection_names = [collection.name for collection in collections.collections]
@@ -41,13 +108,13 @@ if collection_name not in collection_names:
41
  vectors_config=VectorParams(size=1536, distance="Cosine")
42
  )
43
 
44
- qdrant_vector_store = Qdrant(
45
  client=models.qdrant_client,
46
  collection_name=collection_name,
47
- embeddings=models.te3_small
48
  )
49
 
50
- qdrant_vector_store.add_documents(semantic_split_docs)
51
 
52
 
53
 
 
1
  import models
2
+ #import constants
3
+ #from langchain_experimental.text_splitter import SemanticChunker
4
  from langchain_qdrant import QdrantVectorStore, Qdrant
5
+ from langchain_community.document_loaders import PyPDFLoader, UnstructuredURLLoader
6
  from qdrant_client.http.models import VectorParams
7
+ import pymupdf
8
+ import requests
9
 
10
  #qdrant = QdrantVectorStore.from_existing_collection(
11
  # embedding=models.basic_embeddings,
 
13
  # url=constants.QDRANT_ENDPOINT,
14
  #)
15
 
16
def extract_links_from_pdf(pdf_path):
    """Return every external URI hyperlink found in the PDF at *pdf_path*.

    Iterates all pages and collects the ``uri`` field of each link
    annotation. Internal links (page jumps) are skipped.

    Args:
        pdf_path: Filesystem path to the PDF to scan.

    Returns:
        list[str]: All URI strings found, in page order (may contain duplicates).
    """
    links = []
    doc = pymupdf.open(pdf_path)
    try:
        for page in doc:
            for link in page.get_links():
                # get_links() returns dicts whose keys depend on the link
                # kind; internal GoTo links have no 'uri' key, so .get()
                # avoids the KeyError that link['uri'] could raise.
                uri = link.get("uri")
                if uri:
                    links.append(uri)
    finally:
        # Release the underlying file handle even if a page fails to parse.
        doc.close()
    return links
24
+
25
def load_documents_from_url(url):
    """Load the document(s) behind *url*, or return ``None`` when it cannot
    (or should not) be loaded.

    - ``.pdf`` URLs are parsed with :class:`PyPDFLoader`.
    - Video content (Content-Type containing ``video``, or any YouTube URL)
      is deliberately skipped — flagged for later handling.
    - Everything else is treated as an HTML page via
      :class:`UnstructuredURLLoader`.

    Args:
        url: The URL to fetch.

    Returns:
        list | None: Loaded LangChain documents, or ``None`` on any
        failure or skip (errors are printed, never raised).
    """
    try:
        # PDF by extension — compare case-insensitively and ignore any
        # query string / fragment so ".../file.PDF?dl=1" still matches.
        bare = url.split('#')[0].split('?')[0]
        if bare.lower().endswith(".pdf"):
            try:
                loader = PyPDFLoader(url)
                return loader.load()
            except Exception as e:
                print(f"Error loading PDF from {url}: {e}")
                return None

        # Fetch only the headers to sniff the content type.
        try:
            # allow_redirects=True: requests.head does NOT follow redirects
            # by default, so without it the Content-Type could describe the
            # redirect response rather than the final resource.
            response = requests.head(url, timeout=10, allow_redirects=True)
            content_type = response.headers.get('Content-Type', '')
        except Exception as e:
            print(f"Error fetching headers from {url}: {e}")
            return None

        # Ignore video content (flagged for now); headers can be any case.
        if 'video' in content_type.lower():
            return None
        if 'youtube' in url:
            return None

        # Otherwise, treat it as an HTML page.
        try:
            loader = UnstructuredURLLoader([url])
            return loader.load()
        except Exception as e:
            print(f"Error loading HTML from {url}: {e}")
            return None
    except Exception as e:
        # Last-resort guard so one bad URL never aborts the whole run.
        print(f"General error loading from {url}: {e}")
        return None
60
+
61
 
62
# Gather Kai's docs: harvest every hyperlink embedded in the source PDFs,
# de-duplicate them, and load each link as a document.
filepaths = [
    "./test_docs/Employee Statistics FINAL.pdf",
    "./test_docs/Employer Statistics FINAL.pdf",
    "./test_docs/Articles To Share.pdf",
]

all_links = []
for pdf_path in filepaths:
    all_links.extend(extract_links_from_pdf(pdf_path))

unique_links = list(set(all_links))
print(unique_links)

documents = []
for link in unique_links:
    doc = load_documents_from_url(link)
    if doc:
        documents.extend(doc)

# Chunk the loaded documents two ways: by embedding-similarity breakpoints
# and by fixed-size recursive character splitting.
semantic_split_docs = models.semanticChunker.split_documents(documents)
RCTS_split_docs = models.RCTS.split_documents(documents)

# Add them to the existing qdrant client.
collection_name = "docs_from_ripped_urls_recursive"

collections = models.qdrant_client.get_collections()
collection_names = [collection.name for collection in collections.collections]
 
108
  vectors_config=VectorParams(size=1536, distance="Cosine")
109
  )
110
 
111
# Attach a vector store to the target collection and push the
# recursively-chunked documents into it.
qdrant_vector_store = QdrantVectorStore(
    client=models.qdrant_client,
    collection_name=collection_name,
    embedding=models.te3_small,
)

qdrant_vector_store.add_documents(RCTS_split_docs)
118
 
119
 
120
 
models.py CHANGED
@@ -3,11 +3,11 @@ from langchain_openai import ChatOpenAI
3
  from langchain.callbacks.manager import CallbackManager
4
  from langchain.callbacks.tracers import LangChainTracer
5
  from langchain_huggingface.embeddings import HuggingFaceEmbeddings
6
- from langchain_community.embeddings import HuggingFaceInstructEmbeddings
7
  from langchain_experimental.text_splitter import SemanticChunker
8
  from langchain_openai.embeddings import OpenAIEmbeddings
9
  from langchain_community.vectorstores import Qdrant
10
  from qdrant_client import QdrantClient
 
11
  import constants
12
  import os
13
 
@@ -24,7 +24,7 @@ opus3 = ChatAnthropic(
24
  api_key=constants.ANTRHOPIC_API_KEY,
25
  temperature=0,
26
  model='claude-3-opus-20240229',
27
- callback_manager=callback_manager
28
  )
29
 
30
  sonnet35 = ChatAnthropic(
@@ -32,7 +32,7 @@ sonnet35 = ChatAnthropic(
32
  temperature=0,
33
  model='claude-3-5-sonnet-20240620',
34
  max_tokens=4096,
35
- callback_manager=callback_manager
36
  )
37
 
38
  gpt4 = ChatOpenAI(
@@ -42,7 +42,7 @@ gpt4 = ChatOpenAI(
42
  timeout=None,
43
  max_retries=2,
44
  api_key=constants.OPENAI_API_KEY,
45
- callback_manager=callback_manager
46
  )
47
 
48
  gpt4o = ChatOpenAI(
@@ -52,7 +52,7 @@ gpt4o = ChatOpenAI(
52
  timeout=None,
53
  max_retries=2,
54
  api_key=constants.OPENAI_API_KEY,
55
- callback_manager=callback_manager
56
  )
57
 
58
  gpt4o_mini = ChatOpenAI(
@@ -62,7 +62,7 @@ gpt4o_mini = ChatOpenAI(
62
  timeout=None,
63
  max_retries=2,
64
  api_key=constants.OPENAI_API_KEY,
65
- callback_manager=callback_manager
66
  )
67
 
68
  basic_embeddings = HuggingFaceEmbeddings(model_name="snowflake/snowflake-arctic-embed-l")
@@ -76,4 +76,11 @@ te3_small = OpenAIEmbeddings(api_key=constants.OPENAI_API_KEY, model="text-embed
76
  semanticChunker = SemanticChunker(
77
  te3_small,
78
  breakpoint_threshold_type="percentile"
 
 
 
 
 
 
 
79
  )
 
3
  from langchain.callbacks.manager import CallbackManager
4
  from langchain.callbacks.tracers import LangChainTracer
5
  from langchain_huggingface.embeddings import HuggingFaceEmbeddings
 
6
  from langchain_experimental.text_splitter import SemanticChunker
7
  from langchain_openai.embeddings import OpenAIEmbeddings
8
  from langchain_community.vectorstores import Qdrant
9
  from qdrant_client import QdrantClient
10
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
11
  import constants
12
  import os
13
 
 
24
  api_key=constants.ANTRHOPIC_API_KEY,
25
  temperature=0,
26
  model='claude-3-opus-20240229',
27
+ callbacks=callback_manager
28
  )
29
 
30
  sonnet35 = ChatAnthropic(
 
32
  temperature=0,
33
  model='claude-3-5-sonnet-20240620',
34
  max_tokens=4096,
35
+ callbacks=callback_manager
36
  )
37
 
38
  gpt4 = ChatOpenAI(
 
42
  timeout=None,
43
  max_retries=2,
44
  api_key=constants.OPENAI_API_KEY,
45
+ callbacks=callback_manager
46
  )
47
 
48
  gpt4o = ChatOpenAI(
 
52
  timeout=None,
53
  max_retries=2,
54
  api_key=constants.OPENAI_API_KEY,
55
+ callbacks=callback_manager
56
  )
57
 
58
  gpt4o_mini = ChatOpenAI(
 
62
  timeout=None,
63
  max_retries=2,
64
  api_key=constants.OPENAI_API_KEY,
65
+ callbacks=callback_manager
66
  )
67
 
68
  basic_embeddings = HuggingFaceEmbeddings(model_name="snowflake/snowflake-arctic-embed-l")
 
76
# Semantic chunker: splits documents at embedding-similarity breakpoints,
# using the percentile strategy to decide where a topic shift occurs.
semanticChunker = SemanticChunker(
    te3_small,
    breakpoint_threshold_type="percentile"
)

# Recursive character splitter: chunks of up to 500 characters with a
# 25-character overlap so context isn't lost at chunk boundaries.
# (Replaces a comment copy-pasted from the LangChain docs that wrongly
# described this as "a really small chunk size, just to show".)
RCTS = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=25,
    length_function=len,
)
test_docs/Articles To Share.pdf ADDED
Binary file (281 kB). View file