angry-meow committed
Commit d523035 • Parent(s): eeebf9d

loaded more docs
Files changed:
- __pycache__/constants.cpython-311.pyc +0 -0
- __pycache__/models.cpython-311.pyc +0 -0
- load_existing_docs.py +86 -19
- models.py +13 -6
- test_docs/Articles To Share.pdf +0 -0
__pycache__/constants.cpython-311.pyc
ADDED • Binary file (733 Bytes)

__pycache__/models.cpython-311.pyc
ADDED • Binary file (2.68 kB)
load_existing_docs.py
CHANGED

@@ -1,9 +1,11 @@
 import models
-import constants
-from langchain_experimental.text_splitter import SemanticChunker
+#import constants
+#from langchain_experimental.text_splitter import SemanticChunker
 from langchain_qdrant import QdrantVectorStore, Qdrant
-from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import PyPDFLoader, UnstructuredURLLoader
 from qdrant_client.http.models import VectorParams
+import pymupdf
+import requests
 
 #qdrant = QdrantVectorStore.from_existing_collection(
 #    embedding=models.basic_embeddings,
@@ -11,26 +13,91 @@ from qdrant_client.http.models import VectorParams
 #    url=constants.QDRANT_ENDPOINT,
 #)
 
+def extract_links_from_pdf(pdf_path):
+    links = []
+    doc = pymupdf.open(pdf_path)
+    for page in doc:
+        for link in page.get_links():
+            if link['uri']:
+                links.append(link['uri'])
+    return links
+
+def load_documents_from_url(url):
+    try:
+        # Check if it's a PDF
+        if url.endswith(".pdf"):
+            try:
+                loader = PyPDFLoader(url)
+                return loader.load()
+            except Exception as e:
+                print(f"Error loading PDF from {url}: {e}")
+                return None
+
+        # Fetch the content and check for video pages
+        try:
+            response = requests.head(url, timeout=10)  # Timeout for fetching headers
+            content_type = response.headers.get('Content-Type', '')
+        except Exception as e:
+            print(f"Error fetching headers from {url}: {e}")
+            return None
+
+        # Ignore video content (flagged for now)
+        if 'video' in content_type:
+            return None
+        if 'youtube' in url:
+            return None
+
+        # Otherwise, treat it as an HTML page
+        try:
+            loader = UnstructuredURLLoader([url])
+            return loader.load()
+        except Exception as e:
+            print(f"Error loading HTML from {url}: {e}")
+            return None
+    except Exception as e:
+        print(f"General error loading from {url}: {e}")
+        return None
+
 
 #gather kai's docs
-filepaths = ["./test_docs/Employee Statistics FINAL.pdf","./test_docs/Employer Statistics FINAL.pdf"]
-
-
-
-
-
-
-
-
-
-
+filepaths = ["./test_docs/Employee Statistics FINAL.pdf","./test_docs/Employer Statistics FINAL.pdf","./test_docs/Articles To Share.pdf"]
+
+all_links = []
+for pdf_path in filepaths:
+    all_links.extend(extract_links_from_pdf(pdf_path))
+
+unique_links = list(set(all_links))
+print(unique_links)
+
+documents = []
+for link in unique_links:
+    doc = load_documents_from_url(link)
+    #print(f"loaded doc from {link}")
+    if doc:
+        documents.extend(doc)
+
+
+#print(len(documents))
+semantic_split_docs = models.semanticChunker.split_documents(documents)
+RCTS_split_docs = models.RCTS.split_documents(documents)
+
+
+#for file in filepaths:
+#    loader = PyPDFLoader(file)
+#    documents = loader.load()
+#    for doc in documents:
+#        doc.metadata = {
+#            "source": file,
+#            "tag": "employee" if "employee" in file.lower() else "employer"
+#        }
+#    all_documents.extend(documents)
 
 #chunk them
-semantic_split_docs = models.semanticChunker.split_documents(all_documents)
+#semantic_split_docs = models.semanticChunker.split_documents(all_documents)
 
 
 #add them to the existing qdrant client
-collection_name = "
+collection_name = "docs_from_ripped_urls_recursive"
 
 collections = models.qdrant_client.get_collections()
 collection_names = [collection.name for collection in collections.collections]
@@ -41,13 +108,13 @@ if collection_name not in collection_names:
         vectors_config=VectorParams(size=1536, distance="Cosine")
     )
 
-qdrant_vector_store =
+qdrant_vector_store = QdrantVectorStore(
     client=models.qdrant_client,
     collection_name=collection_name,
-
+    embedding=models.te3_small
 )
 
-qdrant_vector_store.add_documents(
+qdrant_vector_store.add_documents(RCTS_split_docs)
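Note on the new extract_links_from_pdf: PyMuPDF's page.get_links() returns one dict per link, and the 'uri' key is only present for external URI links, so indexing link['uri'] can raise a KeyError on PDFs that contain internal (GOTO) links. A minimal hardened sketch, not part of the commit (the _safe name is hypothetical):

import pymupdf

def extract_links_from_pdf_safe(pdf_path):
    # Collect only external URI links; dict.get() returns None for
    # internal GOTO links, which carry no 'uri' key at all.
    links = []
    with pymupdf.open(pdf_path) as doc:
        for page in doc:
            for link in page.get_links():
                uri = link.get('uri')
                if uri:
                    links.append(uri)
    return links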
models.py
CHANGED

@@ -3,11 +3,11 @@ from langchain_openai import ChatOpenAI
 from langchain.callbacks.manager import CallbackManager
 from langchain.callbacks.tracers import LangChainTracer
 from langchain_huggingface.embeddings import HuggingFaceEmbeddings
-from langchain_community.embeddings import HuggingFaceInstructEmbeddings
 from langchain_experimental.text_splitter import SemanticChunker
 from langchain_openai.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import Qdrant
 from qdrant_client import QdrantClient
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 import constants
 import os
 
@@ -24,7 +24,7 @@ opus3 = ChatAnthropic(
     api_key=constants.ANTRHOPIC_API_KEY,
     temperature=0,
     model='claude-3-opus-20240229',
-
+    callbacks=callback_manager
 )
 
 sonnet35 = ChatAnthropic(
@@ -32,7 +32,7 @@ sonnet35 = ChatAnthropic(
     temperature=0,
     model='claude-3-5-sonnet-20240620',
     max_tokens=4096,
-
+    callbacks=callback_manager
 )
 
 gpt4 = ChatOpenAI(
@@ -42,7 +42,7 @@ gpt4 = ChatOpenAI(
     timeout=None,
     max_retries=2,
     api_key=constants.OPENAI_API_KEY,
-
+    callbacks=callback_manager
 )
 
 gpt4o = ChatOpenAI(
@@ -52,7 +52,7 @@ gpt4o = ChatOpenAI(
     timeout=None,
     max_retries=2,
     api_key=constants.OPENAI_API_KEY,
-
+    callbacks=callback_manager
 )
 
 gpt4o_mini = ChatOpenAI(
@@ -62,7 +62,7 @@ gpt4o_mini = ChatOpenAI(
     timeout=None,
     max_retries=2,
     api_key=constants.OPENAI_API_KEY,
-
+    callbacks=callback_manager
 )
 
 basic_embeddings = HuggingFaceEmbeddings(model_name="snowflake/snowflake-arctic-embed-l")
@@ -76,4 +76,11 @@ te3_small = OpenAIEmbeddings(api_key=constants.OPENAI_API_KEY, model="text-embed
 semanticChunker = SemanticChunker(
     te3_small,
     breakpoint_threshold_type="percentile"
+)
+
+RCTS = RecursiveCharacterTextSplitter(
+    # Set a really small chunk size, just to show.
+    chunk_size=500,
+    chunk_overlap=25,
+    length_function=len,
 )
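Every chat model above now receives callbacks=callback_manager, but the manager's definition falls outside the hunks shown. Given the CallbackManager and LangChainTracer imports kept at the top of models.py, a plausible construction would look like the sketch below; this is an assumption, not something the diff confirms:

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.tracers import LangChainTracer

# One shared manager so every model invocation is traced; LangChainTracer
# reads its LangSmith project and API key from the environment.
callback_manager = CallbackManager([LangChainTracer()])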
test_docs/Articles To Share.pdf
ADDED • Binary file (281 kB)
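Once load_existing_docs.py has populated docs_from_ripped_urls_recursive, reading it back mirrors the write path in that script. A minimal retrieval sketch under that assumption (the query string is illustrative only):

import models
from langchain_qdrant import QdrantVectorStore

# Re-open the collection this commit populates and run a similarity search.
store = QdrantVectorStore(
    client=models.qdrant_client,
    collection_name="docs_from_ripped_urls_recursive",
    embedding=models.te3_small,
)
for doc in store.similarity_search("employee statistics", k=3):
    print(doc.metadata.get("source"), doc.page_content[:80])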