tykiww committed on
Commit d69c6f7
1 Parent(s): 31c1456

Create connections/pinecone.py

Files changed (1)
  1. connections/pinecone.py +71 -0
connections/pinecone.py ADDED
@@ -0,0 +1,71 @@
+ from pinecone.grpc import PineconeGRPC
+ from pinecone import ServerlessSpec
+
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ from llama_index.vector_stores import PineconeVectorStore
+ from llama_index.node_parser import SemanticSplitterNodeParser
+ from llama_index.ingestion import IngestionPipeline
+
+ class PineconeConnector:
+     """
+     Manages the connection to the Pinecone vector database service.
+     """
+     def __init__(self, api_key, index_name, embedding):
+         self.pinecone_api_key = api_key
+         self.vector_db = index_name
+         self.embedding = embedding
+         print("Connecting to Pinecone DB")
+         self.pc = self.connect()
+         print("Retrieving Embedder")
+         self.embedder = HuggingFaceEmbedding(model_name=embedding)
+
+     def connect(self):
+         """Connects to the vector store."""
+         # connect with the gRPC client
+         pc = PineconeGRPC(api_key=self.pinecone_api_key)
+         return pc
+
+     def create_pipeline(self):
+         """Creates an ingestion pipeline for the configured index."""
+         # Create the index if it does not already exist
+         indexes = [i.name for i in self.pc.list_indexes()]
+         index_exists = any(self.vector_db in i for i in indexes)
+
+         if index_exists:
+             print("Index already exists")
+         else:
+             print("Creating index")
+             self.pc.create_index(
+                 self.vector_db,
+                 dimension=768,
+                 metric="cosine",
+                 spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+             )
+
+         # Initialize the index
+         pinecone_index = self.pc.Index(self.vector_db)
+
+         # Initialize the vector store
+         vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
+
+         # Create the pipeline (abstracts away adaptive processing and batching)
+         pipeline = IngestionPipeline(
+             transformations=[
+                 # Split documents into semantically coherent chunks (chunking still needs tuning).
+                 SemanticSplitterNodeParser(
+                     buffer_size=10,  # 1 = each sentence is a node
+                     breakpoint_percentile_threshold=95,
+                     embed_model=self.embedder,
+                 ),
+                 self.embedder,
+             ],
+             vector_store=vector_store,
+         )
+
+         return pipeline
+
+     def run(self):
+         """Creates the pipeline; returns the service, the pipeline, and the embedder."""
+         pipeline = self.create_pipeline()
+         return self.pc, pipeline, self.embedder
+
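
For context, a minimal usage sketch of the connector follows; the API key, index name, and embedding model are illustrative placeholders, not values from this commit.

# Hypothetical usage sketch: placeholder values only.
connector = PineconeConnector(
    api_key="YOUR_PINECONE_API_KEY",                      # placeholder key
    index_name="demo-index",                              # placeholder index name
    embedding="sentence-transformers/all-mpnet-base-v2",  # a 768-dim model, matching dimension=768
)
pc, pipeline, embedder = connector.run()
# Documents can then be ingested through the pipeline, e.g. pipeline.run(documents=docs)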