from pinecone.grpc import PineconeGRPC
from pinecone import ServerlessSpec

from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore

class PineconeConnector:
    """
    Thin wrapper around a Pinecone vector DB connection and a
    HuggingFace embedding model, exposing an ingestion pipeline.
    """
    def __init__(self, api_key, index_name, embedding):
        self.pinecone_api_key = api_key
        self.vector_db = index_name
        self.embedding = embedding
        print("Connecting to Pinecone DB")
        self.pc = self.connect()
        print("Retrieving Embedder")
        self.embedder = HuggingFaceEmbedding(model_name=embedding)
    
    def connect(self):
        """Open a gRPC connection to Pinecone."""
        return PineconeGRPC(api_key=self.pinecone_api_key)
    
    def create_pipeline(self):
        """Create an ingestion pipeline for the configured index."""
        # Create the index if it does not already exist (exact name match)
        indexes = [i.name for i in self.pc.list_indexes()]
        index_exists = self.vector_db in indexes

        if index_exists:
            print("Index already exists")
        else:
            print("Creating index")
            self.pc.create_index(
                self.vector_db,
                dimension=768,  # must match the embedding model's output dimension
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
            )

        # Initialize your index
        pinecone_index = self.pc.Index(self.vector_db)

        # Initialize VectorStore
        vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
        
        # Create the pipeline (abstracts away adaptive processing and batching)
        pipeline = IngestionPipeline(
            transformations=[
                # Split text into semantically coherent chunks
                # (chunking parameters still need tuning).
                SemanticSplitterNodeParser(
                    buffer_size=10,  # 1 = each sentence is its own node
                    breakpoint_percentile_threshold=95,
                    embed_model=self.embedder,
                ),
                self.embedder,
            ],
            vector_store=vector_store,
        )

        return pipeline

    def run(self):
        """Create the pipeline and return the connection, pipeline, and embedder."""
        pipeline = self.create_pipeline()
        return {
            "connection": self.pc,
            "pipeline": pipeline,
            "embedder": self.embedder,
        }
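

# Example usage (a minimal sketch): the API key, index name, and data path
# below are placeholders, not part of the original module. The model
# "BAAI/bge-base-en-v1.5" is assumed here because it emits 768-dim vectors,
# matching the dimension hard-coded in create_pipeline.
if __name__ == "__main__":
    from llama_index.core import SimpleDirectoryReader

    connector = PineconeConnector(
        api_key="YOUR_PINECONE_API_KEY",    # placeholder
        index_name="my-demo-index",         # placeholder
        embedding="BAAI/bge-base-en-v1.5",  # 768-dim, matches dimension=768
    )
    handles = connector.run()

    # Load documents and push them through the ingestion pipeline;
    # nodes are chunked, embedded, and upserted into the Pinecone index.
    documents = SimpleDirectoryReader("./data").load_data()  # placeholder path
    nodes = handles["pipeline"].run(documents=documents)
    print(f"Ingested {len(nodes)} nodes")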