from pinecone.grpc import PineconeGRPC
from pinecone import ServerlessSpec

from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore

class PineconeConnector:
    """
    Thin wrapper around a Pinecone vector DB connection and a
    HuggingFace embedding model, exposing an ingestion pipeline.
    """
    def __init__(self, api_key, index_name, embedding):
        self.pinecone_api_key = api_key
        self.vector_db = index_name
        self.embedding = embedding
        print("Connecting to Pinecone DB")
        self.pc = self.connect()
        print("Retrieving Embedder")
        self.embedder = HuggingFaceEmbedding(model_name=embedding)
    
    def connect(self):
        """Open a gRPC connection to Pinecone."""
        return PineconeGRPC(api_key=self.pinecone_api_key)
    
    def create_pipeline(self):
        """Create an ingestion pipeline for the configured index."""
        # Create the index if it does not already exist (exact name match)
        indexes = [i.name for i in self.pc.list_indexes()]
        index_exists = self.vector_db in indexes

        if index_exists:
            print("Index already exists")
        else:
            print("Creating index")
            self.pc.create_index(
                self.vector_db,
                dimension=768,  # must match the embedding model's output dimension
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
            )

        # Initialize your index
        pinecone_index = self.pc.Index(self.vector_db)

        # Initialize VectorStore
        vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
        
        # Create the pipeline (abstracts away adaptive processing and batching)
        pipeline = IngestionPipeline(
            transformations=[
                # Split text into semantically coherent chunks
                # (chunking parameters still need tuning).
                SemanticSplitterNodeParser(
                    buffer_size=10,  # 1 = each sentence is its own node
                    breakpoint_percentile_threshold=95,
                    embed_model=self.embedder,
                ),
                self.embedder,
            ],
            vector_store=vector_store,
        )

        return pipeline

    def run(self):
        """Create the pipeline and return the connection, pipeline, and embedder."""
        pipeline = self.create_pipeline()
        return {
            "connection": self.pc,
            "pipeline": pipeline,
            "embedder": self.embedder,
        }
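

# Example usage (a minimal sketch): the API key, index name, and data path
# below are placeholders, not part of the original module. The model
# "BAAI/bge-base-en-v1.5" is assumed here because it emits 768-dim vectors,
# matching the dimension hard-coded in create_pipeline.
if __name__ == "__main__":
    from llama_index.core import SimpleDirectoryReader

    connector = PineconeConnector(
        api_key="YOUR_PINECONE_API_KEY",    # placeholder
        index_name="my-demo-index",         # placeholder
        embedding="BAAI/bge-base-en-v1.5",  # 768-dim, matches dimension=768
    )
    handles = connector.run()

    # Load documents and push them through the ingestion pipeline;
    # nodes are chunked, embedded, and upserted into the Pinecone index.
    documents = SimpleDirectoryReader("./data").load_data()  # placeholder path
    nodes = handles["pipeline"].run(documents=documents)
    print(f"Ingested {len(nodes)} nodes")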