Spaces:
Sleeping
Sleeping
File size: 2,592 Bytes
d69c6f7 0b69e0d d69c6f7 c052420 d69c6f7 0b69e0d a9edefa 0b69e0d d69c6f7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
from pinecone.grpc import PineconeGRPC
from pinecone import ServerlessSpec
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores import PineconeVectorStore
from llama_index.node_parser import SemanticSplitterNodeParser
from llama_index.ingestion import IngestionPipeline
class PineconeConnector:
    """Wrapper around a Pinecone index and a llama_index ingestion pipeline.

    Constructing an instance opens a gRPC Pinecone client and loads a
    HuggingFace embedding model. ``run`` (or ``create_pipeline``) then makes
    sure the target index exists and builds an ``IngestionPipeline`` that
    writes into it.
    """

    def __init__(self, api_key, index_name, embedding, dimension=768):
        """Connect to Pinecone and load the embedding model.

        Args:
            api_key: Pinecone API key used to create the client.
            index_name: Name of the Pinecone index to use; it is created by
                ``create_pipeline`` if it does not exist yet.
            embedding: Model name passed to ``HuggingFaceEmbedding``.
            dimension: Vector dimension used only when the index has to be
                created. Defaults to 768, the previously hard-coded value;
                it should match the output size of the embedding model.
        """
        self.pinecone_api_key = api_key
        self.vector_db = index_name
        self.embedding = embedding
        self.dimension = dimension
        print("Connecting to Pinecone DB")
        self.pc = self.connect()
        print("Retrieving Embedder")
        self.embedder = HuggingFaceEmbedding(model_name=embedding)

    def connect(self):
        """Create and return a ``PineconeGRPC`` client for the stored API key."""
        return PineconeGRPC(api_key=self.pinecone_api_key)

    def _index_exists(self):
        """Return True if an index named exactly ``self.vector_db`` exists."""
        # Compare names for equality. A substring test would report "docs" as
        # present when only "docs-v2" exists, and index creation would then
        # be skipped.
        return any(
            index.name == self.vector_db for index in self.pc.list_indexes()
        )

    def create_pipeline(self):
        """Ensure the index exists and build the ingestion pipeline for it.

        Returns:
            An ``IngestionPipeline`` that splits documents with
            ``SemanticSplitterNodeParser``, embeds the resulting nodes with
            ``self.embedder`` and stores them in the Pinecone index.
        """
        # Create the index only when no index with this exact name exists.
        if self._index_exists():
            print("Index already exists")
        else:
            print("Creating index")
            self.pc.create_index(
                self.vector_db,
                dimension=self.dimension,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
            )

        # Handle to the index, wrapped as a llama_index vector store.
        pinecone_index = self.pc.Index(self.vector_db)
        vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

        # The pipeline abstracts away the need to adaptively process and
        # batch documents.
        pipeline = IngestionPipeline(
            transformations=[
                # Chunking step; the cut-off parameters below still need
                # tuning (carried over from the original note).
                SemanticSplitterNodeParser(
                    # NOTE(review): per the llama_index docs this is the
                    # number of sentences grouped together when evaluating
                    # semantic similarity - confirm 10 is intended.
                    buffer_size=10,
                    breakpoint_percentile_threshold=95,
                    embed_model=self.embedder,
                ),
                # Embedding step: the same model also embeds the final nodes.
                self.embedder,
            ],
            vector_store=vector_store,
        )
        return pipeline

    def run(self):
        """Create the pipeline and return it with the client and embedder.

        Returns:
            A dict with the keys ``"connection"`` (the Pinecone client),
            ``"pipeline"`` (the ``IngestionPipeline``) and ``"embedder"``
            (the ``HuggingFaceEmbedding`` instance).
        """
        pipeline = self.create_pipeline()
        return {
            "connection": self.pc,
            "pipeline": pipeline,
            "embedder": self.embedder,
        }
|