tykiww committed on
Commit
5ac6c55
β€’
1 Parent(s): 61cefee

Delete utilities/transcript_embedder.py

Browse files
Files changed (1) hide show
  1. utilities/transcript_embedder.py +0 -139
utilities/transcript_embedder.py DELETED
@@ -1,139 +0,0 @@
1
- # each type of embeddings have a different dimensionset.
2
-
3
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
4
-
5
- from pinecone.grpc import PineconeGRPC
6
- from pinecone import ServerlessSpec
7
-
8
- from llama_index.vector_stores import PineconeVectorStore
9
- from llama_index.node_parser import SemanticSplitterNodeParser
10
- from llama_index.ingestion import IngestionPipeline
11
-
12
- import gc
13
- import re
14
-
15
-
16
class DocumentEmbedder:
    """
    Takes a document and embeds it directly into a Pinecone data store.

    Process retrieves, cleans, embeds, and sends the documents to the
    vector store.

    Currently supports HuggingFace embeddings only. Gotta keep things cheap.

    :param api_keys: Mapping with 'pinecone', 'openai', and 'huggingface' keys.
    :param files: Sequence of document objects exposing a mutable ``.text``
        attribute (presumably llama_index Documents — TODO confirm at caller).
    :param embedding: HuggingFace model name used to build the embedder.
    :param index_name: Name of the Pinecone index to create or reuse.
    :param interactive: Optional flag stored on the instance; defaults to
        False. (BUG FIX: the original referenced an undefined ``interactive``
        name, raising NameError on every construction.)
    """

    def __init__(self, api_keys, files, embedding, index_name, interactive=False):
        # api keys
        self.pinecone_api_key = api_keys['pinecone']
        self.openai_api_key = api_keys['openai']
        self.huggingface_api_key = api_keys['huggingface']
        # pinecone
        self.embedding = embedding
        self.vector_db = index_name
        # basic items
        self.files = files
        # BUG FIX: `interactive` was read without being a parameter
        # (NameError). It is now an explicit keyword argument with a
        # backward-compatible default so existing call sites still work.
        self.interactive = interactive

    def clean_text(self, content: str) -> str:
        """
        Remove unwanted characters and patterns in text input.

        :param content: Text input.
        :return: Cleaned version of original text input.
        """
        # Fix hyphenated words broken by newline
        content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)

        # Remove specific unwanted patterns and characters.
        # NOTE(review): these literals are passed to re.sub as patterns, so
        # "\\n" matches a newline; the dash runs are kept byte-for-byte.
        unwanted_patterns = [
            "\\n", " β€”", "β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”", "β€”β€”β€”β€”β€”β€”β€”β€”β€”", "β€”β€”β€”β€”β€”",
            r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7'
        ]
        for pattern in unwanted_patterns:
            content = re.sub(pattern, "", content)

        # Fix improperly spaced hyphenated words and normalize whitespace
        content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
        content = re.sub(r'\s+', ' ', content)

        return content

    def create_embedder(self):
        """
        Build the HuggingFace embedding model and report its dimensionality.

        :return: Tuple of (embedding model, embedding dimension).
        """
        embedding = HuggingFaceEmbedding(model_name=self.embedding)
        # BUG FIX: the original returned metadata['dimensions'] with
        # `metadata` never defined (NameError). Each model has a different
        # dimension set, so probe the model once to discover it.
        dimensions = len(embedding.get_text_embedding("dimension probe"))
        return embedding, dimensions

    def pinecone_pipeline(self, embedding, dimensions):
        """
        Initialize the Pinecone connection, index, and ingestion pipeline.

        :param embedding: Embedding model used for both chunking and vectors.
        :param dimensions: Vector dimensionality for index creation.
        :return: Configured llama_index IngestionPipeline bound to Pinecone.
        """
        # connect
        pc = PineconeGRPC(api_key=self.pinecone_api_key)

        # Create the index only if no existing index name contains ours
        indexes = [i.name for i in pc.list_indexes()]
        index_exists = any(self.vector_db in i for i in indexes)

        if index_exists:
            print("Index already exists")
        else:
            print("Creating index")
            pc.create_index(
                self.vector_db,
                dimension=dimensions,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
            )

        # Initialize your index
        pinecone_index = pc.Index(self.vector_db)

        # Initialize VectorStore
        vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

        # create pipeline (abstracts away the need to adaptively process and batch)
        pipeline = IngestionPipeline(
            transformations=[
                # creating appropriate chunks and cutoffs (this needs to be worked on).
                SemanticSplitterNodeParser(
                    buffer_size=10,  # 1 = each sentence is a node
                    breakpoint_percentile_threshold=95,
                    embed_model=embedding,
                ),
                embedding,
            ],
            vector_store=vector_store
        )

        return pipeline

    def embed(self):
        """String the process above together to embed and upsert directly to Pinecone."""
        # read_file
        print("reading files")
        results = self.files

        # Call clean function
        print("cleaning files")
        for d in range(len(results)):
            results[d].text = self.clean_text(results[d].text)

        # set up embedder
        print("retrieving embedder")
        embedder, dimensions = self.create_embedder()

        # set up pinecone pipeline
        print("initializing pinecone db")
        pipeline = self.pinecone_pipeline(embedder, dimensions)

        # run pinecone in batches (of 1) for memory preservation.
        print("reading into pinecone db")
        batchsize = 1
        for i in range(0, len(results), batchsize):
            gc.collect()  # keep peak memory low between batches
            pipeline.run(documents=results[i:i + batchsize])
            # BUG FIX: use integer division so the counter prints 1, 2, ...
            # instead of 1.0, 2.0, ...
            print("completed batch %s" % ((i + batchsize) // batchsize))