nkcong206 committed
Commit 33afc2e
1 Parent(s): 1d2cbea
Files changed (2):
  1. Raptor.py +173 -0
  2. app.py +8 -193
Raptor.py ADDED
@@ -0,0 +1,173 @@
+
+ from sklearn.mixture import GaussianMixture
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.output_parsers import StrOutputParser
+
+ from typing import Dict, List, Optional, Tuple
+
+ import numpy as np
+ import pandas as pd
+ import umap
+
+
+ def global_cluster_embeddings(
+     embeddings: np.ndarray,
+     dim: int,
+     n_neighbors: Optional[int] = None,
+     metric: str = "cosine",
+ ) -> np.ndarray:
+     # Default the neighborhood size to roughly sqrt(n) when not given
+     if n_neighbors is None:
+         n_neighbors = int((len(embeddings) - 1) ** 0.5)
+     return umap.UMAP(
+         n_neighbors=n_neighbors, n_components=dim, metric=metric
+     ).fit_transform(embeddings)
+
+
+ def local_cluster_embeddings(
+     embeddings: np.ndarray, dim: int, num_neighbors: int = 10, metric: str = "cosine"
+ ) -> np.ndarray:
+     return umap.UMAP(
+         n_neighbors=num_neighbors, n_components=dim, metric=metric
+     ).fit_transform(embeddings)
+
+
+ def get_optimal_clusters(
+     embeddings: np.ndarray, max_clusters: int = 50, random_state: int = 200
+ ) -> int:
+     # Pick the cluster count that minimizes the Bayesian Information Criterion
+     max_clusters = min(max_clusters, len(embeddings))
+     n_clusters = np.arange(1, max_clusters)
+     bics = []
+     for n in n_clusters:
+         gm = GaussianMixture(n_components=n, random_state=random_state)
+         gm.fit(embeddings)
+         bics.append(gm.bic(embeddings))
+     return n_clusters[np.argmin(bics)]
+
+
+ def GMM_cluster(embeddings: np.ndarray, threshold: float, random_state: int = 0):
+     n_clusters = get_optimal_clusters(embeddings, random_state=200)
+     gm = GaussianMixture(n_components=n_clusters, random_state=random_state)
+     gm.fit(embeddings)
+     probs = gm.predict_proba(embeddings)
+     # Soft assignment: a point joins every cluster whose probability exceeds the threshold
+     labels = [np.where(prob > threshold)[0] for prob in probs]
+     return labels, n_clusters
+
+
+ def perform_clustering(
+     embeddings: np.ndarray,
+     dim: int,
+     threshold: float,
+ ) -> List[np.ndarray]:
+     if len(embeddings) <= dim + 1:
+         # Avoid clustering when there's insufficient data
+         return [np.array([0]) for _ in range(len(embeddings))]
+
+     # Global dimensionality reduction and clustering
+     reduced_embeddings_global = global_cluster_embeddings(embeddings, dim)
+     global_clusters, n_global_clusters = GMM_cluster(
+         reduced_embeddings_global, threshold
+     )
+
+     all_local_clusters = [np.array([]) for _ in range(len(embeddings))]
+     total_clusters = 0
+
+     # Iterate through each global cluster to perform local clustering
+     for i in range(n_global_clusters):
+         # Extract embeddings belonging to the current global cluster
+         global_cluster_embeddings_ = embeddings[
+             np.array([i in gc for gc in global_clusters])
+         ]
+         if len(global_cluster_embeddings_) == 0:
+             continue
+         if len(global_cluster_embeddings_) <= dim + 1:
+             # Handle small clusters with direct assignment
+             local_clusters = [np.array([0]) for _ in global_cluster_embeddings_]
+             n_local_clusters = 1
+         else:
+             # Local dimensionality reduction and clustering
+             reduced_embeddings_local = local_cluster_embeddings(
+                 global_cluster_embeddings_, dim
+             )
+             local_clusters, n_local_clusters = GMM_cluster(
+                 reduced_embeddings_local, threshold
+             )
+         # Assign local cluster IDs, offset by the clusters already processed
+         for j in range(n_local_clusters):
+             local_cluster_embeddings_ = global_cluster_embeddings_[
+                 np.array([j in lc for lc in local_clusters])
+             ]
+             indices = np.where(
+                 (embeddings == local_cluster_embeddings_[:, None]).all(-1)
+             )[1]
+             for idx in indices:
+                 all_local_clusters[idx] = np.append(
+                     all_local_clusters[idx], j + total_clusters
+                 )
+
+         total_clusters += n_local_clusters
+
+     return all_local_clusters
+
+
+ def embed(embd, texts):
+     text_embeddings = embd.embed_documents(texts)
+     text_embeddings_np = np.array(text_embeddings)
+     return text_embeddings_np
+
+
+ def embed_cluster_texts(embd, texts):
+     text_embeddings_np = embed(embd, texts)  # Generate embeddings
+     cluster_labels = perform_clustering(
+         text_embeddings_np, 10, 0.1
+     )  # Perform clustering on the embeddings
+     df = pd.DataFrame()  # Initialize a DataFrame to store the results
+     df["text"] = texts  # Store original texts
+     df["embd"] = list(text_embeddings_np)  # Store embeddings as a list in the DataFrame
+     df["cluster"] = cluster_labels  # Store cluster labels
+     return df
+
+
+ def fmt_txt(df: pd.DataFrame) -> str:
+     unique_txt = df["text"].tolist()
+     return "--- --- \n --- --- ".join(unique_txt)
+
+
+ def embed_cluster_summarize_texts(
+     model, embd, texts: List[str], level: int
+ ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+     df_clusters = embed_cluster_texts(embd, texts)
+     # Expand DataFrame entries to document-cluster pairings
+     expanded_list = []
+     for index, row in df_clusters.iterrows():
+         for cluster in row["cluster"]:
+             expanded_list.append(
+                 {"text": row["text"], "embd": row["embd"], "cluster": cluster}
+             )
+     expanded_df = pd.DataFrame(expanded_list)
+     all_clusters = expanded_df["cluster"].unique()
+     # Summarization prompt (Vietnamese): "You are a chatbot supporting university
+     # admissions and students; summarize the regulation document below in detail.
+     # Make sure the summary helps users clearly understand the regulations and
+     # procedures related to admissions or training at the university."
+     template = """Bạn là một chatbot hỗ trợ tuyển sinh và sinh viên đại học, hãy tóm tắt chi tiết tài liệu quy chế dưới đây.
+     Đảm bảo rằng nội dung tóm tắt giúp người dùng hiểu rõ các quy định và quy trình liên quan đến tuyển sinh hoặc đào tạo tại đại học.
+     Tài liệu:
+     {context}
+     """
+     prompt = ChatPromptTemplate.from_template(template)
+     chain = prompt | model | StrOutputParser()
+
+     summaries = []
+     for i in all_clusters:
+         df_cluster = expanded_df[expanded_df["cluster"] == i]
+         formatted_txt = fmt_txt(df_cluster)
+         summaries.append(chain.invoke({"context": formatted_txt}))
+     df_summary = pd.DataFrame(
+         {
+             "summaries": summaries,
+             "level": [level] * len(summaries),
+             "cluster": list(all_clusters),
+         }
+     )
+     return df_clusters, df_summary
+
+
+ def recursive_embed_cluster_summarize(
+     model, embd, texts: List[str], level: int = 1, n_levels: int = 3
+ ) -> Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]:
+     results = {}
+     df_clusters, df_summary = embed_cluster_summarize_texts(model, embd, texts, level)
+     results[level] = (df_clusters, df_summary)
+     # Recurse on the summaries until the level cap or a single cluster remains
+     unique_clusters = df_summary["cluster"].nunique()
+     if level < n_levels and unique_clusters > 1:
+         new_texts = df_summary["summaries"].tolist()
+         next_level_results = recursive_embed_cluster_summarize(
+             model, embd, new_texts, level + 1, n_levels
+         )
+         results.update(next_level_results)
+
+     return results
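
A quick way to sanity-check the extracted module is to run its clustering stage on synthetic vectors. This is a minimal sketch, assuming umap-learn and scikit-learn are installed; dim=10 and threshold=0.1 mirror the values hard-coded in embed_cluster_texts, and the random data is only a stand-in for real chunk embeddings:

import numpy as np
import Raptor

rng = np.random.default_rng(0)
fake_embeddings = rng.normal(size=(100, 384))  # e.g. 100 chunks, 384-dim vectors

# Each entry of `labels` is an array of cluster ids assigned to that chunk;
# the GMM assignment is soft, so a chunk can land in more than one cluster.
labels = Raptor.perform_clustering(fake_embeddings, dim=10, threshold=0.1)
print(labels[:5])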
app.py CHANGED
@@ -6,202 +6,11 @@ from langchain_community.document_loaders import TextLoader
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain.prompts import PromptTemplate
 
- from typing import Dict, List, Optional, Tuple
-
- import numpy as np
- import pandas as pd
- import umap
 from langchain_core.output_parsers import StrOutputParser
- from sklearn.mixture import GaussianMixture
 
 from langchain_core.runnables import RunnablePassthrough
 from langchain_chroma import Chroma
-
-
-
-
- def global_cluster_embeddings(
-     embeddings: np.ndarray,
-     dim: int,
-     n_neighbors: Optional[int] = None,
-     metric: str = "cosine",
- ) -> np.ndarray:
-     if n_neighbors is None:
-         n_neighbors = int((len(embeddings) - 1) ** 0.5)
-     return umap.UMAP(
-         n_neighbors=n_neighbors, n_components=dim, metric=metric
-     ).fit_transform(embeddings)
-
-
- def local_cluster_embeddings(
-     embeddings: np.ndarray, dim: int, num_neighbors: int = 10, metric: str = "cosine"
- ) -> np.ndarray:
-     return umap.UMAP(
-         n_neighbors=num_neighbors, n_components=dim, metric=metric
-     ).fit_transform(embeddings)
-
-
- def get_optimal_clusters(
-     embeddings: np.ndarray, max_clusters: int = 50, random_state: int = 200
- ) -> int:
-     max_clusters = min(max_clusters, len(embeddings))
-     n_clusters = np.arange(1, max_clusters)
-     bics = []
-     for n in n_clusters:
-         gm = GaussianMixture(n_components=n, random_state=random_state)
-         gm.fit(embeddings)
-         bics.append(gm.bic(embeddings))
-     return n_clusters[np.argmin(bics)]
-
-
- def GMM_cluster(embeddings: np.ndarray, threshold: float, random_state: int = 0):
-     n_clusters = get_optimal_clusters(embeddings, random_state=200)
-     gm = GaussianMixture(n_components=n_clusters, random_state=random_state)
-     gm.fit(embeddings)
-     probs = gm.predict_proba(embeddings)
-     labels = [np.where(prob > threshold)[0] for prob in probs]
-     return labels, n_clusters
-
-
- def perform_clustering(
-     embeddings: np.ndarray,
-     dim: int,
-     threshold: float,
- ) -> List[np.ndarray]:
-     if len(embeddings) <= dim + 1:
-         # Avoid clustering when there's insufficient data
-         return [np.array([0]) for _ in range(len(embeddings))]
-
-     # Global dimensionality reduction
-     reduced_embeddings_global = global_cluster_embeddings(embeddings, dim)
-     # Global clustering
-     global_clusters, n_global_clusters = GMM_cluster(
-         reduced_embeddings_global, threshold
-     )
-
-     all_local_clusters = [np.array([]) for _ in range(len(embeddings))]
-     total_clusters = 0
-
-     # Iterate through each global cluster to perform local clustering
-     for i in range(n_global_clusters):
-         # Extract embeddings belonging to the current global cluster
-         global_cluster_embeddings_ = embeddings[
-             np.array([i in gc for gc in global_clusters])
-         ]
-
-         if len(global_cluster_embeddings_) == 0:
-             continue
-         if len(global_cluster_embeddings_) <= dim + 1:
-             # Handle small clusters with direct assignment
-             local_clusters = [np.array([0]) for _ in global_cluster_embeddings_]
-             n_local_clusters = 1
-         else:
-             # Local dimensionality reduction and clustering
-             reduced_embeddings_local = local_cluster_embeddings(
-                 global_cluster_embeddings_, dim
-             )
-             local_clusters, n_local_clusters = GMM_cluster(
-                 reduced_embeddings_local, threshold
-             )
-
-         # Assign local cluster IDs, adjusting for total clusters already processed
-         for j in range(n_local_clusters):
-             local_cluster_embeddings_ = global_cluster_embeddings_[
-                 np.array([j in lc for lc in local_clusters])
-             ]
-             indices = np.where(
-                 (embeddings == local_cluster_embeddings_[:, None]).all(-1)
-             )[1]
-             for idx in indices:
-                 all_local_clusters[idx] = np.append(
-                     all_local_clusters[idx], j + total_clusters
-                 )
-
-         total_clusters += n_local_clusters
-
-     return all_local_clusters
-
- def embed(embd, texts):
-     text_embeddings = embd.embed_documents(texts)
-     text_embeddings_np = np.array(text_embeddings)
-     return text_embeddings_np
-
- def embed_cluster_texts(embd, texts):
-     text_embeddings_np = embed(embd, texts)  # Generate embeddings
-     cluster_labels = perform_clustering(
-         text_embeddings_np, 10, 0.1
-     )  # Perform clustering on the embeddings
-     df = pd.DataFrame()  # Initialize a DataFrame to store the results
-     df["text"] = texts  # Store original texts
-     df["embd"] = list(text_embeddings_np)  # Store embeddings as a list in the DataFrame
-     df["cluster"] = cluster_labels  # Store cluster labels
-     return df
-
- def fmt_txt(df: pd.DataFrame) -> str:
-     unique_txt = df["text"].tolist()
-     return "--- --- \n --- --- ".join(unique_txt)
-
-
- def embed_cluster_summarize_texts(
-     model, embd, texts: List[str], level: int
- ) -> Tuple[pd.DataFrame, pd.DataFrame]:
-     df_clusters = embed_cluster_texts(embd, texts)
-
-     # Prepare to expand the DataFrame for easier manipulation of clusters
-     expanded_list = []
-
-     # Expand DataFrame entries to document-cluster pairings for straightforward processing
-     for index, row in df_clusters.iterrows():
-         for cluster in row["cluster"]:
-             expanded_list.append(
-                 {"text": row["text"], "embd": row["embd"], "cluster": cluster}
-             )
-
-     # Create a new DataFrame from the expanded list
-     expanded_df = pd.DataFrame(expanded_list)
-
-     # Retrieve unique cluster identifiers for processing
-     all_clusters = expanded_df["cluster"].unique()
-     # Summarization
-     template = """Bạn là một chatbot hỗ trợ tuyển sinh và sinh viên đại học, hãy tóm tắt chi tiết tài liệu quy chế dưới đây.
-     Đảm bảo rằng nội dung tóm tắt giúp người dùng hiểu rõ các quy định và quy trình liên quan đến tuyển sinh hoặc đào tạo tại đại học.
-     Tài liệu:
-     {context}
-     """
-     prompt = ChatPromptTemplate.from_template(template)
-     chain = prompt | model | StrOutputParser()
-
-     summaries = []
-     for i in all_clusters:
-         df_cluster = expanded_df[expanded_df["cluster"] == i]
-         formatted_txt = fmt_txt(df_cluster)
-         summaries.append(chain.invoke({"context": formatted_txt}))
-     df_summary = pd.DataFrame(
-         {
-             "summaries": summaries,
-             "level": [level] * len(summaries),
-             "cluster": list(all_clusters),
-         }
-     )
-     return df_clusters, df_summary
-
- def recursive_embed_cluster_summarize(
-     model, embd, texts: List[str], level: int = 1, n_levels: int = 3
- ) -> Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]:
-     results = {}
-     df_clusters, df_summary = embed_cluster_summarize_texts(model, embd, texts, level)
-
-     results[level] = (df_clusters, df_summary)
-
-     unique_clusters = df_summary["cluster"].nunique()
-     if level < n_levels and unique_clusters > 1:
-         new_texts = df_summary["summaries"].tolist()
-         next_level_results = recursive_embed_cluster_summarize(
-             model, embd, new_texts, level + 1, n_levels
-         )
-         results.update(next_level_results)
-
-     return results
+ import Raptor
 
 page = st.title("Chat with AskUSTH")
 
@@ -313,11 +122,17 @@ def format_docs(docs):
 
 @st.cache_resource
 def compute_rag_chain(_model, _embd, docs_texts):
-     results = recursive_embed_cluster_summarize(_model, _embd, docs_texts, level=1, n_levels=3)
+     results = Raptor.recursive_embed_cluster_summarize(_model, _embd, docs_texts, level=1, n_levels=3)
      all_texts = docs_texts.copy()
+     i = 0
      for level in sorted(results.keys()):
          summaries = results[level][1]["summaries"].tolist()
          all_texts.extend(summaries)
+         print(f"summary {i} -------------------------------------------------")
+         print(summaries)
+         i += 1
+     print("all_texts ______________________________________")
+     print(all_texts)
      vectorstore = Chroma.from_texts(texts=all_texts, embedding=_embd)
      retriever = vectorstore.as_retriever()
      template = """
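
The diff cuts off inside compute_rag_chain's prompt template, so for orientation only, here is a minimal sketch of how such a chain is typically finished with LCEL; the template text and the build_rag_chain helper are assumptions, not the committed code (format_docs is the app's own helper named in the hunk header above):

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

def build_rag_chain(model, retriever, format_docs):
    # Hypothetical prompt; the real template in app.py is truncated in this diff.
    template = """Answer the question using only the context below.
Context: {context}
Question: {question}
"""
    prompt = ChatPromptTemplate.from_template(template)
    # RAPTOR "collapsed tree" retrieval: leaf chunks and every level of summary
    # sit in one vectorstore, so a single retriever query searches all levels.
    return (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | model
        | StrOutputParser()
    )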