Spaces: lingyit1108 (Running)

lingyit1108 committed
Commit b580d80 • 1 Parent(s): ac8a60b
to create RAGAs result with triad of metrics
Files changed:
- .gitignore +2 -2
- archive/{requirements.txt → dependencies/requirements_backup.txt} +0 -0
- archive/dependencies/requirements_llama-index==0.9.24.txt +259 -0
- archive/{test.py → experiments/test.py} +0 -0
- archive/{init_setup.py → model_evaluation/init_setup.py} +0 -0
- archive/{main.py → model_evaluation/main.py} +0 -0
- archive/model_evaluation/main_new.py +180 -0
- archive/model_evaluation/utils.py +160 -0
- archive/model_evaluation/utils_new.py +95 -0
- database/mock_qna_source.csv +2 -2
- evaluate_model.py +83 -0
- models/trulens_eval.sqlite +3 -0
- notebooks/002_persisted-embedding-model.ipynb +1 -0
- pages/1_Leaderboard.py +1 -1
- pages/2_Evaluations.py +2 -2
- pages/3_app.py +4 -6
- qna_prompting.py +11 -6
- raw_documents/eval_answers.txt +2 -2
- raw_documents/eval_questions.txt +2 -2
- raw_documents/qna.txt +2 -2
- requirements.txt +5 -4
- streamlit_app.py +1 -1
- utils.py +3 -72
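Taken together, the commit archives the earlier model-evaluation scripts under archive/, adds a standalone evaluate_model.py that records every evaluation question through a TruLens-instrumented query engine, commits the resulting models/trulens_eval.sqlite as a Git LFS object, points the Leaderboard and Evaluations pages at that database, and updates the pinned requirements accordingly.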
.gitignore CHANGED
@@ -3,10 +3,10 @@
 .streamlit/
 results/
 
-*.sqlite
 data/
 
 notebooks/test_model
 screenshot_questions/
 
-# ux/
+# ux/
+# *.sqlite
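Un-ignoring *.sqlite (the pattern is now commented out) is what allows models/trulens_eval.sqlite, added later in this commit as a Git LFS pointer, to be committed at all.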
archive/{requirements.txt → dependencies/requirements_backup.txt} RENAMED
File without changes
archive/dependencies/requirements_llama-index==0.9.24.txt ADDED
@@ -0,0 +1,259 @@
+aiohttp==3.9.1
+aiosignal==1.3.1
+alembic==1.13.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.2.0
+appnope==0.1.3
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asgiref==3.7.2
+asttokens==2.4.1
+async-lru==2.0.4
+async-timeout==4.0.3
+attrs==23.2.0
+Babel==2.14.0
+backoff==2.2.1
+bcrypt==4.1.2
+beautifulsoup4==4.12.3
+bleach==6.1.0
+blinker==1.7.0
+bs4==0.0.2
+build==1.0.3
+cachetools==5.3.2
+certifi==2023.11.17
+cffi==1.16.0
+charset-normalizer==3.3.2
+chroma-hnswlib==0.7.3
+chromadb==0.4.22
+click==8.1.7
+coloredlogs==15.0.1
+comm==0.2.0
+contourpy==1.2.0
+cycler==0.12.1
+dataclasses-json==0.6.3
+debugpy==1.8.0
+decorator==5.1.1
+defusedxml==0.7.1
+Deprecated==1.2.14
+dill==0.3.7
+dirtyjson==1.0.8
+distro==1.9.0
+entrypoints==0.4
+exceptiongroup==1.2.0
+executing==2.0.1
+Faker==22.0.0
+fastapi==0.109.0
+fastjsonschema==2.19.1
+favicon==0.7.0
+filelock==3.13.1
+flatbuffers==23.5.26
+fonttools==4.47.0
+fqdn==1.5.1
+frozendict==2.4.0
+frozenlist==1.4.1
+fsspec==2023.12.2
+gitdb==4.0.11
+GitPython==3.1.40
+google-auth==2.27.0
+googleapis-common-protos==1.62.0
+greenlet==3.0.3
+grpcio==1.60.0
+h11==0.14.0
+htbuilder==0.6.2
+httpcore==1.0.2
+httptools==0.6.1
+httpx==0.26.0
+huggingface-hub==0.20.1
+humanfriendly==10.0
+humanize==4.9.0
+idna==3.6
+importlib-metadata==6.11.0
+importlib-resources==6.1.1
+ipykernel==6.28.0
+ipython==8.18.1
+ipywidgets==8.1.1
+isoduration==20.11.0
+jedi==0.19.1
+Jinja2==3.1.2
+joblib==1.3.2
+json5==0.9.14
+jsonpatch==1.33
+jsonpointer==2.4
+jsonschema==4.20.0
+jsonschema-specifications==2023.12.1
+jupyter==1.0.0
+jupyter-console==6.6.3
+jupyter-events==0.9.0
+jupyter-lsp==2.2.1
+jupyter_client==8.6.0
+jupyter_core==5.6.1
+jupyter_server==2.12.1
+jupyter_server_terminals==0.5.1
+jupyterlab==4.0.10
+jupyterlab-widgets==3.0.9
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.25.2
+kiwisolver==1.4.5
+kubernetes==29.0.0
+langchain==0.0.354
+langchain-community==0.0.8
+langchain-core==0.1.23
+langsmith==0.0.87
+llama-index==0.9.24
+lxml==5.1.0
+Mako==1.3.0
+Markdown==3.5.1
+markdown-it-py==3.0.0
+markdownlit==0.0.7
+MarkupSafe==2.1.3
+marshmallow==3.20.1
+matplotlib==3.8.2
+matplotlib-inline==0.1.6
+mdurl==0.1.2
+merkle-json==1.0.0
+millify==0.1.1
+mistune==3.0.2
+mmh3==4.1.0
+monotonic==1.6
+more-itertools==10.1.0
+mpmath==1.3.0
+multidict==6.0.4
+munch==4.0.0
+mypy-extensions==1.0.0
+nbclient==0.9.0
+nbconvert==7.14.0
+nbformat==5.9.2
+nest-asyncio==1.5.8
+networkx==3.2.1
+nltk==3.8.1
+notebook==7.0.6
+notebook_shim==0.2.3
+numpy==1.26.2
+oauthlib==3.2.2
+onnxruntime==1.17.0
+openai==1.6.1
+opentelemetry-api==1.22.0
+opentelemetry-exporter-otlp-proto-common==1.22.0
+opentelemetry-exporter-otlp-proto-grpc==1.22.0
+opentelemetry-instrumentation==0.43b0
+opentelemetry-instrumentation-asgi==0.43b0
+opentelemetry-instrumentation-fastapi==0.43b0
+opentelemetry-proto==1.22.0
+opentelemetry-sdk==1.22.0
+opentelemetry-semantic-conventions==0.43b0
+opentelemetry-util-http==0.43b0
+overrides==7.4.0
+packaging==23.2
+pandas==2.1.4
+pandocfilters==1.5.0
+parso==0.8.3
+pexpect==4.9.0
+pillow==10.2.0
+platformdirs==4.1.0
+posthog==3.3.3
+prometheus-client==0.19.0
+prompt-toolkit==3.0.43
+protobuf==4.25.1
+psutil==5.9.7
+ptyprocess==0.7.0
+pulsar-client==3.4.0
+pure-eval==0.2.2
+pyarrow==14.0.2
+pyasn1==0.5.1
+pyasn1-modules==0.3.0
+pycparser==2.21
+pydantic==2.5.3
+pydantic_core==2.14.6
+pydeck==0.8.1b0
+Pygments==2.17.2
+pymdown-extensions==10.7
+PyMuPDF==1.23.22
+PyMuPDFb==1.23.22
+pyparsing==3.1.1
+pypdf==4.0.1
+PyPika==0.48.9
+pyproject_hooks==1.0.0
+python-dateutil==2.8.2
+python-decouple==3.8
+python-dotenv==1.0.0
+python-json-logger==2.0.7
+pytz==2023.3.post1
+PyYAML==6.0.1
+pyzmq==25.1.2
+qtconsole==5.5.1
+QtPy==2.4.1
+referencing==0.32.0
+regex==2023.12.25
+requests==2.31.0
+requests-oauthlib==1.3.1
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.7.0
+rpds-py==0.16.2
+rsa==4.9
+safetensors==0.4.1
+scikit-learn==1.4.0
+scipy==1.12.0
+Send2Trash==1.8.2
+sentence-transformers==2.3.0
+sentencepiece==0.1.99
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.0
+soupsieve==2.5
+SQLAlchemy==2.0.24
+st-annotated-text==4.0.1
+stack-data==0.6.3
+starlette==0.35.1
+streamlit==1.31.1
+streamlit-aggrid==0.3.4.post3
+streamlit-camera-input-live==0.2.0
+streamlit-card==1.0.0
+streamlit-embedcode==0.1.2
+streamlit-extras==0.3.6
+streamlit-faker==0.0.3
+streamlit-feedback==0.1.3
+streamlit-image-coordinates==0.1.6
+streamlit-keyup==0.2.2
+streamlit-toggle-switch==1.0.2
+streamlit-vertical-slider==2.5.5
+sympy==1.12
+tenacity==8.2.3
+terminado==0.18.0
+threadpoolctl==3.2.0
+tiktoken==0.5.2
+tinycss2==1.2.1
+tokenizers==0.15.2
+toml==0.10.2
+tomli==2.0.1
+toolz==0.12.0
+torch==2.1.2
+tornado==6.4
+tqdm==4.66.1
+traitlets==5.14.0
+transformers==4.37.2
+trulens==0.13.4
+trulens-eval==0.20.0
+typer==0.9.0
+types-python-dateutil==2.8.19.14
+typing-inspect==0.9.0
+typing_extensions==4.9.0
+tzdata==2023.4
+tzlocal==5.2
+uri-template==1.3.0
+urllib3==2.1.0
+uvicorn==0.27.0
+uvloop==0.19.0
+validators==0.22.0
+watchfiles==0.21.0
+wcwidth==0.2.12
+webcolors==1.13
+webencodings==0.5.1
+websocket-client==1.7.0
+websockets==12.0
+widgetsnbextension==4.0.9
+wrapt==1.16.0
+yarl==1.9.4
+zipp==3.17.0
archive/{test.py → experiments/test.py} RENAMED
File without changes

archive/{init_setup.py → model_evaluation/init_setup.py} RENAMED
File without changes

archive/{main.py → model_evaluation/main.py} RENAMED
File without changes
archive/model_evaluation/main_new.py ADDED
@@ -0,0 +1,180 @@
+import utils
+import os
+
+import numpy as np
+import nest_asyncio
+import openai
+import chromadb
+
+from llama_index.legacy import (
+    VectorStoreIndex,
+    SimpleDirectoryReader
+)
+from llama_index.core import (
+    StorageContext,
+    Document,
+    Settings
+)
+from llama_index.vector_stores.chroma.base import ChromaVectorStore
+from llama_index.llms.openai import OpenAI
+from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding
+from trulens_eval import Tru
+
+from utils import get_prebuilt_trulens_recorder
+import time
+
+nest_asyncio.apply()
+openai.api_key = utils.get_openai_api_key()
+
+def main():
+
+    if not os.path.exists("./default.sqlite"):
+
+        start_time = time.time()
+
+        llm = OpenAI(model="gpt-3.5-turbo-1106", temperature=0.0)
+        fine_tuned_path = "local:./models/fine-tuned-embeddings"
+
+        Settings.llm = llm
+        Settings.embed_model = fine_tuned_path
+
+        db = chromadb.PersistentClient(path="./models/chroma_db")
+        chroma_collection = db.get_or_create_collection("quickstart")
+
+        # assign chroma as the vector_store to the context
+        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
+        storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+        # create your index
+        index = VectorStoreIndex.from_vector_store(
+            vector_store=vector_store,
+            storage_context=storage_context
+        )
+        query_engine = index.as_query_engine()
+
+        separator = "\n\n"
+        eval_questions = []
+        with open('./raw_documents/eval_questions.txt', 'r') as file:
+            content = file.read()
+
+        for question in content.split(separator):
+            print(question)
+            print(separator)
+            eval_questions.append(question.strip())
+
+        response = query_engine.query(eval_questions[0])
+        print(str(response))
+
+        tru = Tru(database_file="./models/trulens_eval.sqlite")
+        tru_recorder = get_prebuilt_trulens_recorder(query_engine,
+                                                     app_id="Direct Query Engine")
+
+        print("Sending each question to llm ..")
+        with tru_recorder as recording:
+            for question in eval_questions:
+                response = query_engine.query(question)
+
+        records, feedback = tru.get_records_and_feedback(app_ids=[])
+
+        os.makedirs("./results", exist_ok=True)
+        records.to_csv("./results/records.csv", index=False)
+
+        print(tru.db.engine.url.render_as_string(hide_password=False))
+
+        end_time = time.time()
+        time_spent_mins = (end_time - start_time) / 60
+        with open("./results/time_cost.txt", "w") as fp:
+            fp.write(f"Takes {int(time_spent_mins)} mins to create llm evaluation.")
+
+if __name__ == "__main__":
+
+    # main()
+    if False:
+        start_time = time.time()
+
+        llm = OpenAI(model="gpt-3.5-turbo-1106", temperature=0.0)
+        fine_tuned_path = "local:./models/fine-tuned-embeddings"
+
+        Settings.llm = llm
+        Settings.embed_model = fine_tuned_path
+
+        db = chromadb.PersistentClient(path="./models/chroma_db")
+        chroma_collection = db.get_or_create_collection("quickstart")
+
+        # assign chroma as the vector_store to the context
+        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
+        storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+        # create your index
+        index = VectorStoreIndex.from_vector_store(
+            vector_store=vector_store,
+            storage_context=storage_context
+        )
+        query_engine = index.as_query_engine()
+
+        separator = "\n\n"
+        eval_questions = []
+        with open('./raw_documents/eval_questions.txt', 'r') as file:
+            content = file.read()
+
+        for question in content.split(separator):
+            print(question)
+            print(separator)
+            eval_questions.append(question.strip())
+
+        response = query_engine.query(eval_questions[0])
+        print(str(response))
+
+
+
+    from trulens_eval import Tru
+    tru = Tru()
+
+    documents = SimpleDirectoryReader(
+        input_files=["./raw_documents/qna.txt"]
+    ).load_data()
+    index = VectorStoreIndex.from_documents(documents)
+
+    query_engine = index.as_query_engine()
+    response = query_engine.query("Which is not a government healthcare philosophy?")
+    print(response)
+
+    from trulens_eval.feedback.provider.openai import OpenAI
+    openai = OpenAI()
+
+    # select context to be used in feedback. the location of context is app specific.
+    from trulens_eval.app import App
+    context = App.select_context(query_engine)
+
+    from trulens_eval import Feedback
+
+    # Define a groundedness feedback function
+    from trulens_eval.feedback import Groundedness
+    grounded = Groundedness(groundedness_provider=OpenAI())
+    f_groundedness = (
+        Feedback(grounded.groundedness_measure_with_cot_reasons)
+        .on(context.collect())  # collect context chunks into a list
+        .on_output()
+        .aggregate(grounded.grounded_statements_aggregator)
+    )
+
+    # Question/answer relevance between overall question and answer.
+    f_qa_relevance = Feedback(openai.relevance).on_input_output()
+
+    # Question/statement relevance between question and each context chunk.
+    f_qs_relevance = (
+        Feedback(openai.qs_relevance)
+        .on_input()
+        .on(context)
+        .aggregate(np.mean)
+    )
+
+    from trulens_eval import TruLlama
+    tru_query_engine_recorder = TruLlama(query_engine,
+                                         app_id='LlamaIndex_App1',
+                                         feedbacks=[f_groundedness, f_qa_relevance, f_qs_relevance])
+
+    if False:
+        # or as context manager
+        with tru_query_engine_recorder as recording:
+            query_engine.query("Which of the following is TRUE on the similarity of Means Testing and Casemix?")
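The feedback functions wired up at the end of this file are the "triad of metrics" named in the commit message, i.e. TruLens's RAG triad: groundedness of the answer against the retrieved context (f_groundedness), answer relevance to the question (f_qa_relevance), and context relevance of each retrieved chunk to the question (f_qs_relevance).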
archive/model_evaluation/utils.py ADDED
@@ -0,0 +1,160 @@
+import os
+import numpy as np
+from trulens_eval import (
+    Feedback,
+    TruLlama,
+    OpenAI
+)
+
+from trulens_eval.feedback import Groundedness
+import nest_asyncio
+
+from llama_index.core import ServiceContext, VectorStoreIndex, StorageContext
+from llama_index.core import load_index_from_storage
+from llama_index.core.node_parser import HierarchicalNodeParser
+from llama_index.core.node_parser import get_leaf_nodes
+
+from llama_index.packs.auto_merging_retriever.base import AutoMergingRetrieverPack
+
+
+from llama_index.node_parser import SentenceWindowNodeParser
+from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
+from llama_index.indices.postprocessor import SentenceTransformerRerank
+from llama_index.query_engine import RetrieverQueryEngine
+
+
+nest_asyncio.apply()
+openai = OpenAI()
+
+qa_relevance = (
+    Feedback(openai.relevance_with_cot_reasons, name="Answer Relevance")
+    .on_input_output()
+)
+
+qs_relevance = (
+    Feedback(openai.relevance_with_cot_reasons, name = "Context Relevance")
+    .on_input()
+    .on(TruLlama.select_source_nodes().node.text)
+    .aggregate(np.mean)
+)
+
+#grounded = Groundedness(groundedness_provider=openai, summarize_provider=openai)
+grounded = Groundedness(groundedness_provider=openai)
+
+groundedness = (
+    Feedback(grounded.groundedness_measure_with_cot_reasons, name="Groundedness")
+    .on(TruLlama.select_source_nodes().node.text)
+    .on_output()
+    .aggregate(grounded.grounded_statements_aggregator)
+)
+
+feedbacks = [qa_relevance, qs_relevance, groundedness]
+
+def get_openai_api_key():
+    return os.getenv("OPENAI_API_KEY")
+
+def get_trulens_recorder(query_engine, feedbacks, app_id):
+    tru_recorder = TruLlama(
+        query_engine,
+        app_id=app_id,
+        feedbacks=feedbacks
+    )
+    return tru_recorder
+
+def get_prebuilt_trulens_recorder(query_engine, app_id):
+    tru_recorder = TruLlama(
+        query_engine,
+        app_id=app_id,
+        feedbacks=feedbacks
+    )
+    return tru_recorder
+
+def build_sentence_window_index(
+    document, llm, embed_model="local:BAAI/bge-small-en-v1.5", save_dir="sentence_index"
+):
+    # create the sentence window node parser w/ default settings
+    node_parser = SentenceWindowNodeParser.from_defaults(
+        window_size=3,
+        window_metadata_key="window",
+        original_text_metadata_key="original_text",
+    )
+    sentence_context = ServiceContext.from_defaults(
+        llm=llm,
+        embed_model=embed_model,
+        node_parser=node_parser,
+    )
+    if not os.path.exists(save_dir):
+        sentence_index = VectorStoreIndex.from_documents(
+            [document], service_context=sentence_context
+        )
+        sentence_index.storage_context.persist(persist_dir=save_dir)
+    else:
+        sentence_index = load_index_from_storage(
+            StorageContext.from_defaults(persist_dir=save_dir),
+            service_context=sentence_context,
+        )
+
+    return sentence_index
+
+def get_sentence_window_query_engine(
+    sentence_index,
+    similarity_top_k=6,
+    rerank_top_n=2,
+):
+    # define postprocessors
+    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
+    rerank = SentenceTransformerRerank(
+        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
+    )
+
+    sentence_window_engine = sentence_index.as_query_engine(
+        similarity_top_k=similarity_top_k, node_postprocessors=[postproc, rerank]
+    )
+    return sentence_window_engine
+
+def build_automerging_index(
+    documents,
+    llm,
+    embed_model="local:BAAI/bge-small-en-v1.5",
+    save_dir="merging_index",
+    chunk_sizes=None,
+):
+    chunk_sizes = chunk_sizes or [2048, 512, 128]
+    node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)
+    nodes = node_parser.get_nodes_from_documents(documents)
+    leaf_nodes = get_leaf_nodes(nodes)
+    merging_context = ServiceContext.from_defaults(
+        llm=llm,
+        embed_model=embed_model,
+    )
+    storage_context = StorageContext.from_defaults()
+    storage_context.docstore.add_documents(nodes)
+
+    if not os.path.exists(save_dir):
+        automerging_index = VectorStoreIndex(
+            leaf_nodes, storage_context=storage_context, service_context=merging_context
+        )
+        automerging_index.storage_context.persist(persist_dir=save_dir)
+    else:
+        automerging_index = load_index_from_storage(
+            StorageContext.from_defaults(persist_dir=save_dir),
+            service_context=merging_context,
+        )
+    return automerging_index
+
+def get_automerging_query_engine(
+    automerging_index,
+    similarity_top_k=12,
+    rerank_top_n=2,
+):
+    base_retriever = automerging_index.as_retriever(similarity_top_k=similarity_top_k)
+    retriever = AutoMergingRetriever(
+        base_retriever, automerging_index.storage_context, verbose=True
+    )
+    rerank = SentenceTransformerRerank(
+        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
+    )
+    auto_merging_engine = RetrieverQueryEngine.from_args(
+        retriever, node_postprocessors=[rerank]
+    )
+    return auto_merging_engine
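Two remarks on this archived module. First, it imports AutoMergingRetrieverPack, yet get_automerging_query_engine references AutoMergingRetriever, which is never imported here, so that function would raise a NameError if called; the file is archived as-is rather than fixed. Second, for orientation, a minimal usage sketch of the recorder helper defined above (assuming an existing llama-index query_engine and OPENAI_API_KEY in the environment; it mirrors the flow in evaluate_model.py below):

    from trulens_eval import Tru
    from utils import get_prebuilt_trulens_recorder

    # Queries made inside the recorder are logged together with the
    # triad feedback scores defined at module import time.
    tru = Tru(database_file="./models/trulens_eval.sqlite")
    recorder = get_prebuilt_trulens_recorder(query_engine, app_id="Direct Query Engine")
    with recorder as recording:
        query_engine.query("Which is not a government healthcare philosophy?")

    records, feedback = tru.get_records_and_feedback(app_ids=[])
    print(records.head())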
archive/model_evaluation/utils_new.py ADDED
@@ -0,0 +1,95 @@
+import os
+import numpy as np
+from trulens_eval import (
+    Feedback,
+    TruLlama,
+    OpenAI
+)
+
+from trulens_eval.feedback import Groundedness
+import nest_asyncio
+
+from llama_index.core import ServiceContext, VectorStoreIndex, StorageContext
+from llama_index.core import load_index_from_storage
+from llama_index.core.node_parser import HierarchicalNodeParser
+from llama_index.core.node_parser import get_leaf_nodes
+from llama_index.core.query_engine import RetrieverQueryEngine
+
+from llama_index.packs.auto_merging_retriever.base import AutoMergingRetrieverPack
+
+
+nest_asyncio.apply()
+openai = OpenAI()
+
+qa_relevance = (
+    Feedback(openai.relevance_with_cot_reasons, name="Answer Relevance")
+    .on_input_output()
+)
+
+qs_relevance = (
+    Feedback(openai.relevance_with_cot_reasons, name = "Context Relevance")
+    .on_input()
+    .on(TruLlama.select_source_nodes().node.text)
+    .aggregate(np.mean)
+)
+
+#grounded = Groundedness(groundedness_provider=openai, summarize_provider=openai)
+grounded = Groundedness(groundedness_provider=openai)
+
+groundedness = (
+    Feedback(grounded.groundedness_measure_with_cot_reasons, name="Groundedness")
+    .on(TruLlama.select_source_nodes().node.text)
+    .on_output()
+    .aggregate(grounded.grounded_statements_aggregator)
+)
+
+feedbacks = [qa_relevance, qs_relevance, groundedness]
+
+def get_openai_api_key():
+    return os.getenv("OPENAI_API_KEY")
+
+def get_trulens_recorder(query_engine, feedbacks, app_id):
+    tru_recorder = TruLlama(
+        query_engine,
+        app_id=app_id,
+        feedbacks=feedbacks
+    )
+    return tru_recorder
+
+def get_prebuilt_trulens_recorder(query_engine, app_id):
+    tru_recorder = TruLlama(
+        query_engine,
+        app_id=app_id,
+        feedbacks=feedbacks
+    )
+    return tru_recorder
+
+def build_automerging_index(
+    documents,
+    llm,
+    embed_model="local:BAAI/bge-small-en-v1.5",
+    save_dir="merging_index",
+    chunk_sizes=None,
+):
+    chunk_sizes = chunk_sizes or [2048, 512, 128]
+    node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)
+    nodes = node_parser.get_nodes_from_documents(documents)
+    leaf_nodes = get_leaf_nodes(nodes)
+    merging_context = ServiceContext.from_defaults(
+        llm=llm,
+        embed_model=embed_model,
+    )
+    storage_context = StorageContext.from_defaults()
+    storage_context.docstore.add_documents(nodes)
+
+    if not os.path.exists(save_dir):
+        automerging_index = VectorStoreIndex(
+            leaf_nodes, storage_context=storage_context, service_context=merging_context
+        )
+        automerging_index.storage_context.persist(persist_dir=save_dir)
+    else:
+        automerging_index = load_index_from_storage(
+            StorageContext.from_defaults(persist_dir=save_dir),
+            service_context=merging_context,
+        )
+    return automerging_index
database/mock_qna_source.csv CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c80d88333c3b9fb2a700d49113d2ba3fef7cc671c11b640168c389bef411bc05
+size 7624
evaluate_model.py ADDED
@@ -0,0 +1,83 @@
+import os, time
+import pandas as pd
+from tqdm import tqdm
+
+import chromadb
+import openai
+from llama_index import (
+    SimpleDirectoryReader,
+    StorageContext,
+    Document,
+    VectorStoreIndex,
+    ServiceContext
+)
+
+from llama_index.vector_stores.chroma import ChromaVectorStore
+from llama_index.llms import OpenAI
+from llama_index.embeddings import HuggingFaceEmbedding
+from trulens_eval import Tru
+
+import utils
+from utils import get_prebuilt_trulens_recorder
+
+openai.api_key = utils.get_openai_api_key()
+
+def main():
+
+    start_time = time.time()
+
+    llm = OpenAI(model="gpt-3.5-turbo-1106", temperature=0.0)
+    fine_tuned_path = "local:./models/fine-tuned-embeddings"
+
+    db = chromadb.PersistentClient(path="./models/chroma_db")
+    chroma_collection = db.get_or_create_collection("quickstart")
+
+    # assign chroma as the vector_store to the context
+    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
+    storage_context = StorageContext.from_defaults(vector_store=vector_store)
+    service_context = ServiceContext.from_defaults(llm=llm, embed_model=fine_tuned_path)
+
+    print("Loading embeddings from vector store..")
+    index = VectorStoreIndex.from_vector_store(
+        vector_store=vector_store,
+        storage_context=storage_context,
+        service_context=service_context
+    )
+    query_engine = index.as_query_engine()
+
+    mock_qna_source = pd.read_csv("./database/mock_qna_source.csv")
+    mock_qna_source = mock_qna_source[ mock_qna_source["question"].notnull() ]
+    print("mock_qna_source.shape", mock_qna_source.shape)
+
+    with open("./raw_documents/eval_questions.txt", "r") as fp:
+        questions_content = fp.read()
+        questions_content_ls = questions_content.split("\n\n")
+
+    eval_questions = mock_qna_source["question"].tolist() + questions_content_ls
+    response = query_engine.query(eval_questions[0])
+    print(str(response))
+
+    tru = Tru(database_file="./models/trulens_eval.sqlite")
+    tru_recorder = get_prebuilt_trulens_recorder(query_engine,
+                                                 app_id="Direct Query Engine")
+
+    print("Sending each question to llm..")
+    with tru_recorder as recording:
+        for question in tqdm(eval_questions):
+            response = query_engine.query(question)
+
+    records, feedback = tru.get_records_and_feedback(app_ids=[])
+
+    os.makedirs("./results", exist_ok=True)
+    records.to_csv("./results/records.csv", index=False)
+
+    print(tru.db.engine.url.render_as_string(hide_password=False))
+
+    end_time = time.time()
+    time_spent_mins = (end_time - start_time) / 60
+    with open("./results/time_cost.txt", "w") as fp:
+        fp.write(f"Takes {int(time_spent_mins)} mins to create llm evaluation.")
+
+if __name__ == "__main__":
+
+    main()
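Assuming the fine-tuned embedding model and the Chroma store already exist under ./models/ and OPENAI_API_KEY is set, running python evaluate_model.py populates ./models/trulens_eval.sqlite with one record per question (the non-null mock Q&A rows plus eval_questions.txt) and writes ./results/records.csv and ./results/time_cost.txt.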
models/trulens_eval.sqlite ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6849488edfa526805c51322b217557de99ac01882a9d2a136a351a68c6b305d5
+size 2936832
notebooks/002_persisted-embedding-model.ipynb CHANGED
@@ -236,6 +236,7 @@
 "from llama_index.core import StorageContext\n",
 "from llama_index.core import ServiceContext\n",
 "from llama_index.core import Document\n",
+"from llama_index.core import Settings\n",
 "\n",
 "from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding\n",
 "from llama_index.llms.openai import OpenAI\n",
pages/1_Leaderboard.py CHANGED
@@ -31,7 +31,7 @@ database_url = None
 
 
 def streamlit_app():
-    tru = Tru(
+    tru = Tru(database_file="./models/trulens_eval.sqlite")
     lms = tru.db
 
     # Set the title and subtitle of the app
pages/2_Evaluations.py CHANGED
@@ -48,7 +48,7 @@ st.runtime.legacy_caching.clear_cache()
 
 add_logo_and_style_overrides()
 
-tru = Tru()
+tru = Tru(database_file="./models/trulens_eval.sqlite")
 lms = tru.db
 
 df_results, feedback_cols = lms.get_records_and_feedback([])
@@ -143,7 +143,7 @@ else:
 else:
     app = apps
 
-    st.
+    st.query_params["app"] = app
 
     options = st.multiselect("Filter Applications", apps, default=app)
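st.query_params is the stable query-parameter API Streamlit introduced in 1.30, consistent with the streamlit==1.31.1 pin later in this commit; the truncated st. line it replaces presumably used the older experimental query-params API.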
pages/3_app.py CHANGED
@@ -4,13 +4,11 @@ import os
 try:
     raw_docs_files = ", ".join(os.listdir("./raw_documents"))
     curr_directory_files = ", ".join(os.listdir("."))
 
-    with open("./results/time_cost.txt", "r") as fp:
-        time_cost_str = fp.read()
-
-    system_update = raw_docs_files + "\n\n" + curr_directory_files + "\n\n" + time_cost_str
-
+    with open("./raw_documents/eval_answers.txt", "r") as fp:
+        eval_answers = fp.read()
+
+    system_update = raw_docs_files + "\n\n" + curr_directory_files + "\n\n" + eval_answers
 except:
     system_update = "NA"
 
-st.write(f"Hello World!
+st.write(f"Hello World! Info about the app: {system_update}")
qna_prompting.py CHANGED
@@ -22,7 +22,11 @@ class QnA_Model(BaseModel):
         description=(
             "which chapter to extract, the format of this function argumet"
             "is with `Chapter_` as prefix concatenated with chapter number"
-            "in integer. For example, `Chapter_2`, `Chapter_10`."
+            "in integer. For example, `Chapter_2`, `Chapter_10`."
+            "if no chapter number specified or user requested for random question"
+            "or user has no preference over which chapter of textbook to be tested"
+            "return `Chapter_0`"
+        )
     )
 
 def get_qna_question(chapter_n: str) -> str:
@@ -37,11 +41,12 @@ def get_qna_question(chapter_n: str) -> str:
     """
     con = sqlite3.connect(db_path)
     cur = con.cursor()
-
-
-
-
-
+
+    filter_clause = "" if chapter_n == "Chapter_0" else f"WHERE chapter='{chapter_n}'"
+    sql_string = """SELECT id, question, option_1, option_2, option_3, option_4, correct_answer
+                    FROM qna_tbl
+                 """ + filter_clause
+
     res = cur.execute(sql_string)
     result = res.fetchone()
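One caveat with the new filter: chapter_n is interpolated into the SQL string with an f-string. It comes from a constrained function argument here, but a parameterized query is the safer habit; a minimal sketch of the same lookup using sqlite3's ? placeholders (same qna_tbl schema as above):

    # Same SELECT, but the chapter value is bound as a parameter
    # instead of being spliced into the SQL text.
    sql_string = (
        "SELECT id, question, option_1, option_2, option_3, option_4, correct_answer "
        "FROM qna_tbl"
    )
    if chapter_n != "Chapter_0":
        res = cur.execute(sql_string + " WHERE chapter=?", (chapter_n,))
    else:
        res = cur.execute(sql_string)
    result = res.fetchone()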
raw_documents/eval_answers.txt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:0ac533f41fb123fe9281d27f2a3166e997f09c37178d12f5cbbea1fedeb5026b
+size 1458

raw_documents/eval_questions.txt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:7bae3f2ac0cf2fdb2f58de8ecaa8d63014a4f84aa8a839dc7ff0d4ae8eb0eb22
+size 1126

raw_documents/qna.txt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:59cc1b620ccad1393fc4311e91e538774ac76149a42bd2391af7c855895d80bc
+size 56746
requirements.txt CHANGED
@@ -99,8 +99,8 @@ kiwisolver==1.4.5
 kubernetes==29.0.0
 langchain==0.0.354
 langchain-community==0.0.8
-langchain-core==0.1.
-langsmith==0.0.
+langchain-core==0.1.23
+langsmith==0.0.87
 llama-index==0.10.1
 llama-index-agent-openai==0.1.1
 llama-index-core==0.10.1
@@ -109,6 +109,7 @@ llama-index-embeddings-openai==0.1.1
 llama-index-legacy==0.9.48
 llama-index-llms-openai==0.1.1
 llama-index-multi-modal-llms-openai==0.1.1
+llama-index-packs-auto-merging-retriever==0.1.2
 llama-index-program-openai==0.1.1
 llama-index-question-gen-openai==0.1.1
 llama-index-readers-file==0.1.2
@@ -218,7 +219,7 @@ SQLAlchemy==2.0.24
 st-annotated-text==4.0.1
 stack-data==0.6.3
 starlette==0.35.1
-streamlit==1.
+streamlit==1.31.1
 streamlit-aggrid==0.3.4.post3
 streamlit-camera-input-live==0.2.0
 streamlit-card==1.0.0
@@ -246,7 +247,7 @@ tqdm==4.66.1
 traitlets==5.14.0
 transformers==4.37.2
 trulens==0.13.4
-trulens-eval==0.
+trulens-eval==0.22.2
 typer==0.9.0
 types-python-dateutil==2.8.19.14
 typing-inspect==0.9.0
streamlit_app.py CHANGED
@@ -71,7 +71,7 @@ with st.sidebar:
 
     st.subheader("Models and parameters")
     selected_model = st.sidebar.selectbox("Choose an OpenAI model",
-                                          ["gpt-3.5-turbo-
+                                          ["gpt-3.5-turbo-0125", "gpt-4-0125-preview"],
                                           key="selected_model")
    temperature = st.sidebar.slider("temperature", min_value=0.0, max_value=2.0,
                                    value=0.0, step=0.01)
utils.py CHANGED
@@ -5,27 +5,18 @@ from trulens_eval import (
     TruLlama,
     OpenAI
 )
-
 from trulens_eval.feedback import Groundedness
-import nest_asyncio
 
 from llama_index import ServiceContext, VectorStoreIndex, StorageContext
-from llama_index.node_parser import SentenceWindowNodeParser
-from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
-from llama_index.indices.postprocessor import SentenceTransformerRerank
 from llama_index import load_index_from_storage
-
 from llama_index.node_parser import HierarchicalNodeParser
 from llama_index.node_parser import get_leaf_nodes
 from llama_index import StorageContext
-from llama_index.retrievers import AutoMergingRetriever
-from llama_index.indices.postprocessor import SentenceTransformerRerank
-from llama_index.query_engine import RetrieverQueryEngine
-
 
+import nest_asyncio
 nest_asyncio.apply()
-openai = OpenAI()
 
+openai = OpenAI()
 qa_relevance = (
     Feedback(openai.relevance_with_cot_reasons, name="Answer Relevance")
     .on_input_output()
@@ -69,49 +60,6 @@ def get_prebuilt_trulens_recorder(query_engine, app_id):
     )
     return tru_recorder
 
-def build_sentence_window_index(
-    document, llm, embed_model="local:BAAI/bge-small-en-v1.5", save_dir="sentence_index"
-):
-    # create the sentence window node parser w/ default settings
-    node_parser = SentenceWindowNodeParser.from_defaults(
-        window_size=3,
-        window_metadata_key="window",
-        original_text_metadata_key="original_text",
-    )
-    sentence_context = ServiceContext.from_defaults(
-        llm=llm,
-        embed_model=embed_model,
-        node_parser=node_parser,
-    )
-    if not os.path.exists(save_dir):
-        sentence_index = VectorStoreIndex.from_documents(
-            [document], service_context=sentence_context
-        )
-        sentence_index.storage_context.persist(persist_dir=save_dir)
-    else:
-        sentence_index = load_index_from_storage(
-            StorageContext.from_defaults(persist_dir=save_dir),
-            service_context=sentence_context,
-        )
-
-    return sentence_index
-
-def get_sentence_window_query_engine(
-    sentence_index,
-    similarity_top_k=6,
-    rerank_top_n=2,
-):
-    # define postprocessors
-    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
-    rerank = SentenceTransformerRerank(
-        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
-    )
-
-    sentence_window_engine = sentence_index.as_query_engine(
-        similarity_top_k=similarity_top_k, node_postprocessors=[postproc, rerank]
-    )
-    return sentence_window_engine
-
 def build_automerging_index(
     documents,
     llm,
@@ -140,21 +88,4 @@ def build_automerging_index(
         StorageContext.from_defaults(persist_dir=save_dir),
         service_context=merging_context,
     )
-    return automerging_index
-
-def get_automerging_query_engine(
-    automerging_index,
-    similarity_top_k=12,
-    rerank_top_n=2,
-):
-    base_retriever = automerging_index.as_retriever(similarity_top_k=similarity_top_k)
-    retriever = AutoMergingRetriever(
-        base_retriever, automerging_index.storage_context, verbose=True
-    )
-    rerank = SentenceTransformerRerank(
-        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
-    )
-    auto_merging_engine = RetrieverQueryEngine.from_args(
-        retriever, node_postprocessors=[rerank]
-    )
-    return auto_merging_engine
+    return automerging_index