Spaces: nsthorat-lilac (runtime error)
Commit bfc0ec6: Duplicate from lilacai/nikhil_staging
This view is limited to 50 files because the commit contains too many changes.
- .dockerignore +10 -0
- .env +40 -0
- .env.demo +4 -0
- .gitignore +5 -0
- Dockerfile +27 -0
- LICENSE +201 -0
- README.md +12 -0
- data/.cache/lilac/concept/lilac/negative-sentiment/gte-small.pkl +0 -0
- data/.cache/lilac/concept/lilac/non-english/gte-small.pkl +0 -0
- data/.cache/lilac/concept/lilac/source-code/gte-small.pkl +0 -0
- demo_config.yml +3 -0
- docker_start.py +111 -0
- docker_start.sh +10 -0
- lilac/.gitignore +1 -0
- lilac/__init__.py +42 -0
- lilac/auth.py +87 -0
- lilac/batch_utils.py +92 -0
- lilac/cli.py +47 -0
- lilac/concepts/__init__.py +12 -0
- lilac/concepts/concept.py +339 -0
- lilac/concepts/db_concept.py +567 -0
- lilac/concepts/legal-termination/concept.json +185 -0
- lilac/concepts/negative-sentiment/concept.json +634 -0
- lilac/concepts/non-english/concept.json +1024 -0
- lilac/concepts/positive-sentiment/concept.json +564 -0
- lilac/concepts/profanity/concept.json +0 -0
- lilac/concepts/question/concept.json +0 -0
- lilac/concepts/source-code/concept.json +389 -0
- lilac/concepts/toxicity/concept.json +0 -0
- lilac/config.py +268 -0
- lilac/conftest.py +28 -0
- lilac/data/__init__.py +25 -0
- lilac/data/dataset.py +510 -0
- lilac/data/dataset_duckdb.py +1833 -0
- lilac/data/dataset_test_utils.py +153 -0
- lilac/data/dataset_utils.py +313 -0
- lilac/data_loader.py +110 -0
- lilac/db_manager.py +96 -0
- lilac/embeddings/__init__.py +7 -0
- lilac/embeddings/cohere.py +59 -0
- lilac/embeddings/default_vector_stores.py +10 -0
- lilac/embeddings/embedding.py +110 -0
- lilac/embeddings/gte.py +63 -0
- lilac/embeddings/openai.py +68 -0
- lilac/embeddings/palm.py +62 -0
- lilac/embeddings/sbert.py +38 -0
- lilac/embeddings/transformer_utils.py +35 -0
- lilac/embeddings/vector_store.py +201 -0
- lilac/embeddings/vector_store_hnsw.py +112 -0
- lilac/embeddings/vector_store_numpy.py +92 -0
.dockerignore
ADDED
@@ -0,0 +1,10 @@
+# Python
+**/__pycache__
+**/*.pyc
+**/*.pyo
+**/*.pyd
+# Ignore unit tests.
+**/*_test.py
+
+# Mac OS.
+.DS_Store
.env
ADDED
@@ -0,0 +1,40 @@
+# To overwrite these variables, create a .env.local file
+
+# The path to the directory where the data will be downloaded on machine
+LILAC_DATA_PATH=./data
+
+# Set to 1 for duckdb to use views instead of materialized tables (lower memory usage, but slower).
+DUCKDB_USE_VIEWS=0
+
+# Set to true to enable read-only mode, disabling the ability to add datasets & compute dataset
+# signals.
+# LILAC_AUTH_ENABLED=true
+
+# Variables that can be set in .env.local
+#
+# Get key from https://dashboard.cohere.ai/api-keys
+# COHERE_API_KEY=
+
+# GCS_REGION=
+# GCS_ACCESS_KEY=
+# GCS_SECRET_KEY=
+
+# Get key from https://platform.openai.com/account/api-keys
+# OPENAI_API_KEY=
+# Get key from https://makersuite.google.com/app/apikey
+# PALM_API_KEY=
+
+# HuggingFace demos: machine that uploads to HuggingFace.
+
+# For authenticating with HuggingFace to deploy to a Space.
+# HF_USERNAME=
+# The default repo to deploy to for a staging demo. Can be overridden by a command line flag.
+# HF_STAGING_DEMO_REPO='HF_ORG/HF_REPO_NAME'
+
+# For Google-login. This is generated from the Google Cloud Console for a web client.
+# See: https://developers.google.com/identity/protocols/oauth2
+GOOGLE_CLIENT_ID='279475920249-i8llm8vbos1vj5m1qocir8narb3r0enu.apps.googleusercontent.com'
+# The client secret of the above client.
+# GOOGLE_CLIENT_SECRET=
+# A random string for oauth sessions.
+# LILAC_OAUTH_SECRET_KEY=
.env.demo
ADDED
@@ -0,0 +1,4 @@
+LILAC_DATA_PATH='/data'
+HF_HOME=/data/.huggingface
+TRANSFORMERS_CACHE=/data/.cache
+XDG_CACHE_HOME=/data/.cache
.gitignore
ADDED
@@ -0,0 +1,5 @@
+__pycache__/
+**/*.pyc
+**/*.pyo
+**/*.pyd
+**/*_test.py
Dockerfile
ADDED
@@ -0,0 +1,27 @@
+# NOTE: When we upgrade to 3.11 we can use a slimmer docker image which comes with gcc.
+FROM python:3.9-bullseye
+
+# Allow statements and log messages to immediately appear in the Knative logs
+ENV PYTHONUNBUFFERED True
+
+# Set the working directory in the container.
+WORKDIR /server
+
+# Install the dependencies. This requires exporting requirements.txt from poetry first, which
+# happens from ./build_docker.sh.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY .env .
+COPY .env.demo .
+COPY demo_config.yml .
+# Copy the README so we can read the datasets from the HuggingFace config.
+COPY README.md .
+COPY LICENSE .
+
+# Copy python files.
+COPY /lilac ./lilac/
+
+COPY docker_start.sh docker_start.py ./
+
+CMD ["bash", "docker_start.sh"]
LICENSE
ADDED
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2023 Lilac AI Inc.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
README.md
ADDED
@@ -0,0 +1,12 @@
+---
+app_port: 5432
+colorFrom: purple
+colorTo: purple
+datasets:
+- lilacai/nikhil_staging-local-glue
+- lilacai/nikhil_staging-local-imdb
+emoji: 🌷
+sdk: docker
+title: Lilac
+duplicated_from: lilacai/nikhil_staging
+---
data/.cache/lilac/concept/lilac/negative-sentiment/gte-small.pkl
ADDED
Binary file (202 kB)
data/.cache/lilac/concept/lilac/non-english/gte-small.pkl
ADDED
Binary file (331 kB)
data/.cache/lilac/concept/lilac/source-code/gte-small.pkl
ADDED
Binary file (126 kB)
demo_config.yml
ADDED
@@ -0,0 +1,3 @@
+lilac_hf_datasets:
+- {hf_dataset_repo_id: lilacai/nikhil_staging-local-glue, lilac_name: glue, lilac_namespace: local}
+- {hf_dataset_repo_id: lilacai/nikhil_staging-local-imdb, lilac_name: imdb, lilac_namespace: local}
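The config above pairs each mirrored HuggingFace dataset repo with the namespace and name Lilac serves it under. A minimal sketch of reading that mapping, assuming only the structure shown in the file:

import yaml

# Parse demo_config.yml and print each repo -> lilac namespace/name pair.
with open('demo_config.yml') as f:
  config = yaml.safe_load(f)

for entry in config['lilac_hf_datasets']:
  print(entry['hf_dataset_repo_id'], '->',
        f"{entry['lilac_namespace']}/{entry['lilac_name']}")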
docker_start.py
ADDED
@@ -0,0 +1,111 @@
+"""Startup work before running the web server."""
+
+import os
+import shutil
+from typing import TypedDict
+
+import yaml
+from huggingface_hub import scan_cache_dir, snapshot_download
+
+from lilac.concepts.db_concept import CONCEPTS_DIR, DiskConceptDB, get_concept_output_dir
+from lilac.env import data_path, env
+from lilac.utils import get_datasets_dir, get_lilac_cache_dir, log
+
+
+def delete_old_files() -> None:
+  """Delete old files from the cache."""
+  # Scan cache
+  try:
+    scan = scan_cache_dir()
+  except BaseException:
+    # Cache was not found.
+    return
+
+  # Select revisions to delete
+  to_delete = []
+  for repo in scan.repos:
+    latest_revision = max(repo.revisions, key=lambda x: x.last_modified)
+    to_delete.extend(
+      [revision.commit_hash for revision in repo.revisions if revision != latest_revision])
+  strategy = scan.delete_revisions(*to_delete)
+
+  # Delete them
+  log(f'Will delete {len(to_delete)} old revisions and save {strategy.expected_freed_size_str}')
+  strategy.execute()
+
+
+class HfSpaceConfig(TypedDict):
+  """The huggingface space config, defined in README.md.
+
+  See:
+  https://huggingface.co/docs/hub/spaces-config-reference
+  """
+  title: str
+  datasets: list[str]
+
+
+def main() -> None:
+  """Download dataset files from the HF space that was uploaded before building the image."""
+  # SPACE_ID is the HuggingFace Space ID environment variable that is automatically set by HF.
+  repo_id = env('SPACE_ID', None)
+  if not repo_id:
+    return
+
+  delete_old_files()
+
+  print('readme:', os.path.abspath('README.md'))
+  with open(os.path.abspath('README.md')) as f:
+    print(f.read())
+
+  with open(os.path.abspath('README.md')) as f:
+    # Strip the '---' for the huggingface readme config.
+    readme = f.read().strip('---')
+    hf_config: HfSpaceConfig = yaml.safe_load(readme)
+
+  # Download the huggingface space data. This includes code and datasets, so we move the datasets
+  # alone to the data directory.
+  for lilac_hf_dataset in hf_config['datasets']:
+    print('Downloading dataset from HuggingFace: ', lilac_hf_dataset)
+    snapshot_download(
+      repo_id=lilac_hf_dataset,
+      repo_type='dataset',
+      token=env('HF_ACCESS_TOKEN'),
+      local_dir=get_datasets_dir(data_path()),
+      ignore_patterns=['.gitattributes', 'README.md'])
+
+  snapshot_dir = snapshot_download(repo_id=repo_id, repo_type='space', token=env('HF_ACCESS_TOKEN'))
+  # # Copy datasets.
+  spaces_data_dir = os.path.join(snapshot_dir, 'data')
+
+  # Delete cache files from persistent storage.
+  cache_dir = get_lilac_cache_dir(data_path())
+  if os.path.exists(cache_dir):
+    shutil.rmtree(cache_dir)
+
+  # NOTE: This is temporary during the move of concepts into the pip package. Once all the demos
+  # have been updated, this block can be deleted.
+  old_lilac_concepts_data_dir = os.path.join(data_path(), CONCEPTS_DIR, 'lilac')
+  if os.path.exists(old_lilac_concepts_data_dir):
+    shutil.rmtree(old_lilac_concepts_data_dir)
+
+  # Copy cache files from the space if they exist.
+  spaces_cache_dir = get_lilac_cache_dir(spaces_data_dir)
+  if os.path.exists(spaces_cache_dir):
+    shutil.copytree(spaces_cache_dir, cache_dir)
+
+  # Copy concepts.
+  concepts = DiskConceptDB(spaces_data_dir).list()
+  for concept in concepts:
+    # Ignore lilac concepts, they're already part of the source code.
+    if concept.namespace == 'lilac':
+      continue
+    spaces_concept_output_dir = get_concept_output_dir(spaces_data_dir, concept.namespace,
+                                                       concept.name)
+    persistent_output_dir = get_concept_output_dir(data_path(), concept.namespace, concept.name)
+    shutil.rmtree(persistent_output_dir, ignore_errors=True)
+    shutil.copytree(spaces_concept_output_dir, persistent_output_dir, dirs_exist_ok=True)
+    shutil.rmtree(spaces_concept_output_dir, ignore_errors=True)
+
+
+if __name__ == '__main__':
+  main()
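Note that main() pulls the Space configuration out of the README front-matter rather than a separate config file, via f.read().strip('---'). A standalone sketch of that parsing step with an inline README string for illustration (the field names mirror the HfSpaceConfig TypedDict above):

import yaml

readme = """---
app_port: 5432
datasets:
- lilacai/nikhil_staging-local-glue
- lilacai/nikhil_staging-local-imdb
sdk: docker
title: Lilac
---"""

# str.strip('---') removes the leading/trailing runs of '-' characters,
# leaving just the YAML body for yaml.safe_load.
hf_config = yaml.safe_load(readme.strip('---'))
assert hf_config['title'] == 'Lilac'
assert len(hf_config['datasets']) == 2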
docker_start.sh
ADDED
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Fail if any of the commands below fail.
+set -e
+
+python docker_start.py
+gunicorn lilac.server:app \
+  --bind 0.0.0.0:5432 \
+  --preload -k uvicorn.workers.UvicornWorker \
+  --timeout 120
lilac/.gitignore
ADDED
@@ -0,0 +1 @@
+web/
lilac/__init__.py
ADDED
@@ -0,0 +1,42 @@
+from importlib import metadata
+
+from .concepts import *  # noqa: F403
+from .config import DatasetConfig, DatasetSettings, EmbeddingConfig, SignalConfig
+from .data import *  # noqa: F403
+from .data.dataset_duckdb import DatasetDuckDB
+from .data_loader import create_dataset
+from .db_manager import get_dataset, set_default_dataset_cls
+from .embeddings import *  # noqa: F403
+from .embeddings.default_vector_stores import register_default_vector_stores
+from .schema import *  # noqa: F403
+from .server import start_server, stop_server
+from .signals import *  # noqa: F403
+from .signals.default_signals import register_default_signals
+from .sources import *  # noqa: F403
+from .sources.default_sources import register_default_sources
+from .splitters import *  # noqa: F403
+
+try:
+  __version__ = metadata.version('lilacai')
+except metadata.PackageNotFoundError:
+  __version__ = ''
+
+register_default_sources()
+register_default_signals()
+register_default_vector_stores()
+set_default_dataset_cls(DatasetDuckDB)
+
+# Avoids polluting the results of dir(__package__).
+del (metadata, register_default_sources, register_default_signals, set_default_dataset_cls,
+     DatasetDuckDB)
+
+__all__ = [
+  'start_server',
+  'stop_server',
+  'create_dataset',
+  'get_dataset',
+  'DatasetConfig',
+  'EmbeddingConfig',
+  'SignalConfig',
+  'DatasetSettings',
+]
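The package wires up its defaults at import time (sources, signals, vector stores, DuckDB as the dataset backend) and then exposes a small public surface via __all__. A hedged usage sketch relying only on names exported here and the start_server(host, port, open) signature visible in lilac/cli.py below; DatasetConfig's fields are not shown in this diff, so none are assumed:

import lilac

# Importing the package already registered the default sources/signals
# and set DatasetDuckDB as the dataset backend.
print(lilac.__version__)  # '' when running from source without the pip package.

# Serve the UI on the same defaults the CLI uses.
lilac.start_server(host='0.0.0.0', port=5432, open=False)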
lilac/auth.py
ADDED
@@ -0,0 +1,87 @@
+"""Authentication and ACL configuration."""
+
+from typing import Optional
+
+from fastapi import Request
+from pydantic import BaseModel, ValidationError
+
+from .env import env
+
+
+class ConceptAuthorizationException(Exception):
+  """Authorization exceptions thrown by the concept database."""
+  pass
+
+
+class DatasetUserAccess(BaseModel):
+  """User access for datasets."""
+  # Whether the user can compute a signal.
+  compute_signals: bool
+  # Whether the user can delete a dataset.
+  delete_dataset: bool
+  # Whether the user can delete a signal.
+  delete_signals: bool
+  # Whether the user can update settings.
+  update_settings: bool
+
+
+class ConceptUserAccess(BaseModel):
+  """User access for concepts."""
+  # Whether the user can delete any concept (not their own).
+  delete_any_concept: bool
+
+
+class UserAccess(BaseModel):
+  """User access."""
+  create_dataset: bool
+
+  # TODO(nsthorat): Make this keyed to each dataset and concept.
+  dataset: DatasetUserAccess
+  concept: ConceptUserAccess
+
+
+class UserInfo(BaseModel):
+  """User information."""
+  id: str
+  email: str
+  name: str
+  given_name: str
+  family_name: str
+
+
+class AuthenticationInfo(BaseModel):
+  """Authentication information for the user."""
+  user: Optional[UserInfo] = None
+  access: UserAccess
+  auth_enabled: bool
+
+
+def get_session_user(request: Request) -> Optional[UserInfo]:
+  """Get the user from the session."""
+  if not env('LILAC_AUTH_ENABLED'):
+    return None
+  user_info_dict = request.session.get('user', None)
+  if user_info_dict:
+    try:
+      return UserInfo.parse_obj(user_info_dict)
+    except ValidationError:
+      return None
+  return None
+
+
+def get_user_access() -> UserAccess:
+  """Get the user access."""
+  auth_enabled = env('LILAC_AUTH_ENABLED')
+  if isinstance(auth_enabled, str):
+    auth_enabled = auth_enabled.lower() == 'true'
+  if auth_enabled:
+    return UserAccess(
+      create_dataset=False,
+      dataset=DatasetUserAccess(
+        compute_signals=False, delete_dataset=False, delete_signals=False, update_settings=False),
+      concept=ConceptUserAccess(delete_any_concept=False))
+  return UserAccess(
+    create_dataset=True,
+    dataset=DatasetUserAccess(
+      compute_signals=True, delete_dataset=True, delete_signals=True, update_settings=True),
+    concept=ConceptUserAccess(delete_any_concept=True))
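get_user_access collapses the ACL story into two modes: everything allowed when auth is off, read-only when it is on. A sketch of that behavior, under the assumption that lilac's env() helper (not shown in this diff) falls back to os.environ:

import os

# Assumption: env() reads os.environ, so set the flag before importing.
os.environ['LILAC_AUTH_ENABLED'] = 'true'

from lilac.auth import get_user_access

access = get_user_access()
# With auth enabled, all mutating operations are locked down.
assert access.create_dataset is False
assert access.dataset.compute_signals is False
assert access.concept.delete_any_concept is False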
lilac/batch_utils.py
ADDED
@@ -0,0 +1,92 @@
+"""Utils for the python server."""
+import itertools
+from typing import Any, Callable, Generator, Iterable, Iterator, TypeVar, Union, cast
+
+from .schema import Item
+from .utils import chunks, is_primitive
+
+
+def _deep_flatten(input: Union[Iterator, object],
+                  is_primitive_predicate: Callable[[object], bool]) -> Generator:
+  """Flattens a nested iterable."""
+  if is_primitive_predicate(input):
+    yield input
+  elif isinstance(input, dict):
+    yield input
+  elif is_primitive(input):
+    yield input
+  else:
+    for elem in cast(Iterator, input):
+      yield from _deep_flatten(elem, is_primitive_predicate)
+
+
+def deep_flatten(input: Union[Iterator, Iterable],
+                 is_primitive_predicate: Callable[[object], bool] = is_primitive) -> Iterator:
+  """Flattens a deeply nested iterator.
+
+  Primitives and dictionaries are not flattened. The user can also provide a predicate to determine
+  what is a primitive.
+  """
+  return _deep_flatten(input, is_primitive_predicate)
+
+
+def _deep_unflatten(flat_input: Iterator[list[object]], original_input: Union[Iterable, object],
+                    is_primitive_predicate: Callable[[object], bool]) -> Union[list, dict]:
+  """Unflattens a deeply flattened iterable according to the original iterable's structure."""
+  if is_primitive_predicate(original_input):
+    return next(flat_input)
+  else:
+    values: Iterable
+    if isinstance(original_input, dict):
+      values = original_input.values()
+    else:
+      values = cast(Iterable, original_input)
+    return [_deep_unflatten(flat_input, orig_elem, is_primitive_predicate) for orig_elem in values]
+
+
+def deep_unflatten(flat_input: Union[Iterable, Iterator],
+                   original_input: Union[Iterable, object],
+                   is_primitive_predicate: Callable[[object], bool] = is_primitive) -> list:
+  """Unflattens a deeply flattened iterable according to the original iterable's structure."""
+  return cast(list, _deep_unflatten(iter(flat_input), original_input, is_primitive_predicate))
+
+
+TFlatten = TypeVar('TFlatten')
+
+
+def flatten(inputs: Iterable[Iterable[TFlatten]]) -> Iterator[TFlatten]:
+  """Flattens a nested iterator.
+
+  Only supports flattening one level deep.
+  """
+  for input in inputs:
+    yield from input
+
+
+TUnflatten = TypeVar('TUnflatten')
+
+
+def unflatten(flat_inputs: Union[Iterable[TUnflatten], Iterator[TUnflatten]],
+              original_inputs: Iterable[Iterable[Any]]) -> Iterator[list[TUnflatten]]:
+  """Unflattens a flattened iterable according to the original iterable's structure."""
+  flat_inputs_iter = iter(flat_inputs)
+  for original_input in original_inputs:
+    yield [next(flat_inputs_iter) for _ in original_input]
+
+
+TFlatBatchedInput = TypeVar('TFlatBatchedInput')
+TFlatBatchedOutput = TypeVar('TFlatBatchedOutput')
+
+
+def flat_batched_compute(input: Iterable[Iterable[TFlatBatchedInput]],
+                         f: Callable[[list[TFlatBatchedInput]], Iterable[TFlatBatchedOutput]],
+                         batch_size: int) -> Iterable[Iterable[TFlatBatchedOutput]]:
+  """Flatten the input, batched call f, and return the output unflattened."""
+  # Tee the input so we can use it twice for the input and output shapes.
+  input_1, input_2 = itertools.tee(input, 2)
+  batches = chunks(flatten(input_1), batch_size)
+  batched_outputs = flatten((f(batch) for batch in batches))
+  return unflatten(batched_outputs, input_2)
+
+
+TBatchSpanVectorOutput = TypeVar('TBatchSpanVectorOutput', bound=Item)
lilac/cli.py
ADDED
@@ -0,0 +1,47 @@
+"""Lilac CLI."""
+
+import click
+
+from . import __version__
+from .concepts.db_concept import DISK_CONCEPT_DB
+from .load import load_command as load
+from .server import start_server
+
+
+@click.command()
+@click.option(
+  '--host',
+  help='The host address where the web server will listen to.',
+  default='0.0.0.0',
+  type=str)
+@click.option('--port', help='The port number of the web-server', type=int, default=5432)
+def start(host: str, port: int) -> None:
+  """Starts the Lilac web server."""
+  start_server(host=host, port=port, open=True)
+
+
+@click.command()
+def version() -> None:
+  """Prints the version of Lilac."""
+  print(__version__)
+
+
+@click.command()
+def concepts() -> None:
+  """Lists lilac concepts."""
+  print(DISK_CONCEPT_DB.list())
+
+
+@click.group()
+def cli() -> None:
+  """Lilac CLI."""
+  pass
+
+
+cli.add_command(start)
+cli.add_command(version)
+cli.add_command(load)
+cli.add_command(concepts)
+
+if __name__ == '__main__':
+  cli()
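Since the commands hang off a click group, the CLI can also be exercised programmatically, e.g. in tests. A sketch using click's standard CliRunner (not something this commit itself sets up):

from click.testing import CliRunner

from lilac.cli import cli

runner = CliRunner()
# Equivalent to running `lilac version` in a shell.
result = runner.invoke(cli, ['version'])
print(result.output)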
lilac/concepts/__init__.py
ADDED
@@ -0,0 +1,12 @@
+"""Concepts are customizable signals that help enrich documents."""
+
+from .concept import Example, ExampleIn
+from .db_concept import ConceptUpdate, DiskConceptDB, DiskConceptModelDB
+
+__all__ = [
+  'DiskConceptDB',
+  'DiskConceptModelDB',
+  'Example',
+  'ExampleIn',
+  'ConceptUpdate',
+]
lilac/concepts/concept.py
ADDED
@@ -0,0 +1,339 @@
+"""Defines the concept and the concept models."""
+import dataclasses
+from enum import Enum
+from typing import Callable, Literal, Optional, Union
+
+import numpy as np
+from joblib import Parallel, delayed
+from pydantic import BaseModel, validator
+from scipy.interpolate import interp1d
+from sklearn.base import clone
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import precision_recall_curve, roc_auc_score
+from sklearn.model_selection import KFold
+
+from ..embeddings.embedding import get_embed_fn
+from ..signal import TextEmbeddingSignal, get_signal_cls
+from ..utils import DebugTimer
+
+LOCAL_CONCEPT_NAMESPACE = 'local'
+
+# The maximum number of cross-validation models to train.
+MAX_NUM_CROSS_VAL_MODELS = 15
+# The β weight to use for the F-beta score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.fbeta_score.html
+# β = 0.5 means we value precision 2x as much as recall.
+# β = 2 means we value recall 2x as much as precision.
+F_BETA_WEIGHT = 0.5
+
+
+class ConceptType(str, Enum):
+  """Enum holding the concept type."""
+  TEXT = 'text'
+  IMAGE = 'image'
+
+  def __repr__(self) -> str:
+    return self.value
+
+
+class ExampleOrigin(BaseModel):
+  """The origin of an example."""
+  # The namespace that holds the dataset.
+  dataset_namespace: str
+
+  # The name of the dataset.
+  dataset_name: str
+
+  # The id of row in the dataset that the example was added from.
+  dataset_row_id: str
+
+
+DraftId = Union[Literal['main'], str]
+DRAFT_MAIN = 'main'
+
+
+class ExampleIn(BaseModel):
+  """An example in a concept without the id (used for adding new examples)."""
+  label: bool
+  text: Optional[str] = None
+  img: Optional[bytes] = None
+  origin: Optional[ExampleOrigin] = None
+  # The name of the draft to put the example in. If None, puts it in the main draft.
+  draft: Optional[DraftId] = DRAFT_MAIN
+
+  @validator('text')
+  def parse_text(cls, text: str) -> str:
+    """Fixes surrogate errors in text: https://github.com/ijl/orjson/blob/master/README.md#str ."""
+    return text.encode('utf-8', 'replace').decode('utf-8')
+
+
+class Example(ExampleIn):
+  """A single example in a concept used for training a concept model."""
+  id: str
+
+
+class Concept(BaseModel):
+  """A concept is a collection of examples."""
+  # The namespace of the concept.
+  namespace: str
+  # The name of the concept.
+  concept_name: str
+  # The type of the data format that this concept represents.
+  type: ConceptType
+  data: dict[str, Example]
+  version: int = 0
+
+  tags: list[str] = []
+  description: Optional[str] = None
+
+  def drafts(self) -> list[DraftId]:
+    """Gets all the drafts for the concept."""
+    drafts: set[DraftId] = set([DRAFT_MAIN])  # Always return the main draft.
+    for example in self.data.values():
+      if example.draft:
+        drafts.add(example.draft)
+    return list(sorted(drafts))
+
+
+class OverallScore(str, Enum):
+  """Enum holding the overall score."""
+  NOT_GOOD = 'not_good'
+  OK = 'ok'
+  GOOD = 'good'
+  VERY_GOOD = 'very_good'
+  GREAT = 'great'
+
+
+def _get_overall_score(f1_score: float) -> OverallScore:
+  if f1_score < 0.5:
+    return OverallScore.NOT_GOOD
+  if f1_score < 0.8:
+    return OverallScore.OK
+  if f1_score < 0.9:
+    return OverallScore.GOOD
+  if f1_score < 0.95:
+    return OverallScore.VERY_GOOD
+  return OverallScore.GREAT
+
+
+class ConceptMetrics(BaseModel):
+  """Metrics for a concept."""
+  # The average F1 score for the concept computed using cross validation.
+  f1: float
+  precision: float
+  recall: float
+  roc_auc: float
+  overall: OverallScore
+
+
+@dataclasses.dataclass
+class LogisticEmbeddingModel:
+  """A model that uses logistic regression with embeddings."""
+
+  _metrics: Optional[ConceptMetrics] = None
+  _threshold: float = 0.5
+
+  def __post_init__(self) -> None:
+    # See `notebooks/Toxicity.ipynb` for an example of training a concept model.
+    self._model = LogisticRegression(
+      class_weight='balanced', C=30, tol=1e-5, warm_start=True, max_iter=5_000, n_jobs=-1)
+
+  def score_embeddings(self, embeddings: np.ndarray) -> np.ndarray:
+    """Get the scores for the provided embeddings."""
+    y_probs = self._model.predict_proba(embeddings)[:, 1]
+    # Map [0, threshold, 1] to [0, 0.5, 1].
+    interpolate_fn = interp1d([0, self._threshold, 1], [0, 0.4999, 1])
+    return interpolate_fn(y_probs)
+
+  def _setup_training(self, X_train: np.ndarray,
+                      labels: Union[list[bool], np.ndarray]) -> tuple[np.ndarray, np.ndarray]:
+    y_train = np.array(labels)
+    # Shuffle the data in unison.
+    p = np.random.permutation(len(X_train))
+    X_train = X_train[p]
+    y_train = y_train[p]
+    return X_train, y_train
+
+  def fit(self, embeddings: np.ndarray, labels: list[bool]) -> None:
+    """Fit the model to the provided embeddings and labels."""
+    label_set = set(labels)
+    if len(label_set) < 2:
+      dim = embeddings.shape[1]
+      random_vector = np.random.randn(dim).astype(np.float32)
+      random_vector /= np.linalg.norm(random_vector)
+      embeddings = np.vstack([embeddings, random_vector])
+      labels.append(False if True in label_set else True)
+
+    if len(labels) != len(embeddings):
+      raise ValueError(
+        f'Length of embeddings ({len(embeddings)}) must match length of labels ({len(labels)})')
+    X_train, y_train = self._setup_training(embeddings, labels)
+    self._model.fit(X_train, y_train)
+    self._metrics, self._threshold = self._compute_metrics(embeddings, labels)
+
+  def _compute_metrics(self, embeddings: np.ndarray,
+                       labels: list[bool]) -> tuple[Optional[ConceptMetrics], float]:
+    """Return the concept metrics."""
+    labels_np = np.array(labels)
+    n_splits = min(len(labels_np), MAX_NUM_CROSS_VAL_MODELS)
+    fold = KFold(n_splits, shuffle=True, random_state=42)
+
+    def _fit_and_score(model: LogisticRegression, X_train: np.ndarray, y_train: np.ndarray,
+                       X_test: np.ndarray, y_test: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+      if len(set(y_train)) < 2:
+        return np.array([]), np.array([])
+      model.fit(X_train, y_train)
+      y_pred = model.predict_proba(X_test)[:, 1]
+      return y_test, y_pred
+
+    # Compute the metrics for each validation fold in parallel.
+    jobs: list[Callable] = []
+    for (train_index, test_index) in fold.split(embeddings):
+      X_train, y_train = embeddings[train_index], labels_np[train_index]
+      X_train, y_train = self._setup_training(X_train, y_train)
+      X_test, y_test = embeddings[test_index], labels_np[test_index]
+      model = clone(self._model)
+      jobs.append(delayed(_fit_and_score)(model, X_train, y_train, X_test, y_test))
+    results = Parallel(n_jobs=-1)(jobs)
+
+    y_test = np.concatenate([y_test for y_test, _ in results], axis=0)
+    y_pred = np.concatenate([y_pred for _, y_pred in results], axis=0)
+    if len(set(y_test)) < 2:
+      return None, 0.5
+    roc_auc_val = roc_auc_score(y_test, y_pred)
+    precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
+    numerator = (1 + F_BETA_WEIGHT**2) * precision * recall
+    denom = (F_BETA_WEIGHT**2 * precision) + recall
+    f1_scores = np.divide(numerator, denom, out=np.zeros_like(denom), where=(denom != 0))
+    max_f1: float = np.max(f1_scores)
+    max_f1_index = np.argmax(f1_scores)
+    max_f1_thresh: float = thresholds[max_f1_index]
+    max_f1_prec: float = precision[max_f1_index]
+    max_f1_recall: float = recall[max_f1_index]
+    metrics = ConceptMetrics(
+      f1=max_f1,
+      precision=max_f1_prec,
+      recall=max_f1_recall,
+      roc_auc=float(roc_auc_val),
+      overall=_get_overall_score(max_f1))
+    return metrics, max_f1_thresh
+
+
+def draft_examples(concept: Concept, draft: DraftId) -> dict[str, Example]:
+  """Get the examples in the provided draft by overriding the main draft."""
+  draft_examples: dict[str, dict[str, Example]] = {}
+  for id, example in concept.data.items():
+    draft_examples.setdefault(example.draft or DRAFT_MAIN, {})[example.id] = example
+
+  if draft == DRAFT_MAIN:
+    return draft_examples.get(DRAFT_MAIN, {})
+
+  if draft not in draft_examples:
+    raise ValueError(
+      f'Draft {draft} not found in concept. Found drafts: {list(draft_examples.keys())}')
+
+  # Map the text of the draft to its id so we can dedupe with main.
+  draft_text_ids = {example.text: id for id, example in draft_examples[draft].items()}
+
+  # Write each of examples from main to the draft examples only if the text does not appear in the
+  # draft.
+  for id, example in draft_examples[DRAFT_MAIN].items():
+    if example.text not in draft_text_ids:
+      draft_examples[draft][id] = example
+
+  return draft_examples[draft]
+
+
+@dataclasses.dataclass
+class ConceptModel:
+  """A concept model. Stores all concept model drafts and manages syncing."""
+  # The concept that this model is for.
+  namespace: str
+  concept_name: str
+
+  # The name of the embedding for this model.
+  embedding_name: str
+  version: int = 0
+
+  batch_size = 4096
+
+  # The following fields are excluded from JSON serialization, but still pickle-able.
+  # Maps a concept id to the embeddings.
+  _embeddings: dict[str, np.ndarray] = dataclasses.field(default_factory=dict)
+  _logistic_models: dict[DraftId, LogisticEmbeddingModel] = dataclasses.field(default_factory=dict)
+
+  def get_metrics(self) -> Optional[ConceptMetrics]:
+    """Return the metrics for this model."""
+    return self._get_logistic_model(DRAFT_MAIN)._metrics
+
+  def score_embeddings(self, draft: DraftId, embeddings: np.ndarray) -> np.ndarray:
+    """Get the scores for the provided embeddings."""
+    return self._get_logistic_model(draft).score_embeddings(embeddings)
+
+  def coef(self, draft: DraftId) -> np.ndarray:
+    """Get the coefficients of the underlying ML model."""
+    return self._get_logistic_model(draft)._model.coef_.reshape(-1)
+
+  def _get_logistic_model(self, draft: DraftId) -> LogisticEmbeddingModel:
+    """Get the logistic model for the provided draft."""
+    if draft not in self._logistic_models:
+      self._logistic_models[draft] = LogisticEmbeddingModel()
+    return self._logistic_models[draft]
+
+  def sync(self, concept: Concept) -> bool:
+    """Update the model with the latest labeled concept data."""
+    if concept.version == self.version:
+      # The model is up to date.
+      return False
+
+    concept_path = (f'{self.namespace}/{self.concept_name}/'
+                    f'{self.embedding_name}')
+    with DebugTimer(f'Computing embeddings for "{concept_path}"'):
+      self._compute_embeddings(concept)
+
+    # Fit each of the drafts, sort by draft name for deterministic behavior.
+    for draft in concept.drafts():
+      examples = draft_examples(concept, draft)
+      embeddings = np.array([self._embeddings[id] for id in examples.keys()])
+      labels = [example.label for example in examples.values()]
+      model = self._get_logistic_model(draft)
+      with DebugTimer(f'Fitting model for "{concept_path}"'):
+        model.fit(embeddings, labels)
+
+    # Synchronize the model version with the concept version.
+    self.version = concept.version
+
+    return True
+
+  def _compute_embeddings(self, concept: Concept) -> None:
+    signal_cls = get_signal_cls(self.embedding_name)
+    if not signal_cls:
+      raise ValueError(f'Embedding signal "{self.embedding_name}" not found in the registry.')
+    embedding_signal = signal_cls()
+    if not isinstance(embedding_signal, TextEmbeddingSignal):
+      raise ValueError(f'Only text embedding signals are currently supported for concepts. '
+                       f'"{self.embedding_name}" is a {type(embedding_signal)}.')
+
+    embed_fn = get_embed_fn(self.embedding_name, split=False)
+    concept_embeddings: dict[str, np.ndarray] = {}
+
+    examples = concept.data.items()
+    if not examples:
+      raise ValueError(f'Cannot sync concept "{concept.concept_name}". It has no examples.')
+
+    # Compute the embeddings for the examples with cache miss.
+    texts_of_missing_embeddings: dict[str, str] = {}
+    for id, example in examples:
+      if id in self._embeddings:
+        # Cache hit.
+        concept_embeddings[id] = self._embeddings[id]
+      else:
+        # Cache miss.
+        # TODO(smilkov): Support images.
+        texts_of_missing_embeddings[id] = example.text or ''
+
+    missing_ids = texts_of_missing_embeddings.keys()
+    missing_embeddings = embed_fn(list(texts_of_missing_embeddings.values()))
+
+    for id, (embedding,) in zip(missing_ids, missing_embeddings):
+      concept_embeddings[id] = embedding['vector'] / np.linalg.norm(embedding['vector'])
+    self._embeddings = concept_embeddings
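A subtle detail in the file above: score_embeddings does not return raw probabilities. interp1d maps [0, threshold, 1] onto [0, ~0.5, 1], so a score above 0.5 always means "above the tuned threshold" no matter where the F-beta sweep put it. A standalone illustration of that mapping, with 0.8 as a hypothetical threshold:

import numpy as np
from scipy.interpolate import interp1d

threshold = 0.8  # hypothetical max-F-beta threshold from cross validation
interpolate_fn = interp1d([0, threshold, 1], [0, 0.4999, 1])

y_probs = np.array([0.2, 0.8, 0.9])
# -> approximately [0.125, 0.4999, 0.75]: the raw 0.8 lands at ~0.5.
print(interpolate_fn(y_probs))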
lilac/concepts/db_concept.py
ADDED
@@ -0,0 +1,567 @@
+"""The concept database."""
+
+import abc
+import glob
+import json
+import os
+import pathlib
+import pickle
+import shutil
+import threading
+
+# NOTE: We have to import the module for uuid so it can be mocked.
+import uuid
+from importlib import resources
+from typing import Any, List, Optional, Union, cast
+
+from pydantic import BaseModel
+from typing_extensions import override
+
+from ..auth import ConceptAuthorizationException, UserInfo
+from ..env import data_path, env
+from ..schema import SignalInputType
+from ..signal import get_signal_cls
+from ..utils import delete_file, file_exists, get_lilac_cache_dir, open_file
+from .concept import DRAFT_MAIN, Concept, ConceptModel, ConceptType, DraftId, Example, ExampleIn
+
+CONCEPTS_DIR = 'concept'
+CONCEPT_JSON_FILENAME = 'concept.json'
+# Under 'lilac' package.
+LILAC_CONCEPTS_DIR = 'concepts'
+
+
+class ConceptNamespaceACL(BaseModel):
+  """The access control list for a namespace."""
+  # Whether the current user can read concepts in the namespace.
+  read: bool
+  # Whether the current user can add concepts to the namespace.
+  write: bool
+
+
+class ConceptACL(BaseModel):
+  """The access control list for an individual concept."""
+  # Whether the current user can read the concept.
+  read: bool
+  # Whether the current user can edit the concept, including adding examples or deleting the
+  # concept.
+  write: bool
+
+
+class ConceptInfo(BaseModel):
+  """Information about a concept."""
+  namespace: str
+  name: str
+  description: Optional[str] = None
+  type: ConceptType
+  drafts: list[DraftId]
+  tags: list[str] = []
+
+  acls: ConceptACL
+
+
+class ConceptUpdate(BaseModel):
+  """An update to a concept."""
+  # List of examples to be inserted.
+  insert: Optional[list[ExampleIn]] = []
+
+  # List of examples to be updated.
+  update: Optional[list[Example]] = []
+
+  # The ids of the examples to be removed.
+  remove: Optional[list[str]] = []
+
+
+class ConceptDB(abc.ABC):
+  """Interface for the concept database."""
+
+  @abc.abstractmethod
+  def list(self, user: Optional[UserInfo] = None) -> list[ConceptInfo]:
+    """List all the concepts."""
+    pass
+
+  @abc.abstractmethod
+  def namespace_acls(self, namespace: str, user: Optional[UserInfo] = None) -> ConceptNamespaceACL:
+    """Return the ACL for a namespace."""
+    pass
+
+  @abc.abstractmethod
+  def concept_acls(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> ConceptACL:
+    """Return the ACL for a concept."""
+    pass
+
+  @abc.abstractmethod
+  def get(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> Optional[Concept]:
+    """Return a concept or None if there isn't one."""
+    pass
+
+  @abc.abstractmethod
+  def create(self,
+             namespace: str,
+             name: str,
+             type: Union[ConceptType, str],
+             description: Optional[str] = None,
+             user: Optional[UserInfo] = None) -> Concept:
+    """Create a concept.
+
+    Args:
+      namespace: The namespace of the concept.
+      name: The name of the concept.
+      type: The type of the concept.
+      description: The description of the concept.
+      user: The user creating the concept, if authentication is enabled.
+    """
+    pass
+
+  @abc.abstractmethod
+  def edit(self,
+           namespace: str,
+           name: str,
+           change: ConceptUpdate,
+           user: Optional[UserInfo] = None) -> Concept:
+    """Edit a concept. If the concept doesn't exist, throw an error."""
+    pass
+
+  @abc.abstractmethod
+  def remove(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> None:
+    """Remove a concept."""
+    pass
+
+  @abc.abstractmethod
+  def merge_draft(self,
+                  namespace: str,
+                  name: str,
+                  draft: DraftId,
+                  user: Optional[UserInfo] = None) -> Concept:
+    """Merge a draft concept.."""
+    pass
+
+
+class ConceptModelDB(abc.ABC):
+  """Interface for the concept model database."""
+
+  _concept_db: ConceptDB
+  _sync_lock = threading.Lock()
+
+  def __init__(self, concept_db: ConceptDB) -> None:
+    self._concept_db = concept_db
+
+  @abc.abstractmethod
+  def create(self,
+             namespace: str,
+             concept_name: str,
+             embedding_name: str,
+             user: Optional[UserInfo] = None) -> ConceptModel:
+    """Create the concept model."""
+    pass
114 |
+
|
115 |
+
@abc.abstractmethod
|
116 |
+
def edit(self,
|
117 |
+
namespace: str,
|
118 |
+
name: str,
|
119 |
+
change: ConceptUpdate,
|
120 |
+
user: Optional[UserInfo] = None) -> Concept:
|
121 |
+
"""Edit a concept. If the concept doesn't exist, throw an error."""
|
122 |
+
pass
|
123 |
+
|
124 |
+
@abc.abstractmethod
|
125 |
+
def remove(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> None:
|
126 |
+
"""Remove a concept."""
|
127 |
+
pass
|
128 |
+
|
129 |
+
@abc.abstractmethod
|
130 |
+
def merge_draft(self,
|
131 |
+
namespace: str,
|
132 |
+
name: str,
|
133 |
+
draft: DraftId,
|
134 |
+
user: Optional[UserInfo] = None) -> Concept:
|
135 |
+
"""Merge a draft concept.."""
|
136 |
+
pass
|
137 |
+
|
138 |
+
|
139 |
+
class ConceptModelDB(abc.ABC):
|
140 |
+
"""Interface for the concept model database."""
|
141 |
+
|
142 |
+
_concept_db: ConceptDB
|
143 |
+
_sync_lock = threading.Lock()
|
144 |
+
|
145 |
+
def __init__(self, concept_db: ConceptDB) -> None:
|
146 |
+
self._concept_db = concept_db
|
147 |
+
|
148 |
+
@abc.abstractmethod
|
149 |
+
def create(self,
|
150 |
+
namespace: str,
|
151 |
+
concept_name: str,
|
152 |
+
embedding_name: str,
|
153 |
+
user: Optional[UserInfo] = None) -> ConceptModel:
|
154 |
+
"""Create the concept model."""
|
155 |
+
pass
|
156 |
+
|
157 |
+
@abc.abstractmethod
|
158 |
+
def get(self,
|
159 |
+
namespace: str,
|
160 |
+
concept_name: str,
|
161 |
+
embedding_name: str,
|
162 |
+
user: Optional[UserInfo] = None) -> Optional[ConceptModel]:
|
163 |
+
"""Get the model associated with the provided concept the embedding.
|
164 |
+
|
165 |
+
Returns None if the model does not exist.
|
166 |
+
"""
|
167 |
+
pass
|
168 |
+
|
169 |
+
@abc.abstractmethod
|
170 |
+
def _save(self, model: ConceptModel) -> None:
|
171 |
+
"""Save the concept model."""
|
172 |
+
pass
|
173 |
+
|
174 |
+
def in_sync(self, model: ConceptModel, user: Optional[UserInfo] = None) -> bool:
|
175 |
+
"""Return True if the model is up to date with the concept."""
|
176 |
+
concept = self._concept_db.get(model.namespace, model.concept_name, user=user)
|
177 |
+
if not concept:
|
178 |
+
raise ValueError(f'Concept "{model.namespace}/{model.concept_name}" does not exist.')
|
179 |
+
return concept.version == model.version
|
180 |
+
|
181 |
+
def sync(self,
|
182 |
+
namespace: str,
|
183 |
+
concept_name: str,
|
184 |
+
embedding_name: str,
|
185 |
+
user: Optional[UserInfo] = None,
|
186 |
+
create: bool = False) -> ConceptModel:
|
187 |
+
"""Sync the concept model. Returns true if the model was updated."""
|
188 |
+
with self._sync_lock:
|
189 |
+
model = self.get(namespace, concept_name, embedding_name, user=user)
|
190 |
+
if not model:
|
191 |
+
if create:
|
192 |
+
model = self.create(namespace, concept_name, embedding_name, user=user)
|
193 |
+
else:
|
194 |
+
raise ValueError(f'Model "{namespace}/{concept_name}/{embedding_name}" does not exist.')
|
195 |
+
|
196 |
+
concept = self._concept_db.get(model.namespace, model.concept_name, user=user)
|
197 |
+
if not concept:
|
198 |
+
raise ValueError(f'Concept "{model.namespace}/{model.concept_name}" does not exist.')
|
199 |
+
model_updated = model.sync(concept)
|
200 |
+
if model_updated:
|
201 |
+
self._save(model)
|
202 |
+
return model
|
203 |
+
|
204 |
+
@abc.abstractmethod
|
205 |
+
def remove(self, namespace: str, concept_name: str, embedding_name: str) -> None:
|
206 |
+
"""Remove the model of a concept."""
|
207 |
+
pass
|
208 |
+
|
209 |
+
@abc.abstractmethod
|
210 |
+
def get_models(self, namespace: str, concept_name: str) -> list[ConceptModel]:
|
211 |
+
"""List all the models associated with a concept."""
|
212 |
+
pass
|
213 |
+
|
214 |
+
|
215 |
+
class DiskConceptModelDB(ConceptModelDB):
|
216 |
+
"""Interface for the concept model database."""
|
217 |
+
|
218 |
+
def __init__(self,
|
219 |
+
concept_db: ConceptDB,
|
220 |
+
base_dir: Optional[Union[str, pathlib.Path]] = None) -> None:
|
221 |
+
super().__init__(concept_db)
|
222 |
+
self._base_dir = base_dir
|
223 |
+
|
224 |
+
def _get_base_dir(self) -> str:
|
225 |
+
return str(self._base_dir) if self._base_dir else data_path()
|
226 |
+
|
227 |
+
@override
|
228 |
+
def create(self,
|
229 |
+
namespace: str,
|
230 |
+
concept_name: str,
|
231 |
+
embedding_name: str,
|
232 |
+
user: Optional[UserInfo] = None) -> ConceptModel:
|
233 |
+
if self.get(namespace, concept_name, embedding_name, user=user):
|
234 |
+
raise ValueError('Concept model already exists.')
|
235 |
+
concept = self._concept_db.get(namespace, concept_name, user=user)
|
236 |
+
if not concept:
|
237 |
+
raise ValueError(f'Concept "{namespace}/{concept_name}" does not exist.')
|
238 |
+
model = ConceptModel(
|
239 |
+
namespace=namespace, concept_name=concept_name, embedding_name=embedding_name)
|
240 |
+
self._save(model)
|
241 |
+
return model
|
242 |
+
|
243 |
+
@override
|
244 |
+
def get(self,
|
245 |
+
namespace: str,
|
246 |
+
concept_name: str,
|
247 |
+
embedding_name: str,
|
248 |
+
user: Optional[UserInfo] = None) -> Optional[ConceptModel]:
|
249 |
+
# Make sure the concept exists.
|
250 |
+
concept = self._concept_db.get(namespace, concept_name, user=user)
|
251 |
+
if not concept:
|
252 |
+
raise ValueError(f'Concept "{namespace}/{concept_name}" does not exist.')
|
253 |
+
|
254 |
+
# Make sure that the embedding signal exists.
|
255 |
+
if not get_signal_cls(embedding_name):
|
256 |
+
raise ValueError(f'Embedding signal "{embedding_name}" not found in the registry.')
|
257 |
+
|
258 |
+
concept_model_path = _concept_model_path(self._get_base_dir(), namespace, concept_name,
|
259 |
+
embedding_name)
|
260 |
+
if not file_exists(concept_model_path):
|
261 |
+
return None
|
262 |
+
|
263 |
+
with open_file(concept_model_path, 'rb') as f:
|
264 |
+
return pickle.load(f)
|
265 |
+
|
266 |
+
def _save(self, model: ConceptModel) -> None:
|
267 |
+
"""Save the concept model."""
|
268 |
+
concept_model_path = _concept_model_path(self._get_base_dir(), model.namespace,
|
269 |
+
model.concept_name, model.embedding_name)
|
270 |
+
with open_file(concept_model_path, 'wb') as f:
|
271 |
+
pickle.dump(model, f)
|
272 |
+
|
273 |
+
@override
|
274 |
+
def remove(self,
|
275 |
+
namespace: str,
|
276 |
+
concept_name: str,
|
277 |
+
embedding_name: str,
|
278 |
+
user: Optional[UserInfo] = None) -> None:
|
279 |
+
concept_model_path = _concept_model_path(self._get_base_dir(), namespace, concept_name,
|
280 |
+
embedding_name)
|
281 |
+
|
282 |
+
if not file_exists(concept_model_path):
|
283 |
+
raise ValueError(f'Concept model {namespace}/{concept_name}/{embedding_name} does not exist.')
|
284 |
+
|
285 |
+
delete_file(concept_model_path)
|
286 |
+
|
287 |
+
@override
|
288 |
+
def get_models(self,
|
289 |
+
namespace: str,
|
290 |
+
concept_name: str,
|
291 |
+
user: Optional[UserInfo] = None) -> list[ConceptModel]:
|
292 |
+
"""List all the models associated with a concept."""
|
293 |
+
model_files = glob.iglob(
|
294 |
+
os.path.join(_concept_cache_dir(self._get_base_dir(), namespace, concept_name), '*.pkl'))
|
295 |
+
models: list[ConceptModel] = []
|
296 |
+
for model_file in model_files:
|
297 |
+
embedding_name = os.path.basename(model_file)[:-len('.pkl')]
|
298 |
+
model = self.get(namespace, concept_name, embedding_name, user=user)
|
299 |
+
if model:
|
300 |
+
models.append(model)
|
301 |
+
return models
|
302 |
+
|
303 |
+
|
304 |
+
def get_concept_output_dir(base_dir: str, namespace: str, name: str) -> str:
|
305 |
+
"""Return the output directory for a given concept."""
|
306 |
+
if namespace == 'lilac':
|
307 |
+
# Lilac concepts are stored in the resources directory and shipped with the pip package.
|
308 |
+
return str(resources.files('lilac').joinpath(os.path.join(LILAC_CONCEPTS_DIR, name)))
|
309 |
+
|
310 |
+
return os.path.join(base_dir, CONCEPTS_DIR, namespace, name)
|
311 |
+
|
312 |
+
|
313 |
+
def _concept_json_path(base_dir: str, namespace: str, name: str) -> str:
|
314 |
+
return os.path.join(get_concept_output_dir(base_dir, namespace, name), CONCEPT_JSON_FILENAME)
|
315 |
+
|
316 |
+
|
317 |
+
def _concept_cache_dir(base_dir: str, namespace: str, concept_name: str) -> str:
|
318 |
+
return os.path.join(get_lilac_cache_dir(base_dir), CONCEPTS_DIR, namespace, concept_name)
|
319 |
+
|
320 |
+
|
321 |
+
def _concept_model_path(base_dir: str, namespace: str, concept_name: str,
|
322 |
+
embedding_name: str) -> str:
|
323 |
+
|
324 |
+
return os.path.join(
|
325 |
+
_concept_cache_dir(base_dir, namespace, concept_name), f'{embedding_name}.pkl')
|
326 |
+
|
327 |
+
|
328 |
+
class DiskConceptDB(ConceptDB):
|
329 |
+
"""A concept database."""
|
330 |
+
|
331 |
+
def __init__(self, base_dir: Optional[Union[str, pathlib.Path]] = None) -> None:
|
332 |
+
self._base_dir = base_dir
|
333 |
+
|
334 |
+
def _get_base_dir(self) -> str:
|
335 |
+
return str(self._base_dir) if self._base_dir else data_path()
|
336 |
+
|
337 |
+
@override
|
338 |
+
def namespace_acls(self, namespace: str, user: Optional[UserInfo] = None) -> ConceptNamespaceACL:
|
339 |
+
if not env('LILAC_AUTH_ENABLED'):
|
340 |
+
return ConceptNamespaceACL(read=True, write=True)
|
341 |
+
|
342 |
+
if namespace == 'lilac':
|
343 |
+
return ConceptNamespaceACL(read=True, write=False)
|
344 |
+
if user and user.id == namespace:
|
345 |
+
return ConceptNamespaceACL(read=True, write=True)
|
346 |
+
|
347 |
+
return ConceptNamespaceACL(read=False, write=False)
|
348 |
+
|
349 |
+
@override
|
350 |
+
def concept_acls(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> ConceptACL:
|
351 |
+
namespace_acls = self.namespace_acls(namespace, user=user)
|
352 |
+
# Concept ACL inherit from the namespace ACL. We currently don't have concept-specific
|
353 |
+
# ACL.
|
354 |
+
return ConceptACL(read=namespace_acls.read, write=namespace_acls.write)
|
355 |
+
|
356 |
+
@override
|
357 |
+
def list(self, user: Optional[UserInfo] = None) -> list[ConceptInfo]:
|
358 |
+
namespaces: Optional[list[str]] = None
|
359 |
+
if env('LILAC_AUTH_ENABLED'):
|
360 |
+
namespaces = ['lilac']
|
361 |
+
if user:
|
362 |
+
namespaces += [user.id]
|
363 |
+
|
364 |
+
concept_infos: list[ConceptInfo] = []
|
365 |
+
|
366 |
+
namespace_concept_dirs: list[tuple[Optional[str], str]] = [
|
367 |
+
# None = Read the namespace from the directory.
|
368 |
+
(None, os.path.join(self._get_base_dir(), CONCEPTS_DIR)),
|
369 |
+
# Read lilac concepts from the resources directory.
|
370 |
+
('lilac', str(resources.files('lilac').joinpath(LILAC_CONCEPTS_DIR)))
|
371 |
+
]
|
372 |
+
|
373 |
+
for (default_namespace, concept_dir) in namespace_concept_dirs:
|
374 |
+
# Read the concepts from the data dir and return a ConceptInfo containing the namespace and
|
375 |
+
# name.
|
376 |
+
for root, _, files in os.walk(concept_dir):
|
377 |
+
for file in files:
|
378 |
+
if file == CONCEPT_JSON_FILENAME:
|
379 |
+
namespace, name = root.split('/')[-2:]
|
380 |
+
if default_namespace is not None:
|
381 |
+
namespace = default_namespace
|
382 |
+
if namespaces and namespace not in namespaces:
|
383 |
+
# Ignore concepts that are not in the namespace, if provided.
|
384 |
+
continue
|
385 |
+
|
386 |
+
concept = cast(Concept, self.get(namespace, name, user=user))
|
387 |
+
concept_infos.append(
|
388 |
+
_info_from_concept(concept, self.concept_acls(namespace, name, user=user)))
|
389 |
+
|
390 |
+
return concept_infos
|
391 |
+
|
392 |
+
@override
|
393 |
+
def get(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> Optional[Concept]:
|
394 |
+
# If the user does not have access to the concept, return None.
|
395 |
+
acls = self.concept_acls(namespace, name, user=user)
|
396 |
+
if not acls.read:
|
397 |
+
raise ConceptAuthorizationException(
|
398 |
+
f'Concept "{namespace}/{name}" does not exist or user does not have access.')
|
399 |
+
|
400 |
+
concept_json_path = _concept_json_path(self._get_base_dir(), namespace, name)
|
401 |
+
if not file_exists(concept_json_path):
|
402 |
+
return None
|
403 |
+
|
404 |
+
with open_file(concept_json_path) as f:
|
405 |
+
obj: dict[str, Any] = json.load(f)
|
406 |
+
if 'namespace' not in obj:
|
407 |
+
obj['namespace'] = namespace
|
408 |
+
return Concept.parse_obj(obj)
|
409 |
+
|
410 |
+
@override
|
411 |
+
def create(self,
|
412 |
+
namespace: str,
|
413 |
+
name: str,
|
414 |
+
type: Union[ConceptType, str] = ConceptType.TEXT,
|
415 |
+
description: Optional[str] = None,
|
416 |
+
user: Optional[UserInfo] = None) -> Concept:
|
417 |
+
"""Create a concept."""
|
418 |
+
# If the user does not have access to the write to the concept namespace, throw.
|
419 |
+
acls = self.namespace_acls(namespace, user=user)
|
420 |
+
if not acls.write:
|
421 |
+
raise ConceptAuthorizationException(
|
422 |
+
f'Concept namespace "{namespace}" does not exist or user does not have access.')
|
423 |
+
|
424 |
+
concept_json_path = _concept_json_path(self._get_base_dir(), namespace, name)
|
425 |
+
if file_exists(concept_json_path):
|
426 |
+
raise ValueError(f'Concept with namespace "{namespace}" and name "{name}" already exists.')
|
427 |
+
|
428 |
+
if isinstance(type, str):
|
429 |
+
type = ConceptType(type)
|
430 |
+
concept = Concept(
|
431 |
+
namespace=namespace, concept_name=name, type=type, data={}, description=description)
|
432 |
+
self._save(concept)
|
433 |
+
return concept
|
434 |
+
|
435 |
+
def _validate_examples(self, examples: List[Union[ExampleIn, Example]],
|
436 |
+
type: ConceptType) -> None:
|
437 |
+
for example in examples:
|
438 |
+
inferred_type = 'text' if example.text else 'unknown'
|
439 |
+
if inferred_type != type:
|
440 |
+
raise ValueError(f'Example type "{inferred_type}" does not match concept type "{type}".')
|
441 |
+
|
442 |
+
@override
|
443 |
+
def edit(self,
|
444 |
+
namespace: str,
|
445 |
+
name: str,
|
446 |
+
change: ConceptUpdate,
|
447 |
+
user: Optional[UserInfo] = None) -> Concept:
|
448 |
+
# If the user does not have access to the concept, return None.
|
449 |
+
acls = self.concept_acls(namespace, name, user=user)
|
450 |
+
if not acls.write:
|
451 |
+
raise ConceptAuthorizationException(
|
452 |
+
f'Concept "{namespace}/{name}" does not exist or user does not have access.')
|
453 |
+
|
454 |
+
concept_json_path = _concept_json_path(self._get_base_dir(), namespace, name)
|
455 |
+
|
456 |
+
if not file_exists(concept_json_path):
|
457 |
+
raise ValueError(f'Concept with namespace "{namespace}" and name "{name}" does not exist. '
|
458 |
+
'Please call create() first.')
|
459 |
+
|
460 |
+
inserted_points = change.insert or []
|
461 |
+
updated_points = change.update or []
|
462 |
+
removed_points = change.remove or []
|
463 |
+
|
464 |
+
concept = cast(Concept, self.get(namespace, name, user=user))
|
465 |
+
|
466 |
+
self._validate_examples([*inserted_points, *updated_points], concept.type)
|
467 |
+
|
468 |
+
for remove_example in removed_points:
|
469 |
+
if remove_example not in concept.data:
|
470 |
+
raise ValueError(f'Example with id "{remove_example}" does not exist.')
|
471 |
+
concept.data.pop(remove_example)
|
472 |
+
|
473 |
+
for example in inserted_points:
|
474 |
+
id = uuid.uuid4().hex
|
475 |
+
concept.data[id] = Example(id=id, **example.dict())
|
476 |
+
|
477 |
+
for example in updated_points:
|
478 |
+
if example.id not in concept.data:
|
479 |
+
raise ValueError(f'Example with id "{example.id}" does not exist.')
|
480 |
+
|
481 |
+
# Remove the old example and make a new one with a new id to keep it functional.
|
482 |
+
concept.data.pop(example.id)
|
483 |
+
concept.data[example.id] = example.copy()
|
484 |
+
|
485 |
+
concept.version += 1
|
486 |
+
|
487 |
+
self._save(concept)
|
488 |
+
|
489 |
+
return concept
|
490 |
+
|
491 |
+
def _save(self, concept: Concept) -> None:
|
492 |
+
concept_json_path = _concept_json_path(self._get_base_dir(), concept.namespace,
|
493 |
+
concept.concept_name)
|
494 |
+
with open_file(concept_json_path, 'w') as f:
|
495 |
+
f.write(concept.json(exclude_none=True, indent=2, exclude_defaults=True))
|
496 |
+
|
497 |
+
@override
|
498 |
+
def remove(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> None:
|
499 |
+
# If the user does not have access to the concept, return None.
|
500 |
+
acls = self.concept_acls(namespace, name, user=user)
|
501 |
+
if not acls.write:
|
502 |
+
raise ConceptAuthorizationException(
|
503 |
+
f'Concept "{namespace}/{name}" does not exist or user does not have access.')
|
504 |
+
|
505 |
+
concept_dir = get_concept_output_dir(self._get_base_dir(), namespace, name)
|
506 |
+
|
507 |
+
if not file_exists(concept_dir):
|
508 |
+
raise ValueError(f'Concept with namespace "{namespace}" and name "{name}" does not exist.')
|
509 |
+
|
510 |
+
shutil.rmtree(concept_dir, ignore_errors=True)
|
511 |
+
|
512 |
+
@override
|
513 |
+
def merge_draft(self,
|
514 |
+
namespace: str,
|
515 |
+
name: str,
|
516 |
+
draft: DraftId,
|
517 |
+
user: Optional[UserInfo] = None) -> Concept:
|
518 |
+
"""Merge a draft concept."""
|
519 |
+
# If the user does not have access to the concept, return None.
|
520 |
+
acls = self.concept_acls(namespace, name, user=user)
|
521 |
+
if not acls.write:
|
522 |
+
raise ConceptAuthorizationException(
|
523 |
+
f'Concept "{namespace}/{name}" does not exist or user does not have access.')
|
524 |
+
|
525 |
+
concept = self.get(namespace, name, user=user)
|
526 |
+
if not concept:
|
527 |
+
raise ValueError(f'Concept with namespace "{namespace}" and name "{name}" does not exist.')
|
528 |
+
|
529 |
+
if draft == DRAFT_MAIN:
|
530 |
+
return concept
|
531 |
+
|
532 |
+
# Map the text of examples in main so we can remove them if they are duplicates.
|
533 |
+
main_text_ids: dict[Optional[str], str] = {
|
534 |
+
example.text: id for id, example in concept.data.items() if example.draft == DRAFT_MAIN
|
535 |
+
}
|
536 |
+
|
537 |
+
draft_examples: dict[str, Example] = {
|
538 |
+
id: example for id, example in concept.data.items() if example.draft == draft
|
539 |
+
}
|
540 |
+
for example in draft_examples.values():
|
541 |
+
example.draft = DRAFT_MAIN
|
542 |
+
# Remove duplicates in main.
|
543 |
+
main_text_id = main_text_ids.get(example.text)
|
544 |
+
if main_text_id:
|
545 |
+
del concept.data[main_text_id]
|
546 |
+
|
547 |
+
concept.version += 1
|
548 |
+
|
549 |
+
self._save(concept)
|
550 |
+
|
551 |
+
return concept
|
552 |
+
|
553 |
+
|
554 |
+
def _info_from_concept(concept: Concept, acls: ConceptACL) -> ConceptInfo:
|
555 |
+
return ConceptInfo(
|
556 |
+
namespace=concept.namespace,
|
557 |
+
name=concept.concept_name,
|
558 |
+
description=concept.description,
|
559 |
+
type=SignalInputType.TEXT,
|
560 |
+
drafts=concept.drafts(),
|
561 |
+
tags=concept.tags,
|
562 |
+
acls=acls)
|
563 |
+
|
564 |
+
|
565 |
+
# A singleton concept database.
|
566 |
+
DISK_CONCEPT_DB = DiskConceptDB()
|
567 |
+
DISK_CONCEPT_MODEL_DB = DiskConceptModelDB(DISK_CONCEPT_DB)
|
lilac/concepts/legal-termination/concept.json ADDED @@ -0,0 +1,185 @@
{
  "namespace": "lilac",
  "concept_name": "legal-termination",
  "type": "text",
  "tags": ["legal"],
  "data": {
    "731b1338cf1949958c3526c555f88058": {
      "label": true,
      "text": "In the event that any provision of this agreement is found to be unenforceable, the remaining provisions shall continue to be valid and binding.",
      "id": "731b1338cf1949958c3526c555f88058"
    },
    "99a20e547e38474dbc24507a1658d0c9": {
      "label": true,
      "text": "The parties agree that in the event of a natural disaster or other unforeseen event, both parties will make reasonable efforts to fulfill their obligations under this contract.",
      "id": "99a20e547e38474dbc24507a1658d0c9"
    },
    "3f27b47c526a4c5896a0a100024535c7": {
      "label": true,
      "text": "If any party breaches the terms of this agreement, the non-breaching party shall have the right to seek legal remedies.",
      "id": "3f27b47c526a4c5896a0a100024535c7"
    },
    "d403dbb1ab9c4594bc7f7dcb0ad5b333": {
      "label": true,
      "text": "This lease agreement shall survive the termination or expiration of the lease term, and continue to be binding upon the parties.",
      "id": "d403dbb1ab9c4594bc7f7dcb0ad5b333"
    },
    "b7deba9f7e80444abe14448f53f45c43": {
      "label": true,
      "text": "In the event of a dispute arising from this contract, the parties agree to first attempt to resolve the dispute through mediation before pursuing any legal action.",
      "id": "b7deba9f7e80444abe14448f53f45c43"
    },
    "a82231b490174e62aad733cb0c75024d": {
      "label": true,
      "text": "This Agreement may be terminated, and the transactions contemplated hereby may be abandoned, at any time prior to the Effective Time, whether prior to or after the Company Stockholders' Approval:",
      "id": "a82231b490174e62aad733cb0c75024d"
    },
    "160b25dbf14e4759a0065bbd652ce33f": {
      "label": true,
      "text": "This Agreement may be terminated and abandoned at any time prior to the Effective Time of the Merger, whether before or after the Company Stockholder Approval:",
      "id": "160b25dbf14e4759a0065bbd652ce33f"
    },
    "8f5f9f96b16441228bb0c9b8a14c4e25": {
      "label": false,
      "text": "any jurisdiction, then such provision shall, as to such jurisdiction, be modified or restricted to the extent necessary to make such provision valid, binding and enforceable, or if such provision cannot be so modified or restricted, then such provision shall, as to such jurisdiction, be deemed to be excised from this Agreement; provided, however, that the legality, binding effect and",
      "id": "8f5f9f96b16441228bb0c9b8a14c4e25"
    },
    "87b6c31b04a346b4a3e0da8d2cc5a7ac": {
      "label": true,
      "text": "This Agreement shall terminate automatically without any further action by any party hereto upon the earliest to occur of (a) the Effective Time of the Merger, (b) the termination of the Merger Agreement in accordance with its terms and (c) any amendment or other modification of the Merger Agreement that reduces the amount of the Merger Consideration or provides that the Merger Consideration shall",
      "id": "87b6c31b04a346b4a3e0da8d2cc5a7ac"
    },
    "985344f7ecfb41f4a69ba101973221a1": {
      "label": false,
      "text": " During the Employment Period, the Corporation shall pay ----------- the Executive a base salary which, as of the commencement of the Employment Period, shall be at an annual rate of Two Hundred Fifty Thousand Dollars ($250,000). The base salary shall be payable in equal periodic installments which are not less frequent than the periodic installments in effect for salaries of other senior",
      "id": "985344f7ecfb41f4a69ba101973221a1"
    },
    "5d53ff48376046fdab41e95c7f4bad54": {
      "label": true,
      "text": "This Agreement may be terminated at any time prior to the Closing Date solely:",
      "id": "5d53ff48376046fdab41e95c7f4bad54"
    },
    "bdeb785be2154b21b4eb052466fa9bcb": {
      "label": true,
      "text": "(a) This Agreement may be terminated by you by notice to the Company at any time prior to the Closing Date if any of the following has occurred: (i) since the respective dates as of which information is given in the Registration Statement and the Prospectus, any material adverse change or any development involving a prospective material adverse change in or affecting the earnings, busi ness,",
      "id": "bdeb785be2154b21b4eb052466fa9bcb"
    },
    "fe6871e9070441f8a9e4b3db26b077d7": {
      "label": true,
      "text": "Section 3(b), this Section 7 and Section 8 of this Agreement shall survive a termination of this Agreement pursuant to (a) or (b) above in this Section 7 until the date that is two years following the date of such termination. Notwithstanding anything else to the contrary contained herein or in the Merger Agreement, if the Effective Time occurs, the representations and warranties contained in",
      "id": "fe6871e9070441f8a9e4b3db26b077d7"
    },
    "bf1a51751d0748e58c344aec8e5fc789": {
      "label": false,
      "text": "This Agreement may be executed in one or more counterparts (including counterparts executed and delivered by facsimile, which shall be as counterparts executed and delivered manually), all of which shall be considered one and the same agreement and shall become effective when one or more counterparts have been signed by each of the parties and delivered to the other party, it being understood that",
      "id": "bf1a51751d0748e58c344aec8e5fc789"
    },
    "bc1b2affa6d848fd92d4dee033e30659": {
      "label": false,
      "text": "would, in your judgment, make it impracticable or inadvisable to market the Units or to enforce contracts for the sale of the Units, (iii) suspension of trading in securities generally on the New York Stock Exchange, the American Stock Exchange or the Nasdaq National Market or limitation on prices (other than limitations on hours or numbers of days of trading) for securities on any such Exchange,",
      "id": "bc1b2affa6d848fd92d4dee033e30659"
    },
    "67a73d5887f74a91bed190ca8f64b17c": {
      "label": false,
      "text": " The authorized capital stock of FM consists of 1,000 shares of Common Stock, no par value each, of which 1,000 shares are issued and outstanding. There are no outstanding or authorized options, warrants, calls, subscriptions, rights (including any preemptive rights or rights of first refusal), agreements or commitments of any character obligating FM to issue any stock or any other Equity",
      "id": "67a73d5887f74a91bed190ca8f64b17c"
    },
    "025b2ca5147849c8a921d9aaa31cd9cd": {
      "label": false,
      "text": "Taxes that are being contested in good faith by appropriate proceedings, provided that Holdings, the Borrower or Restricted Subsidiary, as the case may be, has set aside on its books adequate reserves therefor in accordance with GAAP.",
      "id": "025b2ca5147849c8a921d9aaa31cd9cd"
    },
    "76acff27f13743f4822a094c707d8b75": {
      "label": false,
      "text": "have been a suspension or material limitation in trading in the Company\u2019s common stock on the New York Stock Exchange; (iii) there shall have been a general moratorium on commercial banking activities declared by either federal or New York state authorities or a material disruption in commercial banking or securities settlement or clearance services in the United States; (iv) there shall have been",
      "id": "76acff27f13743f4822a094c707d8b75"
    },
    "b11a95c0eb564445b1a473e90622f861": {
      "label": true,
      "text": "10.1. This Agreement will terminate:",
      "id": "b11a95c0eb564445b1a473e90622f861"
    },
    "d536428a02084d94ba18d412851cb913": {
      "label": false,
      "text": "may not be limited to his Base Salary and that the Employee may receive an annual bonus in the amount, if any, determined annually by the Employer. The Employee shall also participate in employee compensation and benefit plans available generally to executives of the Employer (including, without limitation, any tax-qualified profit sharing plan, nonqualified profit sharing plan, life insurance",
      "id": "d536428a02084d94ba18d412851cb913"
    },
    "368bb1d9c7d0419d9ca58f28565eeb2e": {
      "label": true,
      "text": "This Agreement may be terminated in the absolute discretion of the Representatives, by notice to the Bank, if after execution and delivery of this Agreement and prior to the Closing Date (i) there has been, since the date of this Agreement or since the respective dates as of which information is given in the Registration Statement, the Time of Sale Information or the Prospectus, any material",
      "id": "368bb1d9c7d0419d9ca58f28565eeb2e"
    },
    "1b5fd7b037a84404bf85c858953c79e8": {
      "label": true,
      "text": "however, (i) the right to terminate this Agreement under this Section 8 shall not be available to such Buyer if the failure of the transactions contemplated by this Agreement to have been consummated by such date is the result of such Buyer\u2019s breach of this Agreement and (ii) the abandonment of the sale and purchase of the Notes and the Warrants shall be applicable only to such Buyer providing",
      "id": "1b5fd7b037a84404bf85c858953c79e8"
    },
    "6d5a23d2663f457cab96df03d9dc8ab7": {
      "label": true,
      "text": "In addition, any Stockholder may terminate this Agreement if Weatherford, WEUS, or the Company breaches any representation, warranty, covenant or other agreement contained in the Merger Agreement that (A) would give rise to the failure of Weatherford, WEUS, or the Company to satisfy any condition set forth in Section 8.2(a) thereof, and (B) cannot be or has not been cured within 45 days after the",
      "id": "6d5a23d2663f457cab96df03d9dc8ab7"
    },
    "4a8223a48f83491b9b3eafd7ad37baf9": {
      "label": true,
      "text": "The obligations of the Underwriters hereunder may be terminated by the Representatives, in their absolute discretion, by notice given to and received by the Depositor or the Bank prior to delivery of and payment for the Notes if, prior to that time, any of the events described in Section 5(v) shall have occurred or any of the other conditions described in Section 5 shall not be satisfied.",
      "id": "4a8223a48f83491b9b3eafd7ad37baf9"
    },
    "fbb152eae00c440bb2d0df0fbd82c262": {
      "label": true,
      "text": "Either of the parties hereto may terminate this Agreement by giving to the other party a notice in writing specifying the date of such termination, which shall be not less than 60 days after the date of receipt of such notice. In the event such notice is given by the Customer, it shall be accompanied by a copy of a resolution of the Board of Directors of the Customer, certified by its Secretary,",
      "id": "fbb152eae00c440bb2d0df0fbd82c262"
    },
    "1d21880f426c45ada31409d22815cc87": {
      "label": false,
      "text": "Prospectus or the Final Prospectus (exclusive of any amendment or supplement thereof or thereto after the date hereof).",
      "id": "1d21880f426c45ada31409d22815cc87"
    },
    "795cac72a3504740bc7401a84fc6fba4": {
      "label": true,
      "text": "This Agreement may be terminated by the Customer or the Bank by giving ninety (90) days written notice to the other, provided that such notice to the Bank shall specify the names of the persons to whom the Bank shall deliver the Assets in the Accounts. If notice of termination is given by the Bank, the Customer shall, within ninety (90) days following receipt of the notice, deliver to the Bank Instructions specifying the names of the persons to whom the Bank shall deliver the Assets.",
      "id": "795cac72a3504740bc7401a84fc6fba4"
    },
    "3b82e6eba4894ac0b9f7f12aba2aab2e": {
      "label": false,
      "text": "of this Agreement, or to Authorized Persons, or may continue to hold the Assets until Instructions are provided to the Bank.",
      "id": "3b82e6eba4894ac0b9f7f12aba2aab2e"
    },
    "da16bd0e9dce4d4c87400eab61b9b14c": {
      "label": false,
      "text": "into force of the Convention. In such event, the Convention shall cease to have effect:",
      "id": "da16bd0e9dce4d4c87400eab61b9b14c"
    },
    "02cc328109984db094b0b02caec0d575": {
      "label": true,
      "text": "Survival. The rights and obligations contained in Sections 3 (\u201cOwnership of Work Product\u201d), 4 (\u201cOther Rights\u201d), 5 (\u201cLicense to Preexisting IP\u201d), 6 (\u201cRepresentations and Warranties\u201d), 8 (\u201cConfidential Information\u201d) and 12 (\u201cNon-solicitation\u201d) will survive any termination or expiration of this Agreement. ",
      "id": "02cc328109984db094b0b02caec0d575"
    },
    "f8edf65d9acf4ff4a04459a3492ac426": {
      "label": false,
      "text": "Severability. Should any provisions of this Agreement be held by a court of law to be illegal, invalid or unenforceable, the legality, validity and enforceability of the remaining provisions of this Agreement will not be affected or impaired thereby. ",
      "id": "f8edf65d9acf4ff4a04459a3492ac426"
    },
    "5a8517f359494ead8c11b6aff440480d": {
      "label": false,
      "text": "\u0095\tCommitted to deliver the best, we leave no room for customer grievances.\r\n\r\n",
      "id": "5a8517f359494ead8c11b6aff440480d"
    },
    "a47d327d0f6e46fc861f86b2e0e54a2f": {
      "label": false,
      "text": "the due diligence and using our agreement creator to close the deal successfully. \r",
      "id": "a47d327d0f6e46fc861f86b2e0e54a2f"
    },
    "811d0dcc92e14c5c881e903c7d4ff7b6": {
      "label": false,
      "text": "in accordance with customary procedures in the relevant markets, but in any event for a settlement period no longer than three months following the date of such commitment.",
      "id": "811d0dcc92e14c5c881e903c7d4ff7b6"
    },
    "907f92e0d5704418944a559a4bfb96c7": {
      "label": false,
      "text": "terminate in accordance with Section 2 of the Investors\u2019 Rights Agreement.",
      "id": "907f92e0d5704418944a559a4bfb96c7"
    }
  },
  "version": 33,
  "description": "Termination or survival clause in a legal document"
}
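
This JSON is the shape that DiskConceptDB._save() above writes (concept.json(exclude_none=True, indent=2, exclude_defaults=True)) and that get() parses back with Concept.parse_obj after filling in a missing namespace. As a rough sketch of inspecting the raw file directly (assuming the lilac package is installed so this bundled concept resolves under its package resources, mirroring get_concept_output_dir):

import json
import os
from importlib import resources

# Resolve the bundled concept the same way get_concept_output_dir() does for
# the 'lilac' namespace.
path = resources.files('lilac').joinpath(
  os.path.join('concepts', 'legal-termination', 'concept.json'))

with path.open() as f:
  obj = json.load(f)

# Tally the labeled examples; each entry carries an id, a label and its text.
labels = [example['label'] for example in obj['data'].values()]
print(f"{obj['description']} (version {obj['version']}): "
      f"{sum(labels)} positive / {len(labels) - sum(labels)} negative examples")
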
lilac/concepts/negative-sentiment/concept.json ADDED @@ -0,0 +1,634 @@
{
  "namespace": "lilac",
  "concept_name": "negative-sentiment",
  "type": "text",
  "data": {
    "0": {
      "label": true,
      "text": "Starting To Be Annoyed By Becky...: I'm not sure why I keep reading these books, but I guess it's because I've read the first two so I'll keep reading the rest of the books. In the first book, I really found it amusing. I was a little annoyed by the fact that Becky couldn't stop spending, but then again that's why she is called a Shopaholic. In the second book, I felt more of the same it was just magniifed more. Now in the third book, I'm just down right annoyed by Becky Bloomwood. In this book, she wasn't going on crazy shopping sprees, just planning two different weddings because she was afraid to tell each person and because I feel she's really selfish. Still, I read the book because I wanted to see how she could get herself out of another situation. I will say that I love her friends Suze and Danny, her client Laurel and her husband Luke. Maybe that's why I keep reading. I will read the next book, but I'm sure I'll be just as annoyed when I'm done.",
      "id": "0"
    },
    "1": {
      "label": true,
      "text": "the cover is fine - the pool is horrible: The entire pool was horrible. The cover was fine once we got it on, but we finally got rid of the pool after 2 weeks because it was so hard to set up and keep clean.",
      "id": "1"
    },
    "2": {
      "label": false,
      "text": "Good album, not their best.: This album is probably the most marketable and radio friendly of all of dashboard's albums. For the peripheral listener it may be the right one to get to introduce you to this band. But as a Dashboard fan of 5 or so years I truly hope they return to their original sound for their next work. Not for the listen-ability but for the show. To this day the fans react best to the songs from \"Places\" or \"A Mark, A Mission.\" I recommend this album to everyone but I also recommend any and all of their other work.",
      "id": "2"
    },
    "3": {
      "label": true,
      "text": "This is a horror novel, right?: Never one to pass up any vampire novel, I purchased Sips because the description seemed interesting. Vampires, Marquis de Sade, fetishism, yada yada yada. If this is a comedy, I give it 4 stars; however, I'll give it 1 star as a horror novel. Sade was rather boring; I would think a character as intense and multi-faceted as the Marquis de Sade would make for a more interesting vampire. The writing style isn't too bad, but overall I found the characters to be mildly amusing at best. The plot was thin, the end was anti-climactic, and the vampires were not very frightening. The book had little suspense, and it leaves a mile-wide opening for a sequel at the conclusion. I would, however, like to see something more of the vampire mutants lurking in the graveyard. They were the most riveting of any of the characters.",
      "id": "3"
    },
    "4": {
      "label": false,
      "text": "Superb mix of global non secular musical denominations: I first heard Ms. Pook's music on the \"Eyes Wide Shut\" soundtrack (the masquerade ball scene) and was blown away; if ever there was a necessity for music to permeate a scene in a film this was it. She incorporates a blend of the traditional songs from faiths across continents and mixes them, for lack of a better comparison than similar to your quintessential raver d.j. (though these are better and definitively more original :) \"Oppenheimer\" is my favorite, and if you let the last track run for a few minutes a portion of the song will play once more. I can't wait to hear more of her stuff - these hymns are awesome.",
      "id": "4"
    },
    "5": {
      "label": false,
      "text": "A moving and suspenseful story!: For anyone familiar with the occult, this book is likely to raise hairs on the back of your neck as you read. Even if you're not, the storyline is suspenseful and fascinating, and the characters evoke great sympathy and admiration. An excellent read.",
      "id": "5"
    },
    "6": {
      "label": false,
      "text": "Simple & Easy to Use - A Practical approach to eating out: This guide is extremely to use. It provides sample menus that you'd see at Chinese, Indian and Thai restaurants. Then you are provided with descriptions of each dish and how it is prepared and the ingredients used. From there you are provided with specific considerations as to how the preparation or ingredient list may affect you if you have Gluten or Allergen issues.This book is the size of a passport and very organized and well written. The Chinese, Indian and Thai Cuisine Passport is perfect for making choices while traveling, or while dining at your favorite local restaurant.",
      "id": "6"
    },
    "7": {
      "label": true,
      "text": "Being Fair....I am a fan of hers: and I really enjoyed her previous works, more than I could have ever imagined, but this record is horrible. The songs are trite, the lyrics are incredibly boring, indulgent and immature. The music is pop staple, with forgetable melodies and repetative chorus lines, I feel as if the studio wrote the entire album for her while she was sleeping, this just doesn't speak to any of her previous works at all. This album fits on the same shelf with a Nickelodeon-themed CD. Instead of heading in the direction of an artist like Alanis Morrisette, she is going backward and joining the ranks of Hannah Montana and the Naked Brothers Band. She is a great artist and her first two records are amazing. She is better than this CD and I am looking forward to her next effort.",
      "id": "7"
    },
    "8": {
      "label": true,
      "text": "Sucked: I thought the DVD sucked tremendously. It was very boring and if I could, I would return it for a refund. There was only one \"small\" clip of Dylan himself. I'm very disappointed.",
      "id": "8"
    },
    "9": {
      "label": false,
      "text": "Excellent product: Easy to install. If you have a newer furnace you probably do not need the swail switch as the HE220A comes with a Humistat which can be connected to the furnace. They recommend the Honeywell 32005847-001 Installation Kit, Bypass which is a little pricey and you can probably buy the pieces of this kit cheaper individually from Home Depot or Lowes or ACO as well as the filters.",
      "id": "9"
    },
    "10": {
      "label": false,
      "text": "Very happy.: I am very happy with this trashcan. I was unable to find one in the stores to fit the space in my cabinet, but this one does the job. It is very sturdy and looks like it will put up with years of use.",
      "id": "10"
    },
    "11": {
      "label": true,
      "text": "These aren't Throughbreds!: This makes me so mad. All these new authors are coming and changing the series. Nothings the same anymore and the plots are repeditive. Don't even bother reading these books until #32 these are like a different series. I don't know excactly what's happing but these new authors suck!",
      "id": "11"
    },
    "12": {
      "label": true,
      "text": "Large and slow are a bad combination.: I bought this TV and returned it a week later, because it blurred so badly with motion that sports were unwatchable. I ended up buying a smaller Sony XBR4, and I have none of the issues (plus the picture is far, far better).This has nothing to do with 60 vs 120Hz. That is more important for DVDs and Blu-Ray signals that are 24fps (which doesn't divide evenly into 60 but does for 120). The LT52133 has an 8ms response time, which is extremely slow. A decent LCD should be 5 or lower.If you want an LCD, choose speed and quality over size. If you want size and quality but want to spend less, buy a plasma. Don't buy a big, cheap, slow LCD!I gave it 2 stars because I like the interface and remote.",
      "id": "12"
    },
    "13": {
      "label": true,
      "text": "Skip it: This movie is very poorly written and the result is not distressing, just lame. The actors do their best but from very early on it is obvious that the material gives them nothing to work with. Fans of Colin Firth will experience a certain dim level of enjoyment. Minnie Driver is a treat but her character is no better written than the others. Vermont locations are worth something. With one or two moments of exception it's neither comedic nor romantic.",
      "id": "13"
    },
    "14": {
      "label": false,
      "text": "Belive it i went to the concert?: hi everyone let me tell you i went to the concert i was amazed with what i saw cher was awsome i tell you buy the dvd. as i sat in front of the stage cher was doing a great job to us the she is living proof . So i urge you to buy it?",
      "id": "14"
    },
    "15": {
      "label": false,
      "text": "Vale la pena.: En este libro se narra de una forma muy interesante la vida de una familia en particular. Lo que mas me gusto de este libro fue la manera en que la autora describe a lo largo del libro las personalidades de los sujetos envueltos en la novela; que vienen a ser muy distintos y extremos, lo cual, intensifica el drama... Definitivamente es un buen libro y lo recomiendo a todos.",
      "id": "15"
    },
    "16": {
      "label": false,
      "text": "Nummie Children's story: I ordered this book for our grandchildren. Two boys 5 & 3 and a 4 month old girl. All love the story. The mouse is determined.",
      "id": "16"
    },
    "17": {
      "label": true,
      "text": "Seem to be alone on this one: Looking at the other reviews, I seem to be the only one that was disappointed with this book. The content is too babyish in most of it for older tweens and the more \"grown up\" content would be over a younger tween's head. I had a quick read through and with every paged turned, I thought duh. I'll be looking around for another book shortly.",
      "id": "17"
    },
    "18": {
      "label": false,
      "text": "Best yet: by far the best EA game yet. I especially like the easy controls and kick - a graphics. the playbook is extremely accurate and detailed. Also the fight songs and cheerleaders were a nice touch. this is an excellent game and worth checking out.",
      "id": "18"
    },
    "19": {
      "label": true,
      "text": "washed out: A bit like Simply Reds version of the Valentine bros hit \"Moneys too tight to mention\" - this cover version has nothing of the driving energy that characterised the original recording.",
      "id": "19"
    },
    "20": {
      "label": false,
      "text": "great water bottle: I love this bottle it is great. I like knowing it is non toxic and it just works very well. You can have it full and lay it down and it doesn't leak at all.",
      "id": "20"
    },
    "21": {
      "label": false,
      "text": "Nice goggles: I am pretty happy with these goggles. They work well during swim workouts in the pool. I do notice a little bit of fogging from time to time. I had hoped to wear them during an upcoming triathlon, but based on a few instances where they slipped a little in the pool I am concerned that they won't be secure enough. I will keep using them in the pool, but will likely get different ones for open water races.",
      "id": "21"
    },
    "22": {
      "label": true,
      "text": "aaahhh nnnoooooo!: Hopefully the last film in one of the worst horror trilogys ever made. This series pretty much ruined the horror film for years to come, for one its too self aware, thats incredibley annoying, second everyone acts like they are on Friends or some sitcom. The acting is just plain bad and unconvincing. Now the gore, if you're going with material this weak you should load it up with disgusting violence, is there any in the Scream series? No.Everyone went to see this movie just to see who THE KILLER is. This movie sets low standards to be met, you expect alot of people to die, one shock, then we find out who the killer is, then you go home. Every horror film being made today is like that, there's nothing new or exciting or risk taking, its the same stuff over and over and people are laping it up like dog food.This film is what you've come to expect, bad acting, some people die and we eventually find out who the killer is and all is merry and well. Pathetic.",
      "id": "22"
    },
    "23": {
      "label": false,
      "text": "A classic of its kind: This movie is a classic of its kind and much better that a lot of movies, that followed. It is not one of the best, but it still deserves five stars...",
      "id": "23"
    },
    "24": {
      "label": true,
      "text": "Nice suite, but Virtual PC 7 disappoints on my G5: I purchased the upgrade since I'd already bought both Office v.X and Virtual PC 6.1 last year.The biggest letdown is that Microsoft's promised support for the G5 is nearly non-existent. I have a dual processor G5 with an ATI Radeon 9800 card (Apple), and after trying to install Virtual PC 7 three times, I cannot get a VM to work. It did install (and work) flawlessly on my G4 Powerbook. Googling for reviews finds it's very hit or miss, but if (when) it misses, you'll regret investing the extra $$$ in an immature product.",
      "id": "24"
    },
    "25": {
      "label": true,
      "text": "Okay player, don't expect a miracle: I bought this DVD player at Circuit City earlier this yr for about a $100. I hooked it up to a 47\" Vizio LCD (which by the way has an awesome picture) using a HDMI cable. After fine tuning this product, I was very, very, very diasppointed. The picture was very \"grainy\" (lots of pixels). I have a $35 DVD player that only utilizes an s-video cable that produces a much more crisp picture. Be warned, the picture stinks.",
      "id": "25"
    },
    "26": {
      "label": false,
      "text": "A revelation of the science of consciousness evolution and all natural growth: Here is a readable and fascinating account of the development of the new science of chaos theory, the only body of ideas that describes how the natural world as experienced by human beings emerges out of basic quantum processes. The different explorers and innovators of the new science are introduced in a personable way that will enchant the interested reader.",
      "id": "26"
    },
    "27": {
      "label": true,
      "text": "Don't say that I didn't warn ya' !: I'm absolutely convinced that Delbert McClinton had no controlover the release of this CD. I rated it 1 star simplybecause there is no 0 star rating ! In actuality , I am not certain that the vocalist on this recording IS Delbert McClinton. Only on the Mr. Pitiful track is there any similarity at all to Delbert's voice. This is the perfect CD for someone with money to burn who would like to have a recording of a 1960's garage band recorded in a garage and who should be working in a garage ! Delbert fans...run fast and run far away from this ! END",
      "id": "27"
    },
    "28": {
      "label": true,
      "text": "This item is not available: I ordered this unit on February 7th. Every time I checked back on the status of the order, it read \"not shipped\" and the estimated shipping date got moved out. I really don't think this unit is avaialble from the company anytime soon. I cancelled the order.",
      "id": "28"
    },
    "29": {
      "label": true,
      "text": "I used to like ABBA...: I used to like ABBA, until I saw Mama Mia! A horribly disjointed musical, where songs feel contrived to fit into the story; a story that doesn't seem to come together. Individual songs are usually done alright, but don't segue from one to another very well.The cast butchered several of the songs, but especially S.O.S, Take A Chance On Me, and anything where Pierce Brosnan sang. On a side note, I also counted at least two violations of Chekov's Gun. And finally, I think it has a bad moral message. Which you only recognize if you manage to sit through the whole thing.If there is justice in the world, cast members without established careers won't get to have them as punishment for the worst movies I've seen since The Talented Mr. Ripley.",
      "id": "29"
    },
    "30": {
      "label": true,
      "text": "A complete disaster!: If you're like me, you probably wanted to check out this movie because it sounded like it really could be an excellent supernatural Gothic horror tale full of goblins and wicked things alike. Well, don't make the same mistake I did and actually watch it. It's horrible. Terrible. An honest to goodness waste of film. The acting is wretched, the film quality is rotten (it actually looks twenty years older than it is), and the plot is thin, weak, and does not give you what it's supposed to. The only reason I bothered to give this film 1 star is because of Alexis Arquette -- he's great looking, but should have left this film out of his career.",
      "id": "30"
    },
    "31": {
      "label": false,
      "text": "beautiful detail: I just purchased these Dover COloring Books for my mother and she loves them. The detail is out of this world and the variety of colors you can use are only limited by your inagination. HIGHLY RECOMMENDED!",
      "id": "31"
    },
    "32": {
      "label": true,
      "text": "Very disappointed: I looked forward to getting this movie as I had heard many good things about it but it was nothing like I had imagined or been led to believe. There is very little actual history in it or real Christian experience except for the background because the main focus is a soap opera style romance and caricature figures. I agree with the reviewer who described it as a mixture of \"tawdry Hollywood sex\" somehow interspersed with a vague nod to Christianity. The only decent scene was the arena scene where the Christians are going to their deaths singing hymns - but that's not enough to make it a great or even a good movie. Not personally to my taste anyway.",
      "id": "32"
    },
    "33": {
      "label": true,
      "text": "Unreliable minikit: I bought this minikit because it got good reviews and it would be perfect for my purposes. However it switches on and off whenever it wants, it looses contact with the phone. Very often the on/off button works only in a horizontal position (?) I use a Treo 650, which is on the compatible phone list. When I contacted Parrot, they said it wasn't (?) At last I opened the unit, but there are no moving parts inside except the micro switches. It is giving me a headache, so I will go searching for an alternative.",
      "id": "33"
    },
    "34": {
      "label": false,
      "text": "A Christmas Classic!: This is surely one of the best classical Christmas recordings available. Don't buy the older version, as the quality of this recording is excellent. This is one of those \"Every Christmas - Can't have Christmas without\" recordings.",
      "id": "34"
    },
    "35": {
      "label": true,
      "text": "too narrow: These were the narrowest pair of D size shoes I have ever tried on. I don't care how nice a shoe looks. If it don't fit it just don't fit.",
      "id": "35"
    },
    "36": {
      "label": true,
      "text": "Lack of extension: This earphones lack a descent extension cord. ITs very small cable, but its of good quality. Sadly, cord its too short, and the extension is useless.",
      "id": "36"
    },
    "37": {
      "label": false,
      "text": "Easy-Reading: This is the 3rd Southern Sisters Mystery I've read. They're easy, fast and funny murder mysteries, with lots of cute family stories intertwined in the intrigue.",
      "id": "37"
    },
    "38": {
      "label": true,
      "text": "it'd be great if it worked like it was supposed to: for the first 30 seconds it was lovely, but i believe that either the motor isn't powerful enough to keep the shaft rotating smoothly or 3 AA batteries just don't provide enough juice for the motor to work more than 30 seconds. it was a nice idea, but i'm rather dissapointed. the jelly material is somewhat difficult to maintain also. i think if it were hooked up to a larger battery pack it'd be WONDERFUL... which i think i may have a macgyver friend with a knack for electronics attempt to do for me.",
      "id": "38"
    }
+
},
|
201 |
+
"39": {
|
202 |
+
"label": false,
|
203 |
+
"text": "Not Hornby's best but still good: I loved About a Boy and really, really loved the sardonic wit of High Fidelity. About a Boy is much deeper but just as cynical. Maybe even more so. The characters are richly drawn and just complex enough to keep the reader wanting more. Good read, but best to take some time with this one. Not recommended for a summer beach read.",
|
204 |
+
"id": "39"
|
205 |
+
},
|
206 |
+
"40": {
|
207 |
+
"label": true,
|
208 |
+
"text": "A Disappointment: As with most Taunton Press publications, the illustrations and photographs in this book are spectacular and the organization and layout is superb. Nonetheless, I found this book disappointing. It lacks both depth and breadth. I had hoped for a detailed review of wood joinery including some of the more unusual joinery found in Japanese woodworking. This book, however, is targeted more toward the beginner. Even so, it does not cover the details and \"tricks\" of even the most basic techniques in sufficient detail to allow beginners to easily reproduce them. Consequently, it is unclear who this book was written for - not the beginner as it lacks depth, and not the advanced woodworker as it lacks breadth. Far more effort appears to have been put into appearance and organization than in content.",
|
209 |
+
"id": "40"
|
210 |
+
},
|
211 |
+
"41": {
|
212 |
+
"label": true,
|
213 |
+
"text": "Horrible. Don't do it!: Great price for the item when a 6' one of these at Best Buy is $20. Thing is, the one from Best Buy fits in the outlet and stays there. This cord fits very loose and does not connect. I bought 2 of them, neither did what they were suppose to.As much as I hate to say it, but, buy the more expensive one. At least it works.",
|
214 |
+
"id": "41"
|
215 |
+
},
|
216 |
+
"42": {
|
217 |
+
"label": false,
|
218 |
+
"text": "Given as a gift...: Given to my best friend as a gift. She loves it. Her fiance enjoys making coffee for her in the mornings. :)",
|
219 |
+
"id": "42"
|
220 |
+
},
|
221 |
+
"43": {
|
222 |
+
"label": false,
|
223 |
+
"text": "Love the ring.: This is a nice ring. I was worried it out be thin and cheap looking, but it's not. It's a very pretty stylish ring. Go for it.",
|
224 |
+
"id": "43"
|
225 |
+
},
|
226 |
+
"44": {
|
227 |
+
"label": true,
|
228 |
+
"text": "Beautiful writing Marred by One-Note Characterizations: How could Kingsolver have ruined her book with such an obvious error? Nathan is a strident paper doll that flattens the whole story. Just as bad, the author has all the narrators using the same ironic tone to decribe him, deadening their voices as well. At the same time, Kingsolver doesn't have the guts to show him doing something trully terrible. I don't trust an author who can't let the reader make up his own mind, and as a consequence I couldn't trust her views about ANYTHING in the story. I'm giving this two stars for her descriptions of the African landscape, and that is all.",
|
229 |
+
"id": "44"
|
230 |
+
},
|
231 |
+
"45": {
|
232 |
+
"label": true,
|
233 |
+
"text": "Much worse than any cordless phone I've ever had: This phone cuts out only 2 rooms away from the base station. There is static noise, and callers on the other end complain about sound quality. I can't go into the garden, which used to be no problem with my old 900 MHz phone.",
|
234 |
+
"id": "45"
|
235 |
+
},
|
236 |
+
"46": {
|
237 |
+
"label": true,
|
238 |
+
"text": "Waste of time & money: The first Hangover was not too bad, this one was just terrible. The acting is bad, the script is bad, everything about this movie was just bad. Do yourself a favor, don't buy this movie as it is a total waste of time and money.",
|
239 |
+
"id": "46"
|
240 |
+
},
|
241 |
+
"47": {
|
242 |
+
"label": true,
|
243 |
+
"text": "Did Not Work For Me!: Impressive You Tube Video (Like a Sci-Fi Fantasy). In reality it's a high speed Easy Out so unsurprisingly it broke faster than an Easy out. This product did not work for me. The drill part did not drlil, the puller part did not pull. It was a total zero.",
|
244 |
+
"id": "47"
|
245 |
+
},
|
246 |
+
"48": {
|
247 |
+
"label": false,
|
248 |
+
"text": "Excellent book, long overdue.: From a very long time women were told that looking good was of utmost importance. This was without regard to health or fitness and how age affected these parameters. Witness the whalebone and other types of corsets, the spike heeled shoes and the numerous weight loss programmes on the market (some of which are downright dangerous). Now there is a book, backed by solid research, that allows women of all ages to remain fit and healthy for a lifetime. I am certainly going to recommend this book to all the women I know.Bentley Norville",
|
249 |
+
"id": "48"
|
250 |
+
},
|
251 |
+
"49": {
|
252 |
+
"label": true,
|
253 |
+
"text": "not an all star: Not a practical guide in this collecting age. Does NOT have a comprehensive list; meaning it does NOT cover all manufacturers and, more importantly, for the ones it does, only provides listings of the base set. That means no insert or variation pricing whatsoever. Also, no oddball or minor league issues are listed. Generally speaking, unless you are collecting base sets prior to the advent of inserts and alternate versions of the base set, this guide is fairly useless.",
|
254 |
+
"id": "49"
|
255 |
+
},
|
256 |
+
"50": {
|
257 |
+
"label": true,
|
258 |
+
"text": "Again, second rate city, third rate writer: Just another example of Mr. Lindberg's pitiful attempt at exhibiting a strong expertise on a subject with which he is clearly obsessed. Don't waste your time with this book, either. It is poorly written and fails to engage the reader. You might consider using this book and the first book he wrote on the same subject, as a pair of bookends. That is about all they are worth.",
|
259 |
+
"id": "50"
|
260 |
+
},
|
261 |
+
"51": {
|
262 |
+
"label": false,
|
263 |
+
"text": "Reality: No one should need to convince you to buy this book, you should just do it! It's so well written and worded and brings you right to the heart of a sexual reality that most people like to pretend doesn't really live and breath in their fair cities. I never again want to hear someone bad mouth a working girl for what she does. I will and do now however look at men with a curious eye wondering if they are depraved peep show window lickers :)",
|
264 |
+
"id": "51"
|
265 |
+
},
|
266 |
+
"52": {
|
267 |
+
"label": true,
|
268 |
+
"text": "Bummer: Visual effects and Battle footage were great...the other 85% of the movie was just lousy fluff...",
|
269 |
+
"id": "52"
|
270 |
+
},
|
271 |
+
"53": {
|
272 |
+
"label": false,
|
273 |
+
"text": "The spark of idependence: Filled with the independent spark that made us all love life at one point or another. A fun, introspective and nonsensical movie that sticks with you.",
|
274 |
+
"id": "53"
|
275 |
+
},
|
276 |
+
"54": {
|
277 |
+
"label": false,
|
278 |
+
"text": "What I expected from Mirman's website. Funny. Funny. Russian.: lol, gotta love Eugene. Even when his audience doesn't initially laugh, he gets in a good zinger at himself and they laugh at that. He's witty without being condescending, and uncomplicated without seeing contrived. However, if you're not a fan of irreverant humor, this may not be for you.",
|
279 |
+
"id": "54"
|
280 |
+
},
|
281 |
+
"55": {
|
282 |
+
"label": true,
|
283 |
+
"text": "Do not...repeat...do not bother!: It is not often that I offer a negative review but this compilation while attractive does not deliver at all.The foot massage gizmo is awkward and uncomfortable.The pumice stone leaves rough splinter like skin.The foot scrub doesn't reall scrub.The rotary action tool has five heads, none of which work well and you must hold the switch in place or it turns off. It is cumbersome and ineffective.The one star was initially given for a foot brush (which later lost its bristles very easily as I update the review) and a sweet smelling foot repair balm.Don't waist your money. Soak your feet and invest in an inexpensive German Titania file, smooth and coarser side, or a like product. It will last for years.",
|
284 |
+
"id": "55"
|
285 |
+
},
|
286 |
+
"56": {
|
287 |
+
"label": true,
|
288 |
+
"text": "Not Sandra's Best: Ms. Brown has written better romance novels. Don't give up on her if this was your first Sandra book.The feeble female lead struggles with a 15-year crush that walks back into her life. The smug male lead acts like a jerk through most of the novel. The romance scenes grapple to muster up passion but fall short. Both of the main characters bothered me; my favorite character was the 17-year old.A quick read...about 4 hours (with interruptions) for me...but probably not worth it.",
|
289 |
+
"id": "56"
|
290 |
+
},
|
291 |
+
"57": {
|
292 |
+
"label": false,
|
293 |
+
"text": "Impressed: Lots-O-Fun. Wood and glass toys are high quality and are a good fall back for the kids to play with they are \"bored\". Would buy again.",
|
294 |
+
"id": "57"
|
295 |
+
},
|
296 |
+
"58": {
|
297 |
+
"label": true,
|
298 |
+
"text": "Light turned on by itself 3 times: The installation was easy. I used it for a week, everything worked fine, EXCEPT the light it connected to turned on by itself 3 times so far, with no one near to either one of the switch. Not sure whether it is a defective unit, or this product is too sensitive to noise. I'm returning this product and will just install a regular switch instead.",
|
299 |
+
"id": "58"
|
300 |
+
},
|
301 |
+
"59": {
|
302 |
+
"label": false,
|
303 |
+
"text": "good battery: I feel kind of silly writing a review for a battery, but have to say that these last a LONG time. Work very well.",
|
304 |
+
"id": "59"
|
305 |
+
},
|
306 |
+
"60": {
|
307 |
+
"label": false,
|
308 |
+
"text": "Even a Woman finds it funny: Yes, even a woman finds \"Married to Mommy\" funny. The book gets you laughing aloud when it is trying to make fun of \"Mommies\". The truth is that it really is making fun of the stupidity of men and their simple basic needs of sex, getting out of work, and beer. Of course, the truth is always funny.A definite MUST for any woman, married or not. We will now know all the secret tricks the men try to use on us.By the way, I am NOT a MOMMY!",
|
309 |
+
"id": "60"
|
310 |
+
},
|
311 |
+
"61": {
|
312 |
+
"label": false,
|
313 |
+
"text": "Gungrave...not quite what you might expect: Those thinking this is another version of Trigun will be disappointed. Gungrave is actually a lot deeper and more complex. The lead is short on dialouge, but the story has more depth and character development than most anime. The first DVD is more about the main character's past than about the reanimated killing machine he's become, but it definitely leaves you wanting more.",
|
314 |
+
"id": "61"
|
315 |
+
},
|
316 |
+
"62": {
|
317 |
+
"label": false,
|
318 |
+
"text": "Error in product description: It's great in every way. However, if you'd prefer a digital tuner (as I do), then you might need to look further. The product description boasts a digital AM/FM tuner, but it's disappointingly an analog AM/FM tuner.Overall - especially for the price - I think it's pretty good.",
|
319 |
+
"id": "62"
|
320 |
+
},
|
321 |
+
"63": {
|
322 |
+
"label": false,
|
323 |
+
"text": "good phone but not as user friendly as it could be: Battery life is very good. Phone has good range. My only complaint is it's to involved to get your message from the handset.",
|
324 |
+
"id": "63"
|
325 |
+
},
|
326 |
+
"64": {
|
327 |
+
"label": true,
|
328 |
+
"text": "Big waste of money (and space in my house!): My 5 year old son wanted this so bad, but when we got it for him, there were so many pieces to put together that didn't fit together well, he never played with it. It just sits on our floor in many pieces taking up toy space! What a waste!",
|
329 |
+
"id": "64"
|
330 |
+
},
|
331 |
+
"65": {
|
332 |
+
"label": false,
|
333 |
+
"text": "Don't want to take it off: Very satisfied with an earlier purchase of this Bali bra model, I was just as pleased with the new one. Very comfortable, well made and a good neutral color. It will be my next choice, too.",
|
334 |
+
"id": "65"
|
335 |
+
},
|
336 |
+
"66": {
|
337 |
+
"label": false,
|
338 |
+
"text": "Fantastico: If anybody who's into rock music is ever looking for a band to keep you on your toes, this is the band. I've been a fan for 10 years now, and no album has ever sounded like any of their previous albums. This disc is fantastic with such a variety of styles, as are the previous releases, even back to the Rainbow Butt Monkey days.",
|
339 |
+
"id": "66"
|
340 |
+
},
|
341 |
+
"67": {
|
342 |
+
"label": true,
|
343 |
+
"text": "too much visual: There are far too much designs, visuals, colors, etc in the book - this is highly distracting, as TV screen can be...By way of example (among so many...), what is the use of colors with the three squares of the Pyth. theorem???? this is as useless as writting 2+3=5 with 2 in blue, 3 in red and 5 in yellow...I wish I had purchased the 2nd edition, which according to reviews was closer to what I was looking for.",
|
344 |
+
"id": "67"
|
345 |
+
},
|
346 |
+
"68": {
|
347 |
+
"label": false,
|
348 |
+
"text": "Aretha's First Arista Release Showed Pleasures to Come: After a long and musically satisfying career with Atlantic, Aretha severed her ties with that company and moved under the wing of Arista's Clive Davis. With the start of the 1980's, Aretha was looking for new territory to conquer and almost succeeded with this mixed bag.\"United Together\" is a fine tune that benefits from beautiful orchestral arrangement that is matched by Aretha's superb vocal instrument. The remake of \"Can't Turn You Loose\" allows Aretha to show why she is the Queen of Soul\" for she really belts this one out. Another cover, that of the Doobies' \"What a Fool Believes,\" is an interesting interpretation. The final cut \"School Days\" appears to be \"autobiographical\" for every girl growing up in the fifties.Although not as strong as her Atlantic work, \"Aretha\" is still a suitable addition to the artist's discography.",
|
349 |
+
"id": "68"
|
350 |
+
},
|
351 |
+
"69": {
|
352 |
+
"label": true,
|
353 |
+
"text": "Misguided Purchase: The photo and description do not reflect the product. The screen panel kit I received was white. What a huge inconvenience during a time-crunch.",
|
354 |
+
"id": "69"
|
355 |
+
},
|
356 |
+
"70": {
|
357 |
+
"label": true,
|
358 |
+
"text": "Banacek: My husband and were looking forward to seeing this series.The first show was SO boring, we finally just quit watching it.Actually, we haven't gotten around to watching anymore. I guess we were afraid of a repeat.Maybe that was just once, I hope!",
|
359 |
+
"id": "70"
|
360 |
+
},
|
361 |
+
"71": {
|
362 |
+
"label": false,
|
363 |
+
"text": "JDT: Uncle Tupelo is without doubt one of the most under appreciated groups of the 90's. Anodyne, like each of the three albums that came before it, has everything that a remarkable recording requires: great songs, honest lyrics, and artists who really care about the music they are making. Like the best of Dylan and Springsteen, the songs are about real people with real troubles and joys. When you hear them you know they are coming from the heart. The songs contributed by Jay Farrar and Jeff Tweedy are easily differentiated by the voacls, music, and lyrics. What makes this record interesting is how well these unique sounds compliment each other. The union is seamless.",
|
364 |
+
"id": "71"
|
365 |
+
},
|
366 |
+
"72": {
|
367 |
+
"label": false,
|
368 |
+
"text": "Well Worth Reading: First a confession: Miriam Wasserman was my mother. However, she published several books, but this is the only one I really found useful. She walks the reader through the New York City school system and the attitudes of different groups involved in the system back in the 1960s. This includes parents, teachers and administrators. Her view is that the further away one got from parents and students, the more prestige one had. She meticulously describes the teachers' strike of 1968 against \"community control of schools\", a strike of which she is extremely critical. She explores the racism that was involved in this strike, including using quotes from striking teachers, etc. It should be emphasized that the author was pro-union all her life, so her views don't stem from an anti-union bias. The book also covers the high school student rebellion which coincided with and followed the strike.",
|
369 |
+
"id": "72"
|
370 |
+
},
|
371 |
+
"73": {
|
372 |
+
"label": false,
|
373 |
+
"text": "compact and loaded: I bought this phone after reading the cnet reviews and really liked it. It looks small and really compact. I like the camera pics at 2 mega pixel and bright flash. The mp3 player is crisp. The headset that comes along delvers amazing fM radio. I think my phone is not very loud and you have a problem when you are around a noisy crowd. I just bought this phone again for my cousin. He likes it too. Almost forgot the display is very good.",
|
374 |
+
"id": "73"
|
375 |
+
},
|
376 |
+
"74": {
|
377 |
+
"label": false,
|
378 |
+
"text": "Outstanding text!: Brooks/Cole should keep this text in their catalog for ages! It is well-written, examples are generally quite clear, vocabulary is introduced well, and the exercises develop real skills, rather than simply be busy-work. One of the best calculus books ever!",
|
379 |
+
"id": "74"
|
380 |
+
},
|
381 |
+
"75": {
|
382 |
+
"label": false,
|
383 |
+
"text": "Excel 2003 Bible: Very good source of information. I will most likely buy other books in this series.",
|
384 |
+
"id": "75"
|
385 |
+
},
|
386 |
+
"76": {
|
387 |
+
"label": false,
|
388 |
+
"text": "Tasting is Believing: Gluten-free breads used to have a gritty texture from the rice flour, and were too soft for sandwiches. Bette Hagman uses garbanzo/fava bean flour, sorghum flour, tapioca flour, and corn starch to create breads which have a similar texture to wheat flour breads, and the flavors of her breads are fabulous.My BF bought me this book and a great tasting beverage to drink it with. Since he knows I quit coffee recently, he's been really wonderful helping me in cope with my mood swings. S o y f e e is made from soy beans that is roasted just like coffee. I enjoy the taste and don't miss coffee one bit. Buy it online at www.s o y c o f fee.com.This is a 'must have' for anyone baking gluten-free. I think all of Bette Hagman's books are wonderful and a must for those with gluten intolerance.",
|
389 |
+
"id": "76"
|
390 |
+
},
|
391 |
+
"77": {
|
392 |
+
"label": false,
|
393 |
+
"text": "5 stars for the show, no stars for the \"Collector's Edition\": I was really looking forward to getting this Collector's Edition and see what extras were added. I knew it wasn't a lot - just a mini-book and a documentary - but I figured it would be packaged in a cool way.Wrong.As others have already mentioned, the Collector's Edition is *literally* theAvatar: The Last Airbender - The Complete Book 1 Collectionslipped into another cardboard box, with a little booklet and DVD in an envelope (not even a case!) wedged in. It's really disappointing; it would have been so easy to create a quality Collector's Edition but the studio couldn't be bothered, I guess.",
|
394 |
+
"id": "77"
|
395 |
+
},
|
396 |
+
"78": {
|
397 |
+
"label": false,
|
398 |
+
"text": "sula scottcampos: Sula, a book that talks about the issues of being a black women is a really good novel to read.One of the reasons I recommend it is because of its realism and its themes - death, sex, friendship and poverty.I also think that its characters are very good, its easy to identify with one or both of them. I really recommend this book to anyone who enjoys good literature.",
|
399 |
+
"id": "78"
|
400 |
+
},
|
401 |
+
"79": {
|
402 |
+
"label": false,
|
403 |
+
"text": "Fantastic! It's a must-have for girls!: I hated razor, tried shaving but it did not work for me. Shaving made the hair grows thicker and faster afterwards, plus the roots are impossible to be getting rid of. After reading the reviews, I ordered it to try, I used it for once and already fall in love with this. I used to use small tweezer to pluck out my leg's hair, in order to avoid the razor, it took me a few hours to do that but this super electronic tweezer works wonder! You won't see the black roots and I have smooth and silkly legs in 20 mins. It does not hurt at all, if you use it on your legs. But, if you use it at your under arm, it won't be a pleasant feeling, of course! I will never use anything else besides this for hair removing anymore! highly recommended!",
|
404 |
+
"id": "79"
|
405 |
+
},
|
406 |
+
"80": {
|
407 |
+
"label": true,
|
408 |
+
"text": "This is not a toy: I guess I was expecting more out of these leave window decals. I just didn't find them attractive after placing them on my window, they seem very cheap, I guess because they are cheap.I threw them away.",
|
409 |
+
"id": "80"
|
410 |
+
},
|
411 |
+
"81": {
|
412 |
+
"label": false,
|
413 |
+
"text": "Wonderful book for anyone running a professional hatchery: This book is aimed more for hatcheries that are raising Trout, Salmon, Catfish and other food fishes. However, there is so much information in this book that even ornamental fish hatcheries will find an incredible amount of useful information. The chapters on Fish Nutrition are especially helpful.",
|
414 |
+
"id": "81"
|
415 |
+
},
|
416 |
+
"82": {
|
417 |
+
"label": false,
|
418 |
+
"text": "Amazing book!!: Once again, Eric Victorino's artistic talent is put into this great free-verse poetry book. I couldn't put it down and I finished it the day I received it in the mail. All of the poems are awesome but the one I found the most interesting was \"It's A People Business.\" All of the experiences in his life, personally and with his band, come to life in this book. Please check it out! It's worth every penny!!",
|
419 |
+
"id": "82"
|
420 |
+
},
|
421 |
+
"83": {
|
422 |
+
"label": false,
|
423 |
+
"text": "The white trumpet contender respect Miles Davis!: The story of the Jazz in the Fifties certainly would be remain unfinished without the ominous presence of this outstanding virtuoso. Baker sound still possesses this alluring hook, this magnetic engagement charm, eloquent expressiveness, enrapturing lyricism and contagious rhythm, despite the elapsed time, which confirms by itself the status of his musicianship.This selection is jus a little sample of the broad universe of his genius. A well thought selection of great musical successes, available, preserved and immortalized by the Digital Technology for our future enjoyment.Absolutely indispensable in your treasured collection.",
|
424 |
+
"id": "83"
|
425 |
+
},
|
426 |
+
"84": {
|
427 |
+
"label": true,
|
428 |
+
"text": "What the?: I'm sorry, maybe it's just me but I can't helping stating that this has to be one of the wrost movies I've seen in my life!Can you say boring? Can you say doesn't make sense at all? The first 30 minutes of the movie were O.K. But it went downhill after that. This movie is a prime example of a director attempting to make a deep movie with a meaningful lesson but failed on all levels. I don't recommend this movie unless you want to go to sleep or you don't have anything else to do.",
|
429 |
+
"id": "84"
|
430 |
+
},
|
431 |
+
"85": {
|
432 |
+
"label": false,
|
433 |
+
"text": "very very good!!!!: linda blair is a young girl who is possessed. and her mother doesn't know what to do until one day when she hears her daughter screaming and stabbind herself she knows what to do GET AN EXORCIZIM!!!",
|
434 |
+
"id": "85"
|
435 |
+
},
|
436 |
+
"86": {
|
437 |
+
"label": false,
|
438 |
+
"text": "Awesome product for the price!: This range extender works as advertised! I am very happy with the purchase. I was a little worried after reading some of the horror stories here, but I have to say, Chovy's review instructions (on this site) were just this ticket to get the repeater up and running in less than 30 minutes. It was unbelievably easy to install! Do not be frightened by negative reviews. If you can set up a wireless network, you can set up this repeater. However, I did upgrade the firmware before I did anything else and maybe that helped. I got the firmware update from the Belkin site.",
|
439 |
+
"id": "86"
|
440 |
+
},
|
441 |
+
"87": {
|
442 |
+
"label": true,
|
443 |
+
"text": "Slight: This book is either a heavily illustrated short story collection or a text-heavy comic. Its unusual format is its most original feature. Its plots are negligible, but its illustrations and text evoke a unique atmosphere of self-conscious nonconformism. Although its target audience is dare-to-be-different teens and college students, its interesting turns of phrase and expressive line drawings are not devoid of interest for general audences.",
|
444 |
+
"id": "87"
|
445 |
+
},
|
446 |
+
"88": {
|
447 |
+
"label": false,
|
448 |
+
"text": "ANgeleyes: Seem to dry up their eyes fairly well, although I haven't seen the color (brown stain) change much yet.",
|
449 |
+
"id": "88"
|
450 |
+
},
|
451 |
+
"89": {
|
452 |
+
"label": true,
|
453 |
+
"text": "Nice Try: Salt Lake 2002 is not a bad game, but it isn't good either. The graphics are excellent, but some of the events are bad. Bobsleigh, and skiing aren't bad but the others are. You dont stay into it for long. I liked it for a while, but it gets boring.",
|
454 |
+
"id": "89"
|
455 |
+
},
|
456 |
+
"90": {
|
457 |
+
"label": true,
|
458 |
+
"text": "Cutler's share of the pie: This book was a major disappointment. I am familiar with books written solely by the Dalai Lama, such as the \"Library of Tibet\" series, which are much more engrossing and have much more substance than Cutler's book. Cutler attempts (successfully, sadly) to have his share of the profitable market that involves the Dalai Lama's writings. The book is insipid, does not try to explain any important issue in the light of Buddhist philosophy, and only rehashes issues that several other westerners already wrote about. It's another big ego trip: we keep hearing time and again about his opportunities to be with the Dalai Lama. What a shame, Cutler. I sold the book as soon as I finished it.",
|
459 |
+
"id": "90"
|
460 |
+
},
|
461 |
+
"91": {
|
462 |
+
"label": true,
|
463 |
+
"text": "Mostly tedious, with interesting parts: I found the writing interesting, and the subject fascinating, but I found myself frustrated by the author's difficulty in talking directly about the status of Muslim women with her interview subjects. The author spent many pages writing about the menus and dress of the many middle and upper-middle class women she interviewed. It seemed as though her interview subjects resisted her efforts to discuss the status of women in their countries, so we too as readers had to wade through much distracting material and misunderstandings about feminism and gender. Great travel stories, but not a great source of information about Muslim women.",
|
464 |
+
"id": "91"
|
465 |
+
},
|
466 |
+
"92": {
|
467 |
+
"label": true,
|
468 |
+
"text": "Sesame Street Toddler: I did not find this game to be as educationally sound as I would expect from Sesame street. There is too much talking before the program will react to a command. The graphics are jerky and the cursor acts like the target is magnetically charged and keeps pushing away the cursor. When the child actually does manage to click on a target, the cursor may still fly to another target and the child is told that his answer is wrong. Another example of educational problems is the pronunciation of \"eggs\" using a long \"a\" sound instead of a short \"e.\" This is not very helpful in teaching a child the sound for short \"e.\" Children that are used to playing computer games by themselves may find that this game is too frustrating to do alone. The open ended learning curve is a great idea. I just wish Sesame Street would hire a truly qualified literacy expert to help clean up the many problems in this program.",
|
469 |
+
"id": "92"
|
470 |
+
},
|
471 |
+
"93": {
|
472 |
+
"label": true,
|
473 |
+
"text": "needs a buzz cut and a point: I avoided reading this book, not because of the hermaphrodite subject matter, but because I have never read a multigenerational family saga that I liked. Many books let me down in the middle, and this was no exception. The beginning of the book was incredible and harrowing, with momentum and characterization. The post-America nextgens part of the saga was so boring I found myself flipping and flipping - always a bad sign. If there was some kind of larger point to all of that, then I must have missed it. Yes there's the identity duality and trinity themes playing out here: man/woman, greek/turkish/american modern/old world sick/healthy innocent/guilty original/reinvented. But it was almost as if the author was saying - here it is again - get it? I like my fiction much more subtle than this.",
|
474 |
+
"id": "93"
|
475 |
+
},
|
476 |
+
"94": {
|
477 |
+
"label": true,
|
478 |
+
"text": "OMG! DO NOT BUY!: I normally don't take the time to submit a review.In this case however, I feel obligated to do so.This is by far one of the worst purchases I have ever made.Here's why.....The contraption is far too bulky.The case's enclosing is unbearable, takes a good minute or so to open it.The texture of the material feels like a cheap toy.The overall design is horrible, something I could make in my basement.For the love of everything sacred, do not buy this thing.",
|
479 |
+
"id": "94"
|
480 |
+
},
|
481 |
+
"95": {
|
482 |
+
"label": false,
|
483 |
+
"text": "Good price, good quality: Comparable HDMI cables can be bought for 45 or more. Even though the price is cheap the quality is good, no problems so far.",
|
484 |
+
"id": "95"
|
485 |
+
},
|
486 |
+
"96": {
|
487 |
+
"label": false,
|
488 |
+
"text": "Good rock music: This is what i call rock music good beat and good lyrics, don't listen to the other reviews. This cd is one of the best, listen to a few songs and you will get hooked. I recommend this cd its awesome.",
|
489 |
+
"id": "96"
|
490 |
+
},
|
491 |
+
"97": {
|
492 |
+
"label": true,
|
493 |
+
"text": "BORING!: This movie is soo boring. How in the hell did this movie make so much at the box office. Do people really want to pay for crappy movies like this. bottom line this is a chick flick nothing is good. And now they are re-releasing this movie with more boring stuff. This is the worst movie ever.",
|
494 |
+
"id": "97"
|
495 |
+
},
|
496 |
+
"98": {
|
497 |
+
"label": true,
|
498 |
+
"text": "Already Rusting: Inferior quality. The plating is thin and rust is coming through the finish. Inexcusable for a product that is designed for use in a humid environment.",
|
499 |
+
"id": "98"
|
500 |
+
},
|
501 |
+
"99": {
|
502 |
+
"label": true,
|
503 |
+
"text": "confusing internet setup: i wanted a camera that could email photos but this camera will not go out through the router and the manual setup , to punch a hole thru router is confusing.",
|
504 |
+
"id": "99"
|
505 |
+
},
|
506 |
+
"04c7dfc0f94e4e88968d09b40edbfa14": {
|
507 |
+
"label": true,
|
508 |
+
"text": "The new gaming console is unaffordable.",
|
509 |
+
"id": "04c7dfc0f94e4e88968d09b40edbfa14"
|
510 |
+
},
|
511 |
+
"58f58a1a4cbb4bb699772ed934006ec8": {
|
512 |
+
"label": true,
|
513 |
+
"text": "How can it be sure difficult for @115830 to deliver a package to a University address? Two failed attempts so far ...",
|
514 |
+
"id": "58f58a1a4cbb4bb699772ed934006ec8"
|
515 |
+
},
|
516 |
+
"d4a3cd4877c54aef81c376eff8008df4": {
|
517 |
+
"label": false,
|
518 |
+
"text": "@204780 Glad they showed up! Hope you have a great flight! -Sean",
|
519 |
+
"id": "d4a3cd4877c54aef81c376eff8008df4"
|
520 |
+
},
|
521 |
+
"affe1d6548f84bed84238bac45cc10a1": {
|
522 |
+
"label": false,
|
523 |
+
"text": "@British_Airways Thank you! All looks good then \ud83c\uddec\ud83c\udde7\u2708\ufe0f",
|
524 |
+
"id": "affe1d6548f84bed84238bac45cc10a1"
|
525 |
+
},
|
526 |
+
"e304ea77a94c450a95690c7b605a035f": {
|
527 |
+
"label": false,
|
528 |
+
"text": "@246667 Thank you for reaching out, Andrea. The built in application in Windows 10 are exempted to be uninstalled. However, you can send this suggestion directly to our developers via the Feedback Hub so they can take a look at it: https://t.co/jowrfbgQm6. Keep in touch.",
|
529 |
+
"id": "e304ea77a94c450a95690c7b605a035f"
|
530 |
+
},
|
531 |
+
"76b694b019eb4e6888a422e144030bd0": {
|
532 |
+
"label": true,
|
533 |
+
"text": "@GWRHelp It\u2019s mainly the constant short forming and cancellations due to mechanical faults Phil. As a company, these excuses have been used ad nauseam for years and years. It just gets worse and no amount of rhetoric and IET self promotion can hide that fact.",
|
534 |
+
"id": "76b694b019eb4e6888a422e144030bd0"
|
535 |
+
},
|
536 |
+
"ce0698020b7a457396c7674b04db10e6": {
|
537 |
+
"label": false,
|
538 |
+
"text": "English gangster flick.",
|
539 |
+
"id": "ce0698020b7a457396c7674b04db10e6"
|
540 |
+
},
|
541 |
+
"52bda6cbab224899845e66e0474cdefc": {
|
542 |
+
"label": false,
|
543 |
+
"text": "sees the formula graph, the chip calculates the formula, able to \"survive\" thanks to its connection to Edit, develops a parallel personality and affords her abilities greater than she ever imagined...",
|
544 |
+
"id": "52bda6cbab224899845e66e0474cdefc"
|
545 |
+
},
|
546 |
+
"435aabe68c294963a05e090d479582bc": {
|
547 |
+
"label": false,
|
548 |
+
"text": "Aanandam is a 2016 Indian Malayalam campus musical film written and directed by Ganesh Raj in his directorial debut. Vineeth Sreenivasan produces the film under the banner of Habit Of Life with Vinod Shornur under Cast N Crew.",
|
549 |
+
"id": "435aabe68c294963a05e090d479582bc"
|
550 |
+
},
|
551 |
+
"f96313d0087e4941a359783634ef9e86": {
|
552 |
+
"label": false,
|
553 |
+
"text": "The remarkable story of The Weather Underground, radical activists of the 1970s, and of radical politics at its best and most disastrous.",
|
554 |
+
"id": "f96313d0087e4941a359783634ef9e86"
|
555 |
+
},
|
556 |
+
"f63e4502791a409fa2d750687d3841eb": {
|
557 |
+
"label": false,
|
558 |
+
"text": "A young widow on a trip to the backwoods stumbles upon the operation of a gang of drug smugglers. They attempt to kill her in order to keep their operation a secret, but she turns out to be more resourceful than they thought, and starts to turn the tables on them.",
|
559 |
+
"id": "f63e4502791a409fa2d750687d3841eb"
|
560 |
+
},
|
561 |
+
"108ac02949324b02bdcbe4c7a77bacdc": {
|
562 |
+
"label": false,
|
563 |
+
"text": "The story of a young Marine, fresh from Camp Pendleton, who is forced to confront the complexities of adulthood and a volatile home life during a four-day Thanksgiving leave.",
|
564 |
+
"id": "108ac02949324b02bdcbe4c7a77bacdc"
|
565 |
+
},
|
566 |
+
"44fc412246964b2393fa0035ff093a00": {
|
567 |
+
"label": false,
|
568 |
+
"text": "Exploring the rough and tumble world of hockey, Academy Award winner Alex Gibney (\"Taxi to the Dark Side\") looks at the world of the NHL enforcers and specifically the career of Chris \"Knuckles\" Nilan who helped the Montreal Canadiens win the Stanley Cup.",
|
569 |
+
"id": "44fc412246964b2393fa0035ff093a00"
|
570 |
+
},
|
571 |
+
"409350c111af4ba3a94c842b797ddb95": {
|
572 |
+
"label": false,
|
573 |
+
"text": "Two fishing fanatics get in trouble when their fishing boat gets stolen while on a trip.",
|
574 |
+
"id": "409350c111af4ba3a94c842b797ddb95"
|
575 |
+
},
|
576 |
+
"d48d8f3b5a524ecea69bae718d1f1513": {
|
577 |
+
"label": false,
|
578 |
+
"text": "A willful young boy follows his just as obstinate grandmother in a journey across Iraq, determined to discover the fate of her missing son, Ahmed's father, who never returned from war.",
|
579 |
+
"id": "d48d8f3b5a524ecea69bae718d1f1513"
|
580 |
+
},
|
581 |
+
"283e96de5b474240a044c50dbc2551fb": {
|
582 |
+
"label": false,
|
583 |
+
"text": "A group of people are sitting in a theatre watching a movie when one realises that the woman on the screen is her. (IMDb)",
|
584 |
+
"id": "283e96de5b474240a044c50dbc2551fb"
|
585 |
+
},
|
586 |
+
"516d0f2f3a854a97a87c64db19a89fac": {
|
587 |
+
"label": false,
|
588 |
+
"text": "of the fake prediction. Fantastic swashbuckling adventures in a 18th century setting, with a light criticism of the war and the mighty.",
|
589 |
+
"id": "516d0f2f3a854a97a87c64db19a89fac"
|
590 |
+
},
|
591 |
+
"c2f55710669b40aa937625fe0ab04065": {
|
592 |
+
"label": false,
|
593 |
+
"text": "famous for his reputation as a Don Juan, to seduce C\u00e9cile and emotionally destroy her. While on his mission, Valmont gets sidetracked when he goes to visit his aunt and falls for Madame Tourvel, a virtuous, married woman who knows of his womanizing ways, but that only makes the challenge more exciting to Valmont. Together, Madame de Merteuil and Valmont make a dangerous team and they will stop at nothing when it comes to matters of the heart.",
|
594 |
+
"id": "c2f55710669b40aa937625fe0ab04065"
|
595 |
+
},
|
596 |
+
"ba0261b2ee3244d29bb3a8c6d77195a6": {
|
597 |
+
"label": false,
|
598 |
+
"text": "sees the formula graph, the chip calculates the formula, able to \"survive\" thanks to its connection to Edit, develops a parallel personality and affords her abilities greater than she ever imagined...",
|
599 |
+
"id": "ba0261b2ee3244d29bb3a8c6d77195a6"
|
600 |
+
},
|
601 |
+
"5e724fbde8ee44d9a8fc87a6e6667f01": {
|
602 |
+
"label": false,
|
603 |
+
"text": "telling the story about people who despite all obstacles strive for their goal.",
|
604 |
+
"id": "5e724fbde8ee44d9a8fc87a6e6667f01"
|
605 |
+
},
|
606 |
+
"557eba5ebfc9467a9d88688afed41354": {
|
607 |
+
"label": false,
|
608 |
+
"text": "A young playboy who learns he has one month until he becomes infertile sets out to procreate as much as possible.",
|
609 |
+
"id": "557eba5ebfc9467a9d88688afed41354"
|
610 |
+
},
|
611 |
+
"aa20e22fbe96487d8ee1223a6ef4da0b": {
|
612 |
+
"label": false,
|
613 |
+
"text": "Set in modern times, Alex finds King Arthur's sword Excalibur and must prove himself worthy of it.",
|
614 |
+
"id": "aa20e22fbe96487d8ee1223a6ef4da0b"
|
615 |
+
},
|
616 |
+
"bea56d34f6df408c9ec9653b17a90a93": {
|
617 |
+
"label": false,
|
618 |
+
"text": "Kostis is a 40-year-old doctor that finds himself in the small island of Antiparos, in order to take over the local clinic. His whole life and routine will turn upside down when he meets an international group of young and beautiful tourists and he falls in love with Anna, a 19-year-old goddess.",
|
619 |
+
"id": "bea56d34f6df408c9ec9653b17a90a93"
|
620 |
+
},
|
621 |
+
"e61a3251720d425c9f4770cb4b11d2d9": {
|
622 |
+
"label": false,
|
623 |
+
"text": "Friends on a weekend excursion take a path into a forest that leads to death and destruction.",
|
624 |
+
"id": "e61a3251720d425c9f4770cb4b11d2d9"
|
625 |
+
},
|
626 |
+
"5471008376cf44518f2ff1f67f057c08": {
|
627 |
+
"label": false,
|
628 |
+
"text": "Mr Bournelis suggested all 30 lineal metres of blockwork should be removed and replaced, which would require removing and reinstalling the fence. The total cost of his suggested method of rectification was said to be $14,650 for each unit, giving a total cost of rectification of $29,300.",
|
629 |
+
"id": "5471008376cf44518f2ff1f67f057c08"
|
630 |
+
}
|
631 |
+
},
|
632 |
+
"version": 27,
|
633 |
+
"description": "Negative sentiment"
|
634 |
+
}
|
lilac/concepts/non-english/concept.json
ADDED
@@ -0,0 +1,1024 @@
{
  "namespace": "lilac",
  "concept_name": "non-english",
  "type": "text",
  "data": {
    "c727f30a2d2d40f69b81aa981515fb62": {
      "label": true,
      "text": "Je suis fatigu\u00e9.",
      "id": "c727f30a2d2d40f69b81aa981515fb62"
    },
    "834121208555439b976a5f228ec138a5": {
      "label": true,
      "text": "Ich spreche Deutsch.",
      "id": "834121208555439b976a5f228ec138a5"
    },
    "61a4130d8eb447ba88e52e09fc3860d7": {
      "label": true,
      "text": "\u79c1\u306f\u65e5\u672c\u8a9e\u3092\u8a71\u305b\u307e\u3059\u3002",
      "id": "61a4130d8eb447ba88e52e09fc3860d7"
    },
    "083d9218def443e1adf7ff18150203c4": {
      "label": true,
      "text": "Eu n\u00e3o entendo portugu\u00eas.",
      "id": "083d9218def443e1adf7ff18150203c4"
    },
    "0e540cf8599f419b81d11fd89a95b119": {
      "label": true,
      "text": "Non capisco italiano.",
      "id": "0e540cf8599f419b81d11fd89a95b119"
    },
    "2278f313117e40b7846d2dbc9cb7f690": {
      "label": false,
      "text": "cotton ball",
      "id": "2278f313117e40b7846d2dbc9cb7f690"
    },
    "7a3e058ff74b401485185a51a9d07606": {
      "label": false,
      "text": "To ensure sensor switch is not actuated by the weight of the cat bed place on it, but only by the cat laying in the bed.",
      "id": "7a3e058ff74b401485185a51a9d07606"
    },
    "8c2fd2d3a7f049dea2d4161d17ae02dd": {
      "label": false,
      "text": "Make turmeric milk",
      "id": "8c2fd2d3a7f049dea2d4161d17ae02dd"
    },
    "bccf69d2771640b6a7b436b731d5cc85": {
      "label": false,
      "text": "To make a double boiler",
      "id": "bccf69d2771640b6a7b436b731d5cc85"
    },
    "84ce78b3ccfc4007b0aceabe7004436c": {
      "label": false,
      "text": "To encourage your child to behave better,",
      "id": "84ce78b3ccfc4007b0aceabe7004436c"
    },
    "cc4c4eb91b40473c826ae400fdec6c1e": {
      "label": false,
      "text": "How do you peel asparagus before cooking?",
      "id": "cc4c4eb91b40473c826ae400fdec6c1e"
    },
    "cae1ab05a7584231afa9be93f0029105": {
      "label": false,
      "text": "How can I dry citrus salt?",
      "id": "cae1ab05a7584231afa9be93f0029105"
    },
    "7eded97e58614ca0b7e9421f955cef3c": {
      "label": false,
      "text": "How do I melt chocolate?",
      "id": "7eded97e58614ca0b7e9421f955cef3c"
    },
    "35ecb6b2835b427ba15432471dda4bde": {
      "label": false,
      "text": "How to get rid of crows.",
      "id": "35ecb6b2835b427ba15432471dda4bde"
    },
    "91c70b93890a47b3ad1925e143c2ac69": {
      "label": false,
      "text": "How to Kill Green Hair Algae in a Freshwater Aquarium",
      "id": "91c70b93890a47b3ad1925e143c2ac69"
    },
    "dd6b9fb4280f4ff0b4c371c834525692": {
      "label": false,
      "text": "how do you translate from spanish to english?",
      "id": "dd6b9fb4280f4ff0b4c371c834525692"
    },
    "350bb48432614e69b5b8aa58f82fe7c4": {
      "label": false,
      "text": "Learn a new language quickly.",
      "id": "350bb48432614e69b5b8aa58f82fe7c4"
    },
    "057e3e9d624d48b896a87e97fa6f468c": {
      "label": false,
      "text": "how do you keep a cat from going in heat?",
      "id": "057e3e9d624d48b896a87e97fa6f468c"
    },
    "f9c15155cc264a9da3642d420efab4ea": {
      "label": false,
      "text": "how ot melt crayons",
      "id": "f9c15155cc264a9da3642d420efab4ea"
    },
    "a6acb5343d9c42a6b29dffe9314dafa1": {
      "label": true,
      "text": "Chi \u00e8 il presidente del consiglio in Italia ora?",
      "id": "a6acb5343d9c42a6b29dffe9314dafa1"
    },
    "e808987d2364440bbbc4e0cb71bb2307": {
      "label": true,
      "text": "1. Frenar: Es importante se\u00f1alar cuando vas a frenar, especialmente si vas a hacerlo bruscamente. Esto lo puedes hacer extendiendo tu brazo izquierdo hacia abajo con la palma de la mano abierta.\n\n2. Detenerse: Si necesitas detenerte por completo, debes se\u00f1alarlo tambi\u00e9n. Esto lo puedes hacer extendiendo tu brazo izquierdo hacia abajo con la palma de la mano abierta y con los dedos hacia abajo.",
      "id": "e808987d2364440bbbc4e0cb71bb2307"
    },
    "0a4ba4f4ed404a1692d8d7b96c76de05": {
      "label": false,
      "text": "It depends on the size of your house, if it is small one access point is enough, if it is bigger it might be better to have two or three.",
      "id": "0a4ba4f4ed404a1692d8d7b96c76de05"
    },
    "16396e6f1d3b47d58d3294c4cf4b9a9d": {
      "label": true,
      "text": "In definitiva, per Nietzsche, il significato della vita non era qualcosa che poteva essere dato, ma era piuttosto qualcosa che gli individui dovevano creare per se stessi attraverso l'arte, la cultura, la creazione di valore e la superazione della sofferenza.",
      "id": "16396e6f1d3b47d58d3294c4cf4b9a9d"
    },
    "ecce57f8ecad45ef8b3826d27e233080": {
      "label": false,
      "text": "Evidence for the existence of a God or multiple Gods is subjective and varies depending on one's beliefs and personal experiences. Some people may cite religious texts, miracles, or spiritual experiences as evidence, while others may argue that the lack of evidence is evidence in itself. I can also suggest that the absence of evidence is not evidence of absence, and that there may be multiple",
      "id": "ecce57f8ecad45ef8b3826d27e233080"
    },
    "79f897e21d27403097aed9b9800689c4": {
      "label": true,
      "text": "Infine, nella sua ultima fase filosofica, Nietzsche ha sviluppato la sua critica alla moralit\u00e0 tradizionale e alla religione, sostenendo che questi sistemi erano basati su valori falsi e che la vita aveva bisogno di una nuova valutazione morale e spirituale. In questa fase, Nietzsche ha sostenuto che la vita aveva bisogno di un nuovo senso e di una nuova direzione, e che era compito dell'individuo",
      "id": "79f897e21d27403097aed9b9800689c4"
    },
    "da1b4988188642368b4f683f0418496e": {
      "label": true,
      "text": "Pueden un perro y una gato procrear juntos?",
      "id": "da1b4988188642368b4f683f0418496e"
    },
    "2c54d8a5bb6742ada15549ad7007fe6b": {
      "label": true,
      "text": "In generale, Nietzsche ha visto la vita come una sfida continua, dove ogni individuo deve trovare il proprio significato e scopo attraverso la creativit\u00e0, l'arte e la moralit\u00e0 personale.",
      "id": "2c54d8a5bb6742ada15549ad7007fe6b"
    },
    "28a055c7637c440bb0912bc5274d79c3": {
      "label": true,
      "text": "De nada, fue un placer ayudarte. \u00bfEn qu\u00e9 m\u00e1s puedo ayudarte?",
      "id": "28a055c7637c440bb0912bc5274d79c3"
    },
    "b61a36355ae943208090ccdd3b736dce": {
      "label": true,
      "text": "\u00bfPor qu\u00e9 deber\u00edamos preocuparnos por estos gases? Bueno, porque est\u00e1n causando cambios dr\u00e1sticos en el clima, lo que a su vez est\u00e1 afectando a nuestro medio ambiente.",
      "id": "b61a36355ae943208090ccdd3b736dce"
    },
    "7ea3d807733044c69a4e35f6ff6b66a3": {
      "label": true,
      "text": "Pi\u00f9 di recente, gli studiosi sono giunti a riconoscere che ci sono altri criteri essenziali per un\u2019unione monetaria di successo, che sono difficili da realizzare senza una profonda integrazione politica. Alla fine degli anni sessanta, Peter Kenen& ha sostenuto che senza i movimenti dei tassi di cambio come ammortizzatori, l\u2019unione monetaria necessita dei trasferimenti fiscali come modalit\u00e0 per",
      "id": "7ea3d807733044c69a4e35f6ff6b66a3"
    },
    "c06cf297c9564f9baa5cbc2130f1ef1f": {
      "label": true,
      "text": "QUESTION: \u00bfPor qu\u00e9 se oponen Alemania y Austria?",
      "id": "c06cf297c9564f9baa5cbc2130f1ef1f"
    },
    "cc5a1a04352d471cb5b7e6831f19c86a": {
      "label": false,
      "text": "- wing parties , with the aid of the Democratic Renewal Party , the government fell and M\u00e1rio Soares , the President at the time , called for a new election . The PSD was very popular going into the election , and was elected to a landslide majority government -- the biggest that a Portuguese party had ever won in a free election . The left - wing Democratic Unity Coalition lost some of its MPs to",
      "id": "cc5a1a04352d471cb5b7e6831f19c86a"
    },
    "8a95fab4b058420aa102b25fd7afc211": {
      "label": true,
      "text": "reticente, Holanda. Alemania y Austria, por el contrario, permanecen firmes en su oposici\u00f3n al texto, ya que consideran que menoscaba claramente el derecho a la defensa reconocido en sus Constituciones. La nueva directiva pretende extender a abogados y contables, incluidos los asesores fiscales, agentes inmobiliarios, marchantes de arte, anticuarios y casinos las mismas obligaciones que ahora",
      "id": "8a95fab4b058420aa102b25fd7afc211"
    },
    "1ffe093d849040b6baaaadb5b71c04af": {
      "label": true,
      "text": "\"Hay una cosa afectiva muy fuerte, que cuesta, me ha costado mucho tiempo\" asimilar esos cinco a\u00c3\u00b1os que viv\u00c3\u00ad en las ciudades de Buenos Aires y C\u00c3\u00b3rdoba, admiti\u00c3\u00b3. En sus a\u00c3\u00b1os por la naci\u00c3\u00b3n sudamericana se cas\u00c3\u00b3 con un argentino, del que m\u00c3\u00a1s tarde se separ\u00c3\u00b3, y con quien tuvo a su primer y \u00c3\u00banico hijo.",
      "id": "1ffe093d849040b6baaaadb5b71c04af"
    },
    "e95da58452044682ab05ad688544e907": {
      "label": true,
      "text": "M\u00e9lodieux et heureux !! . Cet album est magnifique. Apr\u00e8s plusieurs \u00e9coutes, je suis enchant\u00e9 de l'\u00e9couter et d'appr\u00e9cier les m\u00e9lodies qui s'y trouvent. Beaucoup de changements dans les musiques je trouve et aussi dans les paroles car je trouve que Myl\u00e8ne est plus directe dans la mani\u00e8re de parler de l'amour qui est tr\u00e8s pr\u00e9sent dans cet album. Je suis heureux d'avoir attendu pour entendre ses",
      "id": "e95da58452044682ab05ad688544e907"
    },
    "34bc60b878e546b6af0e9bba1ec3879f": {
      "label": true,
      "text": "\u79c1\u306f\u65e5\u672c\u8a9e\u3092\u8a71\u305b\u307e\u3059\u3002",
      "id": "34bc60b878e546b6af0e9bba1ec3879f"
    },
    "67f0416e7b3148698b02964bce412e8f": {
      "label": true,
      "text": "===============================\nConfidencial. Sujeito a privil,gio legal de comunica??o advogado/cliente.\nPrivileged and confidential attorney/client communication.\n\nPinheiro Neto - Advogados\n===============================\n - chart.doc\n - enron-q2_.doc\n\n\n",
      "id": "67f0416e7b3148698b02964bce412e8f"
    },
    "1ee8405759884e078250db51a51960fe": {
      "label": false,
      "text": "\t\t\t\t\t\t\t<TD COLSPAN=\"2\"><SPAN CLASS=\"ArticleTitle\"><A HREF=\"https://www.wpo.org/benefits_and_services/publications/article_view.cfm?ArticleID=60&NewsletterID=11\" TARGET=\"new\" CLASS=\"ArticleTitle\">Splendors of La Serenissima</A></SPAN></TD>\n\t\t\t\t\t</TR>\n\t\t\t\t\t<TR>\n\t\t\t\t\t <TD WIDTH=\"15\"><IMG SRC=\"https://www.wpo.org/images/admin/enewsletter_admin/enewsletter/spacer.gif\" HEIGHT=\"8\" WIDTH=\"1\"></TD>",
      "id": "1ee8405759884e078250db51a51960fe"
    },
    "6e2b830f1af94031a81380982e0eee06": {
      "label": true,
      "text": "===============================\nConfidencial. Sujeito a privil,gio legal de comunica??o advogado/cliente.\nPrivileged and confidential attorney/client communication.\n\nPinheiro Neto - Advogados\n===============================\n - enron-question.doc",
      "id": "6e2b830f1af94031a81380982e0eee06"
    },
    "97e62b35b9974c9bb543fed193aed9d5": {
      "label": false,
      "text": "Most Democrats support legislation to reduce the role of money in politics. GOP leaders oppose it, and Democrats have long labored to depict Republicans as beholden to special interests. ",
      "id": "97e62b35b9974c9bb543fed193aed9d5"
    },
    "e83e6092a67249e89b6ad77b39d35268": {
      "label": false,
      "text": "known since the 2nd century. In the 8th century it was the capital of Spain. There is also an important city in Venezuela named Valencia. When was Valencia the most important city in Spain?",
      "id": "e83e6092a67249e89b6ad77b39d35268"
    },
    "824df6ad092f436ca5c923bb90b916f6": {
      "label": true,
      "text": "wife, as well as its Italian name: \"La Gioconda.\" Which of the following statements is true according to the passage?",
      "id": "824df6ad092f436ca5c923bb90b916f6"
    },
    "9fb56ac53d444ae5b6a018d90a5808d8": {
      "label": false,
      "text": "known since the 2nd century. In the 8th century it was the capital of Spain. There is also an important city in Venezuela named Valencia. What is the main difference between the two parts of the city?",
|
219 |
+
"id": "9fb56ac53d444ae5b6a018d90a5808d8"
|
220 |
+
},
|
221 |
+
"1348ac65099049f9abf8401822f48966": {
|
222 |
+
"label": false,
|
223 |
+
"text": "Italian during the day, evening or on a onetoone basis. What does this passage mainly talk about?",
|
224 |
+
"id": "1348ac65099049f9abf8401822f48966"
|
225 |
+
},
|
226 |
+
"718e4099864145aba77fad8a6d77ed47": {
|
227 |
+
"label": false,
|
228 |
+
"text": "may imply us, only love can solve the problems between people, between the poor and the rich, love is everything. Which of the following is TRUE according to the passage?",
|
229 |
+
"id": "718e4099864145aba77fad8a6d77ed47"
|
230 |
+
},
|
231 |
+
"7ff37b233af54e978d0051deaa866b27": {
|
232 |
+
"label": true,
|
233 |
+
"text": "Ton camarade peut parfaitement exiger d'\u00eatre pay\u00e9 ce qui est inscrit dans le contrat. Il est possible que cela d\u00e9grade fortement ses relations avec l'entreprise et elle peut tr\u00e8s bien lui faire rater son ann\u00e9e en donnant un avis n\u00e9gatif sur ses performances. \u00c0 lui de voir si il souhaite continuer avec l'entreprise ou non.",
|
234 |
+
"id": "7ff37b233af54e978d0051deaa866b27"
|
235 |
+
},
|
236 |
+
"b52a661d85e04e8288abe2d87cb9cb74": {
|
237 |
+
"label": true,
|
238 |
+
"text": "sino un CONTRATO, por lo que de momento no voy a iniciar acciones legales pensando en que reconsidere su posici\u00f3n, pero si su decisi\u00f3n fuese no cumplir con EUROSEPER, mi gabinete jur\u00eddico HISPACOLEX, iniciar\u00e1 acciones legales contra usted por DA\u00d1OS Y PERJUICIOS, pues sabe que usted fu\u00e9 el profesor el a\u00f1o pasado, y sabe que hay muchas familias que le esperan como profesor a usted. Asi que espero",
|
239 |
+
"id": "b52a661d85e04e8288abe2d87cb9cb74"
|
240 |
+
},
|
241 |
+
"a97e74c1fb5940daac413e7d384e1ad7": {
|
242 |
+
"label": true,
|
243 |
+
"text": "Si l'entreprise refuse de payer ton camarade ce salaire l\u00e0 elle peut soit proposer une rupture conventionnelle avec les indemnit\u00e9s qui vont avec, soit s'il y a une p\u00e9riode d'essai ne pas continuer, soit aller aux prud'hommes pour faire casser le contrat si ils peuvent r\u00e9ellement prouver que c'est une erreur et pas simplement que l'\u00e9tudiant a n\u00e9goci\u00e9 \u00e7a.",
|
244 |
+
"id": "a97e74c1fb5940daac413e7d384e1ad7"
|
245 |
+
},
|
246 |
+
"22c1380ee8714ee9af2f89ac8899adc0": {
|
247 |
+
"label": true,
|
248 |
+
"text": "Maintenant, il ne semble pas y avoir beaucoup de preuves \u00e9tant donn\u00e9 la distance dans le temps. On se retrouve un peu dans une situation de il dit v. elle dit alors c'est pas vraiment clair jusqu'o\u00f9 \u00e7a peut aller si vous avez rien dit d'incriminant, mais je ne suis pas un avocat et encore moins un expert en droit p\u00e9nal. Consultez votre avocat(e) (c'est cette personne l'experte et votre personne",
|
249 |
+
"id": "22c1380ee8714ee9af2f89ac8899adc0"
|
250 |
+
},
|
251 |
+
"753c9b1a8de24131b30d283ce83a78b2": {
|
252 |
+
"label": false,
|
253 |
+
"text": "this is the whole section of Arbitration for my agreement:",
|
254 |
+
"id": "753c9b1a8de24131b30d283ce83a78b2"
|
255 |
+
},
|
256 |
+
"a41fec48ceb44c76b7b11407b74066c6": {
|
257 |
+
"label": false,
|
258 |
+
"text": "that add up to about $470 (like 47 lunches at the sandwich shop you like), or even a mix of items (like a new toaster oven, ten fancy coffees, and whatever), but put together a list of ten of them and write them down. ",
|
259 |
+
"id": "a41fec48ceb44c76b7b11407b74066c6"
|
260 |
+
},
|
261 |
+
"246967893bf14f818abd779c7c1d18bd": {
|
262 |
+
"label": true,
|
263 |
+
"text": "colocataire. Et de toute fa\u00e7on (et je dis \u00e7a sans conna\u00eetre tes ant\u00e9c\u00e9dents), il est tr\u00e8s peu probable que tu ailles en prison pour \u00e7a.",
|
264 |
+
"id": "246967893bf14f818abd779c7c1d18bd"
|
265 |
+
},
|
266 |
+
"07a05f094b074c84b19a6637261fdabc": {
|
267 |
+
"label": true,
|
268 |
+
"text": "op basis daarvan verzoeken om af te zien van het uitzenden.",
|
269 |
+
"id": "07a05f094b074c84b19a6637261fdabc"
|
270 |
+
},
|
271 |
+
"2b4276f61d014f1ca84c0cb5861bb312": {
|
272 |
+
"label": false,
|
273 |
+
"text": "Use a credit card like everyone else, instead of using your checking account.\n\nAlso, you are a grown-up, open your own damn checking & savings account. It takes about 20 minutes. Then, put your own money in it, and spend it as you wish.",
|
274 |
+
"id": "2b4276f61d014f1ca84c0cb5861bb312"
|
275 |
+
},
|
276 |
+
"36f6833a45d340b78a3909624fbfcc3b": {
|
277 |
+
"label": true,
|
278 |
+
"text": "\"EStimado Sr. Adam Miller, me sorprende su falta de fomalidad, como usted bien sabe esta empresa siempre ha cumplido con usted, incluso me dice que no tuvo vacaciones, cuando en realidad estuvo trabajando solo de 6 a 8, y el contrato era de 4 a 8. No obstante reconozco su val\u00eda profesional y mi intenci\u00f3n es seguir contando con usted este a\u00f1o en la Carlota, y durante el curso ir\u00e1 recibiendo m\u00e1s",
|
279 |
+
"id": "36f6833a45d340b78a3909624fbfcc3b"
|
280 |
+
},
|
281 |
+
"2bf2c86ce5494156b1bf7b86a2325a17": {
|
282 |
+
"label": false,
|
283 |
+
"text": "Answer #1: Most flea drops are meant for a single cat who can not lick the drops off themselves. When put on 11 cats they are going to ingest a ton of it licking it off their buddies. Odds are that is what happened and the primary groomers are the ones who got sick and died.Answer #2: Usually those medications are aimed to be used on one to two animals at a time, the assumption being that most",
|
284 |
+
"id": "2bf2c86ce5494156b1bf7b86a2325a17"
|
285 |
+
},
|
286 |
+
"12f4657613b543ed90caaf1d52808dc1": {
|
287 |
+
"label": true,
|
288 |
+
"text": "decentemente. As\u00ed que no debo haberlo hecho tan mal.",
|
289 |
+
"id": "12f4657613b543ed90caaf1d52808dc1"
|
290 |
+
},
|
291 |
+
"21b1929e322c41fbba05d71cdc143aa2": {
|
292 |
+
"label": true,
|
293 |
+
"text": "Porsi a distanza, quel tanto che basta per mettere bene a fuoco e osservare che le cose non sono esattamente come credevi che fossero.",
|
294 |
+
"id": "21b1929e322c41fbba05d71cdc143aa2"
|
295 |
+
},
|
296 |
+
"792ffd2c51ea4c119c0718dd19f2acae": {
|
297 |
+
"label": true,
|
298 |
+
"text": "Cinq \u00e9oliennes en plus, un nouveau souffle sur la plaine de l'Orbieu - http",
|
299 |
+
"id": "792ffd2c51ea4c119c0718dd19f2acae"
|
300 |
+
},
|
301 |
+
"cc5826b82642497790882e22667c69ba": {
|
302 |
+
"label": true,
|
303 |
+
"text": "Bestes Gef\u00fchl der Welt: Schuhe aus, barfu\u00df laufen",
|
304 |
+
"id": "cc5826b82642497790882e22667c69ba"
|
305 |
+
},
|
306 |
+
"ae1f9ba6bb9e41549cba2c4f3857dabf": {
|
307 |
+
"label": true,
|
308 |
+
"text": "Troco o time vermelho todo com super brindes chamados Miriam e Leo, e voc\u00eas devolvem o @user #MasterChefBR",
|
309 |
+
"id": "ae1f9ba6bb9e41549cba2c4f3857dabf"
|
310 |
+
},
|
311 |
+
"2585790e439d4ff0926f0b26bbfd6a43": {
|
312 |
+
"label": true,
|
313 |
+
"text": "Training: Starker Smog verz\u00f6gert letztes Training zum Indien-Rennen - Abendzeitung M\u00fcnchen: ... http #Muenchen #Munich",
|
314 |
+
"id": "2585790e439d4ff0926f0b26bbfd6a43"
|
315 |
+
},
|
316 |
+
"6f7883441aad4a0d9e32768e6400163d": {
|
317 |
+
"label": true,
|
318 |
+
"text": "#Encontro ACHO QUE EU AMEI MESMO DISTANTE! Que lindo Nando Reis...",
|
319 |
+
"id": "6f7883441aad4a0d9e32768e6400163d"
|
320 |
+
},
|
321 |
+
"df1084a871054de3970c251fb65b32e5": {
|
322 |
+
"label": true,
|
323 |
+
"text": "@user \u0641\u0631\u0646\u0633\u0627 ..\u0647\u0648\u0644\u0627\u0646\u062f \u0645\u0627\u0634\u0649 \u0648\u062c\u0649 \u0627\u0644\u064a\u0645\u0646 \u0627\u0644\u0645\u062a\u0637\u0631\u0641 \u0628\u0642\u064a\u0627\u062f\u0629 \u0645\u0627\u0631\u0649 \u0644\u0648\u0628\u0627\u0646 \u0632\u0649 \u062a\u0631\u0627\u0645\u0628 \u0643\u062f\u0647",
|
324 |
+
"id": "df1084a871054de3970c251fb65b32e5"
|
325 |
+
},
|
326 |
+
"58332759cf794aba857d927a14994b88": {
|
327 |
+
"label": true,
|
328 |
+
"text": "\u0646\u062c\u0631\u0627\u0646: \u0625\u0637\u0644\u0627\u0642 \u0635\u0644\u064a\u0629 \u0635\u0648\u0627\u0631\u064a\u062e \u0639\u0644\u0649 \u062a\u062c\u0645\u0639\u0627\u062a \u0644\u0640 #\u0627\u0644\u062c\u064a\u0634_\u0627\u0644\u0633\u0639\u0648\u062f\u064a \u0648\u0622\u0644\u064a\u0627\u062a\u0647 \u0641\u064a \u0645\u0648\u0642\u0639 \u0627\u0644\u0647\u0631\u0645 \u0645\u062d\u0642\u0642\u0629 \u0625\u0635\u0627\u0628\u0627\u062a \u0645\u0628\u0627\u0634\u0631\u0629 #\u0644\u0628\u0646\u0627\u0646_\u0627\u0644\u0622\u0646",
|
329 |
+
"id": "58332759cf794aba857d927a14994b88"
|
330 |
+
},
|
331 |
+
"32cbb4e235dc4206a5f00bf40e98857f": {
|
332 |
+
"label": true,
|
333 |
+
"text": "RT @user: Ich muss wieder mehr Beats machen. NEVER SLEEP CAUSE SLEEP IS THE CAUSE OF DEATH #nasvoice #music\u2026 http",
|
334 |
+
"id": "32cbb4e235dc4206a5f00bf40e98857f"
|
335 |
+
},
|
336 |
+
"e7f8abdf87db4a89b703d2ccc097adfa": {
|
337 |
+
"label": true,
|
338 |
+
"text": "@user heute kein neuer 40DOD Bericht :(",
|
339 |
+
"id": "e7f8abdf87db4a89b703d2ccc097adfa"
|
340 |
+
},
|
341 |
+
"3bd5a83708a84289b2af8edbb56de338": {
|
342 |
+
"label": false,
|
343 |
+
"text": "Lazy Sunday Ray Allen Is Lamar Odom really going to the D League ?http://t.co/w6juHFgR ",
|
344 |
+
"id": "3bd5a83708a84289b2af8edbb56de338"
|
345 |
+
},
|
346 |
+
"4ef05c2d3a3543bf8a1c0b25bde57e2a": {
|
347 |
+
"label": true,
|
348 |
+
"text": "mujhe to sidha daant padti thi . . . bacho k hath me paise nahi diye jate warna bigad jayenge :d :d",
|
349 |
+
"id": "4ef05c2d3a3543bf8a1c0b25bde57e2a"
|
350 |
+
},
|
351 |
+
"45311ee95c2e4cb1925d0040fb934f71": {
|
352 |
+
"label": true,
|
353 |
+
"text": "@user Oh ja, bitte mal Bescheid geben, wenn Helene Fischer dran ist!",
|
354 |
+
"id": "45311ee95c2e4cb1925d0040fb934f71"
|
355 |
+
},
|
356 |
+
"e719e02d417542b69b86d13ad7cad8ce": {
|
357 |
+
"label": false,
|
358 |
+
"text": "@user Hey, just thought I'd remind you it's Deezer's birthday tomorrow! Also, you have any idea what he looks like?\" ",
|
359 |
+
"id": "e719e02d417542b69b86d13ad7cad8ce"
|
360 |
+
},
|
361 |
+
"561960a730de49c58423a8bf85df3dd1": {
|
362 |
+
"label": true,
|
363 |
+
"text": "#MaisVoce sou muito f\u00e3 do Dan, nossa que del\u00edcia essa entrevista, adorei ele no filme \"tempos de paz\" @user \u00e9 um ser humano lindo!",
|
364 |
+
"id": "561960a730de49c58423a8bf85df3dd1"
|
365 |
+
},
|
366 |
+
"d6b6fa3a919d4770a6586e007be914bf": {
|
367 |
+
"label": true,
|
368 |
+
"text": "RT @user: \u201c@KusXAnke: Straks buikdansen w/ @user & haar nichten ! Was super!",
|
369 |
+
"id": "d6b6fa3a919d4770a6586e007be914bf"
|
370 |
+
},
|
371 |
+
"40b1078ffcec4a7da9899f0dd82f9d7f": {
|
372 |
+
"label": true,
|
373 |
+
"text": "#Grillo ieri a Conegliano ha detto che pagare le tasse e' giusto ma vuole conoscere la destinazione d'uso. Anche la Lega diceva cosi'...",
|
374 |
+
"id": "40b1078ffcec4a7da9899f0dd82f9d7f"
|
375 |
+
},
|
376 |
+
"758697fd8ebe4848af77a44757256203": {
|
377 |
+
"label": true,
|
378 |
+
"text": "#clouds #staatskanzlei #munich #m\u00fcnchen #blau #wolken #himmel #sky #silhouette #instagood\u2026 http",
|
379 |
+
"id": "758697fd8ebe4848af77a44757256203"
|
380 |
+
},
|
381 |
+
"95b69bc9b1214b14a600d7dfaea192f5": {
|
382 |
+
"label": true,
|
383 |
+
"text": "Depois desse #MaisVoce de hoje, se nadar der certo, eu viro #meseira",
|
384 |
+
"id": "95b69bc9b1214b14a600d7dfaea192f5"
|
385 |
+
},
|
386 |
+
"be6bad976b5e4d0c88e166aa583bd9cd": {
|
387 |
+
"label": true,
|
388 |
+
"text": "@user H\u00f6chst unbefriedigend...",
|
389 |
+
"id": "be6bad976b5e4d0c88e166aa583bd9cd"
|
390 |
+
},
|
391 |
+
"8cb329f9d5744ac6a52a4c3e823212c4": {
|
392 |
+
"label": false,
|
393 |
+
"text": "$15 minimum wage is a win-win. If businesses continue to boom, it's a win. Or, if it puts @user out of b\u2026 ",
|
394 |
+
"id": "8cb329f9d5744ac6a52a4c3e823212c4"
|
395 |
+
},
|
396 |
+
"c1542633006b4f62aaf7fd84b3962266": {
|
397 |
+
"label": true,
|
398 |
+
"text": "\u0638\u0647\u0648\u0631 \u0635\u0648\u0631 \u0633\u0627\u0639\u0629 HTC Halfbeak \u0628\u0646\u0638\u0627\u0645 \u0623\u0646\u062f\u0631\u0648\u064a\u062f \u0648\u064a\u0631http #\u0631\u064a\u0627\u0644_\u0645\u062f\u0631\u064a\u062f #\u0628\u0631\u0634\u0644\u0648\u0646\u0629",
|
399 |
+
"id": "c1542633006b4f62aaf7fd84b3962266"
|
400 |
+
},
|
401 |
+
"f2bb8836726d49a7ad23df371a877519": {
|
402 |
+
"label": true,
|
403 |
+
"text": "Heute nicht nur den Herzensmenschen geheiratet, sondern auch ganz viel Liebe von @user bekommen. Unbezahlbar <3",
|
404 |
+
"id": "f2bb8836726d49a7ad23df371a877519"
|
405 |
+
},
|
406 |
+
"e5d6b7cb7c25419fbd8cb29119c26577": {
|
407 |
+
"label": true,
|
408 |
+
"text": "Invention de la premi\u00e8re cellule solaire qui stocke l\u2019\u00e9lectricit\u00e9 http",
|
409 |
+
"id": "e5d6b7cb7c25419fbd8cb29119c26577"
|
410 |
+
},
|
411 |
+
"46ae96b8c72e48f8992bd716116dc761": {
|
412 |
+
"label": true,
|
413 |
+
"text": "#Mussi : \\\"#Grillo farebbe bene a spararsi nei coglioni\\\" . E poi dicono di #Grillo...",
|
414 |
+
"id": "46ae96b8c72e48f8992bd716116dc761"
|
415 |
+
},
|
416 |
+
"f4912b96e2d34a5db245de82a8f7a463": {
|
417 |
+
"label": true,
|
418 |
+
"text": "RT @user: \u0639\u062c\u0628\u0627 \u0644\u0645\u0646 \u062e\u0631\u062c \u0639\u0644\u0649 \u0641\u0633\u0627\u062f \u0645\u0628\u0627\u0631\u0643 \u0648\u064a\u0645\u062a\u0646\u0639 \u0639\u0646 \u0627\u0644\u062e\u0631\u0648\u062c \u0639\u0644\u0649 \u0627\u0644\u0642\u062a\u0644 \u0648\u0627\u0644\u0627\u0639\u062a\u0642\u0627\u0644 \u0648\u0627\u0644\u0641\u0633\u0627\u062f \u0648\u0627\u0644\u062a\u0642\u0634\u0641 \u0648\u0627\u0644\u0643\u0633\u0627\u062f \u0648\u0627\u0644\u062a\u0631\u062f\u0649#\u062b\u0648\u0631\u0648 #\u0633\u064a\u0633\u064a_\u062a\u0627\u0646\u064a_\u0644\u0627",
|
419 |
+
"id": "f4912b96e2d34a5db245de82a8f7a463"
|
420 |
+
},
|
421 |
+
"35f179c9260b4a22a9ac5d11fc9e81ad": {
|
422 |
+
"label": true,
|
423 |
+
"text": "Valls 2 : apr\u00e8s les \u00e9cologistes, le socialisme quitte le gouvernement | Les Jeunes Ecologistes http #EELV @user",
|
424 |
+
"id": "35f179c9260b4a22a9ac5d11fc9e81ad"
|
425 |
+
},
|
426 |
+
"1e649a8d62a54b13b1f7cb8297473147": {
|
427 |
+
"label": true,
|
428 |
+
"text": "Ich werde gerade dezent nicht wach...",
|
429 |
+
"id": "1e649a8d62a54b13b1f7cb8297473147"
|
430 |
+
},
|
431 |
+
"22548ea5d02848c6990303a6fed08189": {
|
432 |
+
"label": true,
|
433 |
+
"text": "bhai totlly phadu super hero h nagraj ... hollywood ki trh bollywood me bhi inki muvies bnni chahiye ... wese doga ki new muvi bn rhi h nxt year tk ajayegi .... ",
|
434 |
+
"id": "22548ea5d02848c6990303a6fed08189"
|
435 |
+
},
|
436 |
+
"0955950e8a914ee69e580275c5c3f34b": {
|
437 |
+
"label": true,
|
438 |
+
"text": "...il fatto che Di Pietro non sia d'accordo su Mario Monti Premier conferma senza ombra di dubbio che sia la scelta giusta ! Cit. Gabri",
|
439 |
+
"id": "0955950e8a914ee69e580275c5c3f34b"
|
440 |
+
},
|
441 |
+
"87639635c3d14e50a227344cfcab7345": {
|
442 |
+
"label": true,
|
443 |
+
"text": "project ki deadline par daudte the , baki time to \" are kar lenge bahut time he \" . . . . . . . . . . ",
|
444 |
+
"id": "87639635c3d14e50a227344cfcab7345"
|
445 |
+
},
|
446 |
+
"560917c067374423bece98c64a628fbe": {
|
447 |
+
"label": false,
|
448 |
+
"text": "The decision to recount votes in Wisconsin is a joke. Leftists are still spotting-the-dummy of their loss. #TrumpTransition ",
|
449 |
+
"id": "560917c067374423bece98c64a628fbe"
|
450 |
+
},
|
451 |
+
"a4c1fc4296c14ccf95989796747f8753": {
|
452 |
+
"label": false,
|
453 |
+
"text": "#TOLOnews TOLOnews 08 October 2012: Top news in this Bulletin: The International Committee of the ... #Afghanistan ",
|
454 |
+
"id": "a4c1fc4296c14ccf95989796747f8753"
|
455 |
+
},
|
456 |
+
"2a188e097cc044458eb1a6e6d39114f4": {
|
457 |
+
"label": true,
|
458 |
+
"text": "\u201c@gabrielepinese: #Grillo fa bene a evitare la tv\\\" a dire il vero non fa mai neanche un contraddittorio neanche via web che lui ama tanto",
|
459 |
+
"id": "2a188e097cc044458eb1a6e6d39114f4"
|
460 |
+
},
|
461 |
+
"1c00eacdad1645e99574d588e893e7ea": {
|
462 |
+
"label": true,
|
463 |
+
"text": "oi, queria saber por que existe tanta manifesta\u00e7\u00e3o contraria a criminaliza\u00e7\u00e3o da homofobia e quem s\u00e3o os principais opositores? #encontro",
|
464 |
+
"id": "1c00eacdad1645e99574d588e893e7ea"
|
465 |
+
},
|
466 |
+
"1baa09b1506f46aea084875bd4148d33": {
|
467 |
+
"label": true,
|
468 |
+
"text": "Non sono forse titolato a dirlo.... Ma professor #monti mi ha deluso \u00e8 sicuramente capace ma le \u00e8 mancato il coraggio di vere scelte....",
|
469 |
+
"id": "1baa09b1506f46aea084875bd4148d33"
|
470 |
+
},
|
471 |
+
"629b60cc494444e295154826367bf5df": {
|
472 |
+
"label": true,
|
473 |
+
"text": "pdai kaisi chl rhi h ",
|
474 |
+
"id": "629b60cc494444e295154826367bf5df"
|
475 |
+
},
|
476 |
+
"175810d344da4dcf8718aac980264599": {
|
477 |
+
"label": true,
|
478 |
+
"text": "#Grillo non \u00e8 il peggiore dei mali, e se tra mali devo scegliere lui \u00e8 quello minore, o quello di cui ancora non ho sofferto...",
|
479 |
+
"id": "175810d344da4dcf8718aac980264599"
|
480 |
+
},
|
481 |
+
"96ff52f3b004487dbe23dd0bfb253890": {
|
482 |
+
"label": true,
|
483 |
+
"text": "\u30e9\u30a4\u30c8\u3092\u63a2\u3057\u3066\u308b",
|
484 |
+
"id": "96ff52f3b004487dbe23dd0bfb253890"
|
485 |
+
},
|
486 |
+
"26cdcf37c3ab4d9a9eafc5826c1258f4": {
|
487 |
+
"label": true,
|
488 |
+
"text": "\u4e2d\u5fc3",
|
489 |
+
"id": "26cdcf37c3ab4d9a9eafc5826c1258f4"
|
490 |
+
},
|
491 |
+
"caa57d9500934e5db76b08b1714a48d8": {
|
492 |
+
"label": true,
|
493 |
+
"text": "\u4f55\u304b\u5f15\u3063\u304b\u304b\u308a\u307e\u3057\u305f",
|
494 |
+
"id": "caa57d9500934e5db76b08b1714a48d8"
|
495 |
+
},
|
496 |
+
"e692438f7dbd4913a3dc46c4d9c52bcd": {
|
497 |
+
"label": true,
|
498 |
+
"text": "\u5e78\u904b\u306b\u3082",
|
499 |
+
"id": "e692438f7dbd4913a3dc46c4d9c52bcd"
|
500 |
+
},
|
501 |
+
"39612fd3901149beacb9b5acc7fe3dfc": {
|
502 |
+
"label": true,
|
503 |
+
"text": "\u3082\u3046\uff01",
|
504 |
+
"id": "39612fd3901149beacb9b5acc7fe3dfc"
|
505 |
+
},
|
506 |
+
"7401e3a8c6294ce0a8d2cfe3201e1cd0": {
|
507 |
+
"label": true,
|
508 |
+
"text": "\u53ce\u5bb9\u6240\u3067\u30ec\u30a4\u3068\u5c45\u305f\u8005\u306b...",
|
509 |
+
"id": "7401e3a8c6294ce0a8d2cfe3201e1cd0"
|
510 |
+
},
|
511 |
+
"455070045f3a49b193d679118d5265a5": {
|
512 |
+
"label": true,
|
513 |
+
"text": "\u304a\u524d\u306f\u4f55\u8005\u3060\uff1f",
|
514 |
+
"id": "455070045f3a49b193d679118d5265a5"
|
515 |
+
},
|
516 |
+
"3378c04edaab407388f2d364b9d39218": {
|
517 |
+
"label": true,
|
518 |
+
"text": "\u3042\u3089\u3001\u9b45\u529b\u7684\u3002",
|
519 |
+
"id": "3378c04edaab407388f2d364b9d39218"
|
520 |
+
},
|
521 |
+
"cdd47a2fa483487e8e29c7fab7d142ad": {
|
522 |
+
"label": true,
|
523 |
+
"text": "\u304a\u524d\u306e\u5973\u306b\u306a\u3093\u304b\u306b\u306a\u3089\u306a\u3044\uff01",
|
524 |
+
"id": "cdd47a2fa483487e8e29c7fab7d142ad"
|
525 |
+
},
|
526 |
+
"fa55a10d8c564d2880f14c8f9aba86bd": {
|
527 |
+
"label": true,
|
528 |
+
"text": "\u63a5\u7d9a\u6e08\u307f: %1, [%2], %3",
|
529 |
+
"id": "fa55a10d8c564d2880f14c8f9aba86bd"
|
530 |
+
},
|
531 |
+
"54d13a0d6ade412bb8f5c3b534b995ba": {
|
532 |
+
"label": true,
|
533 |
+
"text": "\u305d\u3046\u3044\u3046\u3053\u3068\u306d",
|
534 |
+
"id": "54d13a0d6ade412bb8f5c3b534b995ba"
|
535 |
+
},
|
536 |
+
"53d3699c466d4428a0b6abbc497ed83f": {
|
537 |
+
"label": true,
|
538 |
+
"text": "\u554f\u984c\uff1f \u90e8\u9577\u3001\u7dca\u6025\u4e8b\u614b\u3067\u3059",
|
539 |
+
"id": "53d3699c466d4428a0b6abbc497ed83f"
|
540 |
+
},
|
541 |
+
"cbc53f06cfd947bb80f51035a08f7333": {
|
542 |
+
"label": true,
|
543 |
+
"text": "\u4f55\u304c\u597d\u304d\uff1f",
|
544 |
+
"id": "cbc53f06cfd947bb80f51035a08f7333"
|
545 |
+
},
|
546 |
+
"8fe58a19fad845dfa3929b32ddaae4a4": {
|
547 |
+
"label": true,
|
548 |
+
"text": "\u30cf\u30ed\u30eb\u30c9\u306e\u5b50\u3060\u3068 \u77e5\u3063\u3066\u3044\u305f\u3060\u308d\u3046 \u3068",
|
549 |
+
"id": "8fe58a19fad845dfa3929b32ddaae4a4"
|
550 |
+
},
|
551 |
+
"b96fea844e3f4ac9b876f4de3ab2cc05": {
|
552 |
+
"label": true,
|
553 |
+
"text": "\u9055\u3046\u3000\u9055\u3046\u3000\u50d5\u3058\u3083\u306a\u3044\uff01",
|
554 |
+
"id": "b96fea844e3f4ac9b876f4de3ab2cc05"
|
555 |
+
},
|
556 |
+
"d176c2a4730043edb0ffbd1dd604710f": {
|
557 |
+
"label": true,
|
558 |
+
"text": "\u4f5c\u6226\u306b\u4f7f\u3063\u305f\u8eca\u306f\u4f55?",
|
559 |
+
"id": "d176c2a4730043edb0ffbd1dd604710f"
|
560 |
+
},
|
561 |
+
"2c3983f02d0344d58c4c1624e380f699": {
|
562 |
+
"label": false,
|
563 |
+
"text": "(c) 2000-2008, The KDE Team (c) 2003-2005, Klaus Niederkr\u00fcger (c) 1996-2000, Bernd Johannes Wuebben",
|
564 |
+
"id": "2c3983f02d0344d58c4c1624e380f699"
|
565 |
+
},
|
566 |
+
"923a3c65d5d544f483d56a0295ea2960": {
|
567 |
+
"label": true,
|
568 |
+
"text": "\u049a\u043e\u044e\u041a\u04e9\u043a\u0448\u0456\u043b\u0421\u04b1\u0440color",
|
569 |
+
"id": "923a3c65d5d544f483d56a0295ea2960"
|
570 |
+
},
|
571 |
+
"6d141199809f446b802cbaec666cb227": {
|
572 |
+
"label": false,
|
573 |
+
"text": "This is a searchable index. Enter search keywords:",
|
574 |
+
"id": "6d141199809f446b802cbaec666cb227"
|
575 |
+
},
|
576 |
+
"e1f1fda5f08041feb4a00f30c058b37d": {
|
577 |
+
"label": false,
|
578 |
+
"text": "And some people are suggesting that he's joined up with Obote's exiles.",
|
579 |
+
"id": "e1f1fda5f08041feb4a00f30c058b37d"
|
580 |
+
},
|
581 |
+
"1024bbe7dd53462e99c7cbd502975062": {
|
582 |
+
"label": true,
|
583 |
+
"text": "\u049a\u0430\u0441\u0438\u0435\u0442\u0442\u0435\u0440\u0456Comment",
|
584 |
+
"id": "1024bbe7dd53462e99c7cbd502975062"
|
585 |
+
},
|
586 |
+
"b6ec30c58ba741d6ad8283c5ad902dfa": {
|
587 |
+
"label": true,
|
588 |
+
"text": "\u041a\u0456\u0440\u0456\u0441 \u0434\u0435\u0440\u0435\u043a\u0442\u0435\u0440 \u0444\u0430\u0439\u043b\u044b",
|
589 |
+
"id": "b6ec30c58ba741d6ad8283c5ad902dfa"
|
590 |
+
},
|
591 |
+
"15f7d3f7557743df8502ad1e27a0ec73": {
|
592 |
+
"label": false,
|
593 |
+
"text": "PythonLanguage",
|
594 |
+
"id": "15f7d3f7557743df8502ad1e27a0ec73"
|
595 |
+
},
|
596 |
+
"da4a227e77314496b91ad060c2fe0418": {
|
597 |
+
"label": true,
|
598 |
+
"text": "\u0416\u043e\u0493\u0430\u0440\u044b\u041a\u043e\u043d\u0442\u0440\u0430\u0441\u0442Comment",
|
599 |
+
"id": "da4a227e77314496b91ad060c2fe0418"
|
600 |
+
},
|
601 |
+
"903074456936489186d4882d7267abfb": {
|
602 |
+
"label": true,
|
603 |
+
"text": "\u0416\u0430\u0443\u0430\u043f \u043c\u04d9\u0442\u0456\u043d\u0456\u043d\u0434\u0435 \u043a\u0435\u043b\u0435\u0441\u0456 \u0430\u0439\u043d\u044b\u043c\u0430\u043b\u044b\u043b\u0430\u0440 \u049b\u043e\u043b\u0434\u0430\u043d\u044b\u043b\u0430\u0434\u044b:% NAME =\u0436\u0456\u0431\u0435\u0440\u0443\u0448\u0456\u043d\u0456\u04a3 \u0430\u0442\u044b,% EMAIL =\u0436\u0456\u0431\u0435\u0440\u0443\u0448\u0456\u043d\u0456\u04a3 \u044d\u043b. \u043f\u043e\u0448\u0442\u0430 \u0430\u0434\u0440\u0435\u0441\u0456",
|
604 |
+
"id": "903074456936489186d4882d7267abfb"
|
605 |
+
},
|
606 |
+
"bbda9c3b01e543afb4009eb7f262b822": {
|
607 |
+
"label": false,
|
608 |
+
"text": "CORREL( A1: A3; B1: B3)",
|
609 |
+
"id": "bbda9c3b01e543afb4009eb7f262b822"
|
610 |
+
},
|
611 |
+
"47781ac273574ad0bc1e15b40ba9f6d3": {
|
612 |
+
"label": true,
|
613 |
+
"text": "\u041e\u0440\u0430\u043d\u0434\u0430\u043b\u0443\u0434\u0430",
|
614 |
+
"id": "47781ac273574ad0bc1e15b40ba9f6d3"
|
615 |
+
},
|
616 |
+
"9f98bff4c92a4f09a1502f4256c485ae": {
|
617 |
+
"label": true,
|
618 |
+
"text": "\u0411\u043e\u0437\u0430\u04a3\u041a\u04af\u043b\u0433\u0456\u043d\u049a\u044b\u0437\u044b\u043b3color",
|
619 |
+
"id": "9f98bff4c92a4f09a1502f4256c485ae"
|
620 |
+
},
|
621 |
+
"ccf5d3c35fdd45fe80f9cd5488e52dd6": {
|
622 |
+
"label": true,
|
623 |
+
"text": "\u040f\u0435\u0441, \u043e\u0434\u0438!",
|
624 |
+
"id": "ccf5d3c35fdd45fe80f9cd5488e52dd6"
|
625 |
+
},
|
626 |
+
"5c0d9b1b7d864949ac1e3352067598e4": {
|
627 |
+
"label": true,
|
628 |
+
"text": "\u0422\u043e\u0433\u0430\u0448 \u0442\u0430\u0430 \u0431\u0438\u043b\u0430 \u043f\u0440\u0435\u0441\u0440\u0435\u0442\u043d\u0430\u0442\u0430 \u043d\u0430\u0434\u0432\u043e\u0440 \u043e\u0434 \u043e\u0431\u043b\u0430\u0441\u0442\u0430 \u0410\u0458\u0430\u043c\u043a\u0443 \u0413\u0430\u0434\u043e\u043d\u0433 \u0438 \u0442\u043e\u0430 \u0435 \u043e\u043d\u0430 \u0448\u0442\u043e \u0441\u0435 \u0441\u043b\u0443\u0447\u0438\u043b\u043e.",
|
629 |
+
"id": "5c0d9b1b7d864949ac1e3352067598e4"
|
630 |
+
},
|
631 |
+
"462c4fa658da42599217c7e857ea93b9": {
|
632 |
+
"label": false,
|
633 |
+
"text": "The Convergence Reports issued by the EC and the ECB on Wednesday said the two countries fulfilled the membership criteria, including inflation rate, government finance, exchange rate and long- term interest rates.",
|
634 |
+
"id": "462c4fa658da42599217c7e857ea93b9"
|
635 |
+
},
|
636 |
+
"9ed529642c114ee8a8e1634bf8d4275a": {
|
637 |
+
"label": true,
|
638 |
+
"text": "53-\u0433\u043e\u0434\u0438\u0448\u043d\u0438\u043e\u0442 \u0408\u0443\u0440\u0447\u0438\u045c \u0435 \u043f\u0440\u043e\u0444\u0435\u0441\u043e\u0440 \u043f\u043e \u0435\u043a\u043e\u043d\u043e\u043c\u0438\u0458\u0430 \u043d\u0430 \u0423\u043d\u0438\u0432\u0435\u0440\u0437\u0438\u0442\u0435\u0442\u043e\u0442 \u0432\u043e \u0417\u0430\u0433\u0440\u0435\u0431 \u0438 \u0431\u0435\u0448\u0435 \u043c\u0438\u043d\u0438\u0441\u0442\u0435\u0440 \u0437\u0430 \u0444\u0438\u043d\u0430\u043d\u0441\u0438\u0438 \u043e\u0434 2000-\u0442\u0430 \u0434\u043e 2003-\u0442\u0430 \u0433\u043e\u0434\u0438\u043d\u0430.",
|
639 |
+
"id": "9ed529642c114ee8a8e1634bf8d4275a"
|
640 |
+
},
|
641 |
+
"6b507d9297c44a44b492beaa05a743c2": {
|
642 |
+
"label": true,
|
643 |
+
"text": "\u041f\u0430, \u0437\u0430 \u0432\u0430\u0441 \u0442\u0438\u043d\u0435\u0458\u045f\u0435\u0440\u0438 \u043a\u043e\u0438 \u043f\u0430\u0442\u0438\u0442\u0435 \u043e\u0434 \u0421\u041d\u0412...",
|
644 |
+
"id": "6b507d9297c44a44b492beaa05a743c2"
|
645 |
+
},
|
646 |
+
"d9e8d8e96a494171bc349cdc843bef65": {
|
647 |
+
"label": true,
|
648 |
+
"text": "\u0413\u0443\u0431\u0438 \u043c\u0438 \u0441\u0435 \u043e\u0434 \u043f\u0430\u0442\u043e\u0442!",
|
649 |
+
"id": "d9e8d8e96a494171bc349cdc843bef65"
|
650 |
+
},
|
651 |
+
"c4b56ee00c1343db9c95693493ba85e4": {
|
652 |
+
"label": true,
|
653 |
+
"text": "\u0412\u0430\u0436\u043d\u0435\u0439\u0448\u0435\u0435 \u043c\u0435\u0441\u0442\u043e \u0432 \u043f\u043e\u0432\u0435\u0441\u0442\u043a\u0435 \u0434\u043d\u044f \u0421\u043e\u0432\u0435\u0442\u0430 \u043f\u043e-\u043f\u0440\u0435\u0436\u043d\u0435\u043c\u0443 \u0437\u0430\u043d\u0438\u043c\u0430\u043b\u0438 \u0432\u043e\u043f\u0440\u043e\u0441\u044b, \u043a\u0430\u0441\u0430\u044e\u0449\u0438\u0435\u0441\u044f \u0410\u0444\u0440\u0438\u043a\u0438.",
|
654 |
+
"id": "c4b56ee00c1343db9c95693493ba85e4"
|
655 |
+
},
|
656 |
+
"525e90725cb147e9a5474613924f2dc5": {
|
657 |
+
"label": true,
|
658 |
+
"text": "\u0438 \u041e\u0431\u044a\u0435\u0434\u0438\u043d\u0435\u043d\u043d\u044b\u043c\u0438 \u0410\u0440\u0430\u0431\u0441\u043a\u0438\u043c\u0438 \u042d\u043c\u0438\u0440\u0430\u0442\u0430\u043c\u0438 (1991 \u0433\u043e\u0434)",
|
659 |
+
"id": "525e90725cb147e9a5474613924f2dc5"
|
660 |
+
},
|
661 |
+
"98a3f90eafd642779ebeb30ccb68dbee": {
|
662 |
+
"label": false,
|
663 |
+
"text": "MDA reached 1.3m",
|
664 |
+
"id": "98a3f90eafd642779ebeb30ccb68dbee"
|
665 |
+
},
|
666 |
+
"0d36a456b4244ff3841a222efac7da99": {
|
667 |
+
"label": true,
|
668 |
+
"text": "i) \u043e\u0431\u0440\u0430\u0431\u0430\u0442\u044b\u0432\u0430\u043b\u0438\u0441\u044c, \u0441\u043e\u0431\u0438\u0440\u0430\u043b\u0438\u0441\u044c, \u0442\u0440\u0430\u043d\u0441\u043f\u043e\u0440\u0442\u0438\u0440\u043e\u0432\u0430\u043b\u0438\u0441\u044c \u0438 \u0445\u0440\u0430\u043d\u0438\u043b\u0438\u0441\u044c \u044d\u043a\u043e\u043b\u043e\u0433\u0438\u0447\u0435\u0441\u043a\u0438 \u0431\u0435\u0437\u043e\u043f\u0430\u0441\u043d\u044b\u043c \u043e\u0431\u0440\u0430\u0437\u043e\u043c;",
|
669 |
+
"id": "0d36a456b4244ff3841a222efac7da99"
|
670 |
+
},
|
671 |
+
"56e27d24b0b04e52bbb1de4be037602c": {
|
672 |
+
"label": true,
|
673 |
+
"text": "\u0415\u0435 \u0438\u043d\u0442\u0435\u0440\u0435\u0441\u0443\u0435\u0442, \u043f\u043b\u0430\u043d\u0438\u0440\u0443\u0435\u0442\u0441\u044f \u043b\u0438 \u043f\u0440\u0435\u0434\u043e\u0441\u0442\u0430\u0432\u0438\u0442\u044c \u0442\u0430\u043a\u0438\u0435 \u0436\u0435 \u043f\u0440\u0430\u0432\u0430 \u0436\u0435\u043d\u0449\u0438\u043d\u0430\u043c, \u0441\u043e\u0441\u0442\u043e\u044f\u0449\u0438\u043c \u0432 \u0431\u0440\u0430\u043a\u0435 \u0434\u0435 \u0444\u0430\u043a\u0442\u043e, \u0438 \u0432\u043a\u043b\u044e\u0447\u0435\u043d\u044b \u043b\u0438 \u043f\u043e\u043b\u043e\u0436\u0435\u043d\u0438\u044f, \u043f\u0440\u0435\u0434\u0443\u0441\u043c\u0430\u0442\u0440\u0438\u0432\u0430\u044e\u0449\u0438\u0435 \u0432\u044b\u043f\u043b\u0430\u0442\u0443 \u0430\u043b\u0438\u043c\u0435\u043d\u0442\u043e\u0432 \u0441\u0443\u043f\u0440\u0443\u0433\u0443, \u043d\u0430\u0445\u043e\u0434\u044f\u0449\u0435\u043c\u0443\u0441\u044f \u0432 \u043c\u0435\u043d\u0435\u0435 \u0431\u043b\u0430\u0433\u043e\u043f\u0440\u0438\u044f\u0442\u043d\u043e\u043c \u043f\u043e\u043b\u043e\u0436\u0435\u043d\u0438\u0438.",
|
674 |
+
"id": "56e27d24b0b04e52bbb1de4be037602c"
|
675 |
+
},
|
676 |
+
"1654d6e38f1c4a959a8e7e64867c5f73": {
|
677 |
+
"label": true,
|
678 |
+
"text": "\u5f88\u62b1\u6b49\u8ba9\u4f60\u4e45\u7b49",
|
679 |
+
"id": "1654d6e38f1c4a959a8e7e64867c5f73"
|
680 |
+
},
|
681 |
+
"23102d4274b34d94823d9d5791f7007a": {
|
682 |
+
"label": true,
|
683 |
+
"text": "141\u653f",
|
684 |
+
"id": "23102d4274b34d94823d9d5791f7007a"
|
685 |
+
},
|
686 |
+
"668c79418de44d50919623f76bba1526": {
|
687 |
+
"label": true,
|
688 |
+
"text": "\u6211\u5728\u8fd9\u91cc\u624d\u80fd\u505a\u771f\u6b63\u7684\u81ea\u5df1 Where I can be who I am,",
|
689 |
+
"id": "668c79418de44d50919623f76bba1526"
|
690 |
+
},
|
691 |
+
"fc5c14db340041af907312914e4b7a25": {
|
692 |
+
"label": true,
|
693 |
+
"text": "\u8bf4\u5440",
|
694 |
+
"id": "fc5c14db340041af907312914e4b7a25"
|
695 |
+
},
|
696 |
+
"c62d0f67fec04d7b93dd2ed0d1c67448": {
|
697 |
+
"label": true,
|
698 |
+
"text": "\u4ed6\u6709\u5ba1\u7406\u8fc7\u5f3a\u5978\u3001\u51f6\u6740\u548c\u5176\u4ed6\u4e25\u91cd\u7684\u66b4\u529b\u548c\u6027\u653b\u51fb\u7b49\u6848\u4ef6\u7684\u7ecf\u9a8c\u3002",
|
699 |
+
"id": "c62d0f67fec04d7b93dd2ed0d1c67448"
|
700 |
+
},
|
701 |
+
"5053e48dcf6748669b3d47ff5b537772": {
|
702 |
+
"label": true,
|
703 |
+
"text": "Pse nuk mund te kerkoje dot nje falje dhe cdo gje do te ishte rregulluar por mban inatin sikur e kisha une fajin. \nNuk me vjen mire qe ndahemi te zemeruar me njeri-tjetrin.\n\nte dua,\nMonika",
|
704 |
+
"id": "5053e48dcf6748669b3d47ff5b537772"
|
705 |
+
},
|
706 |
+
"c14e863d2afa452a8fe563c0e2f14b50": {
|
707 |
+
"label": true,
|
708 |
+
"text": "Me ke bere shume merak se nuk arrij ta kuptos se ku je tani. Te lutem mos me bej merak keshtu. Koli me degjon. Te lutem me informo se ku je. Nuk eshte menyre e mire kjo te mbash inat me mua.\n\npergjigju sa me shpejt\nte dua\nMOnika",
|
709 |
+
"id": "c14e863d2afa452a8fe563c0e2f14b50"
|
710 |
+
},
|
711 |
+
"33808b705c7241b789f60e4feea42289": {
|
712 |
+
"label": false,
|
713 |
+
"text": "\nAs we discussed at the Board meeting last week, the impetus for a single\nmaster agreement will need to come from several fronts, but especially from\nwithin each of your firms. The various trade associations will be most\nresponsive to the idea if they are hearing strong support for a single\nagreement from decision-making levels within member firms. We will",
|
714 |
+
"id": "33808b705c7241b789f60e4feea42289"
|
715 |
+
},
|
716 |
+
"7d3df48ed3c44324ac8814049ab5c581": {
|
717 |
+
"label": false,
|
718 |
+
"text": "Straightforward? Yes. Easy to accomplish? No. ",
|
719 |
+
"id": "7d3df48ed3c44324ac8814049ab5c581"
|
720 |
+
},
|
721 |
+
"5fbf30f5097747eda8ae327aeba95443": {
|
722 |
+
"label": true,
|
723 |
+
"text": "Pse nuk me puthe sot kur u ndame? Ti e di qe une te dua shume dhe dua qe ti ulim nervat shpejt. Une u nevrikosa pasi nuk e duroj dot fjalorin e keq dhe dua qe ta heqim te dy, edhe ti edhe une. Por ti nuk e kupton se sa e rendesishme eshte per mua nje dicka e tille, qe ne te punojme te dy per te hequr nje ves te keq qe kemi. ",
|
724 |
+
"id": "5fbf30f5097747eda8ae327aeba95443"
|
725 |
+
},
|
726 |
+
"8b30620eaa104c3699c64201b7a94f53": {
|
727 |
+
"label": true,
|
728 |
+
"text": "\u6226\u4e89\u304c\u4e00\u523b\u3082\u65e9\u304f\u96c6\u7d50\u3057\u3066\u304f\u308c\u308b\u3068\u3044\u3044\u3067\u3059\u306d\u3002\n\u4eba\u985e\u304c\u5b87\u5b99\u306b\u9032\u51fa\u3057\u3066\u3001\u4ed6\u60d1\u661f\u7a2e\u65cf\u306b\u306a\u308b\u3068\u304d\u3001\nOpenAssistant\u304c\u305d\u306e\u508d\u3067\u304a\u624b\u4f1d\u3044\u3067\u304d\u308b\u3053\u3068\u3092\u671b\u3093\u3067\u3044\u307e\u3059\uff01",
|
729 |
+
"id": "8b30620eaa104c3699c64201b7a94f53"
|
730 |
+
},
|
731 |
+
"9b3093964b3e4658a95de453fdd10e40": {
|
732 |
+
"label": true,
|
733 |
+
"text": "En general, en un sistema num\u00e9rico posicional con base b, el peso de cada s\u00edmbolo en un n\u00famero es igual a b elevado a la posici\u00f3n del s\u00edmbolo en el n\u00famero, empezando a contar desde cero. Por ejemplo, en el sistema binario (que tiene una base de 2), el n\u00famero 10000110 se puede escribir como 1x27+0x26+0x25+0x24+0x23+1x22+1x21+0x20, lo que equivale a 1x128+0x64+0x32+0x16+0x8+1x4+1x2+0x1.",
|
734 |
+
"id": "9b3093964b3e4658a95de453fdd10e40"
|
735 |
+
},
|
736 |
+
"ab2a743350d54c46ba035afafbae6b17": {
|
737 |
+
"label": false,
|
738 |
+
"text": "5. Avoid multitasking. If your schoolwork has several parts, try to only work on one part at a time. This eliminates having to switch back and forth, leading you to starting over every time you switch tasks.",
|
739 |
+
"id": "ab2a743350d54c46ba035afafbae6b17"
|
740 |
+
},
|
741 |
+
"cab7b1a9183042d8aff7fe8290dda6d2": {
|
742 |
+
"label": false,
|
743 |
+
"text": "Sure! Here are five creative Facebook posts targeting food lovers for daily lunch specials:\n\n- \"Feast like a king on a budget! Our daily lunch specials are only $7.99 and will leave your taste buds feeling royal. Join us Monday - Friday from 11am - 3pm and treat yourself to a delicious meal. #LunchSpecials #FoodieHeaven #BudgetFriendly\"",
|
744 |
+
"id": "cab7b1a9183042d8aff7fe8290dda6d2"
|
745 |
+
},
|
746 |
+
"0e56d5aaef8d48dd95e124c1dbf5f29d": {
|
747 |
+
"label": true,
|
748 |
+
"text": "4. Advertir obst\u00e1culos: Si hay un obst\u00e1culo en el camino, como un bache o un objeto en la v\u00eda, es importante se\u00f1alarlo para que los dem\u00e1s ciclistas puedan evitarlo. Esto lo puedes hacer se\u00f1alando con tu mano hacia abajo y hacia el lado en el que se encuentra el obst\u00e1culo.",
|
749 |
+
"id": "0e56d5aaef8d48dd95e124c1dbf5f29d"
|
750 |
+
},
|
751 |
+
"9da2c8a12cdc41c59032674ed188e5ab": {
|
752 |
+
"label": true,
|
753 |
+
"text": "\u0420\u0430\u0434\u0430, \u0447\u0442\u043e \u0441\u043c\u043e\u0433\u043b\u0430 \u043f\u043e\u043c\u043e\u0447\u044c! \u0415\u0441\u043b\u0438 \u0443 \u0432\u0430\u0441 \u0431\u0443\u0434\u0443\u0442 \u0435\u0449\u0435 \u043a\u0430\u043a\u0438\u0435-\u043b\u0438\u0431\u043e \u0432\u043e\u043f\u0440\u043e\u0441\u044b, \u043e\u0431\u044f\u0437\u0430\u0442\u0435\u043b\u044c\u043d\u043e \u043f\u0438\u0448\u0438\u0442\u0435. \u0421 \u0443\u0434\u043e\u0432\u043e\u043b\u044c\u0441\u0442\u0432\u0438\u0435\u043c \u043e\u0442\u0432\u0435\u0447\u0443!",
|
754 |
+
"id": "9da2c8a12cdc41c59032674ed188e5ab"
|
755 |
+
},
|
756 |
+
"22f7c478a3ad463380227fef1f89687a": {
|
757 |
+
"label": false,
|
758 |
+
"text": "The process of galaxy formation is thought to have begun in the early universe, shortly after the Big Bang. As the universe expanded and cooled, small fluctuations in the distribution of matter caused regions of the universe to become denser. These dense regions then became the seeds for the formation of galaxies.",
|
759 |
+
"id": "22f7c478a3ad463380227fef1f89687a"
|
760 |
+
},
|
761 |
+
"38884d61983a4d1d9e678d41ece4e41e": {
|
762 |
+
"label": true,
|
763 |
+
"text": "\u00a1Por supuesto! Aqu\u00ed te dejo la receta para preparar leche asada, un postre t\u00edpico de la gastronom\u00eda peruana:\n\nIngredientes:\n\n1 litro de leche entera\n1 taza de az\u00facar\n6 huevos\n1 cucharadita de esencia de vainilla\nCanela en polvo al gusto\nInstrucciones:",
|
764 |
+
"id": "38884d61983a4d1d9e678d41ece4e41e"
|
765 |
+
},
|
766 |
+
"8fca4aa007c44447b923e6cc64a7c320": {
|
767 |
+
"label": true,
|
768 |
+
"text": "\u041a\u0442\u043e \u0442\u0430\u043a\u043e\u0439 \u043f\u0443\u0442\u0438\u043d?",
|
769 |
+
"id": "8fca4aa007c44447b923e6cc64a7c320"
|
770 |
+
},
|
771 |
+
"1ab2747206ad465c9559b43605097469": {
|
772 |
+
"label": true,
|
773 |
+
"text": "\u306a\u308b\u307b\u3069\u3001\u78ba\u304b\u306b\u732b\u3061\u3083\u3093\u306f\u5168\u3066\u306e\u8981\u7d20\u304c\u611b\u304f\u308b\u3057\u3044\u3067\u3059\u3088\u306d\uff01",
|
774 |
+
"id": "1ab2747206ad465c9559b43605097469"
|
775 |
+
},
|
776 |
+
"80df07bae22f450c878651fda513d458": {
|
777 |
+
"label": false,
|
778 |
+
"text": "It's believed that galaxies continue to evolve and change over time, through processes such as merging with other galaxies, the formation of new stars, and the movement of stars and gas within the galaxy.",
|
779 |
+
"id": "80df07bae22f450c878651fda513d458"
|
780 |
+
},
|
781 |
+
"6cf7dc42850f4f0298edf5679fd1accf": {
|
782 |
+
"label": false,
|
783 |
+
"text": "Writing an operating system from scratch is a complex and challenging project, but it can be a rewarding experience and a great way to learn about the inner workings of computers. If you have any questions or need help with specific parts of the process, don't hesitate to ask.",
|
784 |
+
"id": "6cf7dc42850f4f0298edf5679fd1accf"
|
785 |
+
},
|
786 |
+
"2435940d95e14c5eb7c2aba59ec9216d": {
|
787 |
+
"label": true,
|
788 |
+
"text": "\u0414\u0443\u043c\u0430\u044e, \u0447\u0442\u043e \u0432 \u0441\u043b\u0443\u0447\u0430\u0435 \u0441 \u043c\u0443\u0436\u0441\u043a\u043e\u0439 \u043e\u0441\u043e\u0431\u044c\u044e \u043d\u0435 \u0431\u0443\u0434\u0435\u0442 \u043e\u0441\u043e\u0431\u044b\u0445 \u043e\u0442\u043b\u0438\u0447\u0438\u0439 \u0432 \u0438\u043d\u0441\u0442\u0440\u0443\u043a\u0446\u0438\u0438. \u041f\u043e\u043f\u0440\u043e\u0431\u0443\u0439\u0442\u0435, \u043d\u043e \u043e\u0431\u044f\u0437\u0430\u0442\u0435\u043b\u044c\u043d\u043e \u043e\u0442\u043f\u0438\u0448\u0438\u0442\u0435\u0441\u044c, \u0435\u0441\u043b\u0438 \u0447\u0442\u043e-\u0442\u043e \u043f\u043e\u0439\u0434\u0451\u0442 \u043d\u0435 \u043f\u043e \u043f\u043b\u0430\u043d\u0443!",
|
789 |
+
"id": "2435940d95e14c5eb7c2aba59ec9216d"
|
790 |
+
},
|
791 |
+
"2466b2cd774c4d0c8028ad773ee7235c": {
|
792 |
+
"label": true,
|
793 |
+
"text": "\u78ba\u304b\u306b\u300c\u9014\u4e2d\u30d9\u30b9\u30c8\u76e4\u300d\u3002\u3060\u3051\u3069\u661f5\u3064\uff01 . \uff32\uff2f\uff23\uff2f\u306e\uff23\uff24\u306f\u5168\u90e8\u6301\u3063\u3066\u307e\u3059\u3002\u3060\u3093\u3060\u3093\u65b9\u5411\u6027\u304c\u5b9a\u307e\u3063\u3066\u304d\u3066\u3001\u304a\u3082\u3061\u3083\uff2a\uff21\uff3a\uff3a\u3068\u3044\u3046\u72ec\u81ea\u306e\u8def\u7dda\u3082\u78ba\u7acb\u3057\u3066\u3044\u307e\u3059\u306d\u3002\u5185\u5bb9\u306f\u65b0\u66f23\u66f2\u3068\u3001\u524d\u4f5c\u30b3\u30df\u30ab\u30eb\u30e9\u30a4\u30d5\u304b\u30893\u66f2\u3002\u5b9f\u8cea\u65b0\u66f2\u306f3\u66f2\u306a\u306e\u3060\u3051\u308c\u3069\u3001\u305d\u306e\u66f2\u5168\u90e8\u304c\u3044\u3044\u3002\u5916\u308c\u306a\u3057\u3002\u8efd\u5feb\u3001\u660e\u308b\u3055\u3001\u8aac\u6559\u81ed\u304f\u306a\u3044\u697d\u3057\u3044\u4eba\u751f\u89b3\u3001\u4ed6\u306e\u30a2\u30fc\u30c6\u30a3\u30b9\u30c8\u306b\u306f\u306a\u3044\u6301\u3061\u5473\u304c\u3042\u308a\u307e\u3059\u3002\u3055\u3089\u306b\u3001\u4eca\u307e\u3067\u3088\u308a\u3001\u82e5\u5e72\u5927\u4eba\u3063\u307d\u3044\u6b4c\u8a5e\u306b\u306a\u3063\u3066\u307e\u3059\u3002\u6b4c\u8a5e\u306e\u5185\u5bb9\u306f\u8074\u3044\u3066\u306e\u304a\u697d\u3057\u307f\u3002\u8efd\u5feb\u306a\u66f2\u3084\u3001\uff2a\uff21\uff3a\uff3a\u3068\u3044\u3046\u30b8\u30e3\u30f3\u30eb\u304c\u597d\u304d\u3067\u3001\u304b\u3064\u3001\u30c0\u30f3\u30c7\u30a3\u306a\u304a\u3058\u69d8\u304c\u6b4c\u308f\u306a\u3044\u3068\uff2a\uff21\uff3a\uff3a\u3068\u8a8d\u3081\u306a\u3044\u3001\u3068\u3044\u3046\u4eba\u4ee5\u5916\u306f\u30ec\u30f3\u30bf\u30eb\u3067\u3082\u662f\u975e\u8074\u3044\u3066\u307b\u3057\u3044\u3067\u3059\u3002\u30a4\u30f3\u30c7\u30a3\u30fc\u30ba\u306a\u306e\u3067\u3001\u30ec\u30f3\u30bf\u30eb\u306b\u3042\u308b\u304b\u306f\u4e0d\u660e\u3067\u3059\u304c\u3002\u5143\u6c17\u304c\u6b32\u3057\u3044\u4eba\u3001\u4f7f\u3044\u53e4\u3055\u308c\u305f\u6b4c\u8a5e\u306b\u98fd\u304d\u98fd\u304d\u3057\u3066\u3044\u308b\u4eba\u3001\u662f\u975e\u8074\u3044\u3066\u304f\u3060\u3055\u3044\u3002\u304a\u85a6\u3081\u3067\u3059\u3002\n",
|
794 |
+
"id": "2466b2cd774c4d0c8028ad773ee7235c"
|
795 |
+
},
|
796 |
+
"cbad9a9f73564f6fb203481836d0c917": {
|
797 |
+
"label": false,
|
798 |
+
"text": "praise and encouragement for his concept. After solidifying the rules and a business plan, and supplemented with sketches by a professional artist, Foster presented his idea to various television networks. He reached an agreement with NBC for a \"test game\".",
|
799 |
+
"id": "cbad9a9f73564f6fb203481836d0c917"
|
800 |
+
},
|
801 |
+
"362c41112ca44967a9f6c0e3ec88b56c": {
|
802 |
+
"label": false,
|
803 |
+
"text": "to goals from G*khan Inler and Kosovo-born Xherdan Shaqiri. _He_didn't believe that there were 12,000 Albanian fans in the stands which was more than how many Swiss fans turned up for the game. <sep>, Pronoun: He <sep>, A: Ottmar Hitzfeld <sep>, B: G*khan Inler",
|
804 |
+
"id": "362c41112ca44967a9f6c0e3ec88b56c"
|
805 |
+
},
|
806 |
+
"7dfcba6b07ff490980be0f10136df7d3": {
|
807 |
+
"label": false,
|
808 |
+
"text": "years ago with fair results \u2014 absolutely delicious results, actually, they were just not as fluffy and bouncy as expertly made ones from your favourite pastry shop where panettoni are hung upside-down to maintain their height and airiness. But when I came across the familiar brown and gold paper forms for making colomba at the supermarket, I thought I\u2019m only ever going to get a chance to make this",
|
809 |
+
"id": "7dfcba6b07ff490980be0f10136df7d3"
|
810 |
+
},
|
811 |
+
"5548de45255e4b47868d6e060509778c": {
|
812 |
+
"label": false,
|
813 |
+
"text": " \n This is discipline!! \n \n And citizen responsibility. Japanese fans cleaning their places after the football game. In #Russia The World Cup pic.twitter.com/t4MnuUlSBg \u2014 Danu Motta (@shadanka) June 20, 2018 \n \n For the Japanese fans, the act isn't certainly an isolated one. They were also spotted cleaning up the stadium after a game against the Ivory Coast during the 2014 World Cup in Brazil. ",
|
814 |
+
"id": "5548de45255e4b47868d6e060509778c"
|
815 |
+
},
|
816 |
+
"7adf61954048418ba86bdfedaa482443": {
|
817 |
+
"label": true,
|
818 |
+
"text": " WEB\u3067\u306e\u63b2\u8f09\u6570\u3084\u30d6\u30ed\u30b0\u306e\u8a18\u4e8b\u6570\u3001\u30dd\u30b8\u30cd\u30ac\u306e\u8ad6\u8abf\u5206\u6790\u306a\u3069\u306e\u8a55\u4fa1\u65b9\u6cd5\u306a\u3069\u306f\u3042\u308a\u307e\u3057\u305f\u304c\u3001\u30e2\u30ce\u306e\u52d5\u304d\u306b\u95a2\u3057\u3066\u306e\u8a55\u4fa1\u306f\u306f\u305a\u3055\u308c\u3066\u3044\u308b\u3088\u3046\u306a\u6c17\u304c\u3057\u307e\u3059\u3002 \u300cWEBPR\u3067\u306f\u3001\u305d\u3082\u305d\u3082\u6d88\u8cbb\u8005\u306b\u76f4\u63a5\u50cd\u304d\u304b\u3051\u3066\u30e2\u30ce\u3092\u8cb7\u308f\u305b\u308b\u3088\u3046\u306a\u30b3\u30df\u30e5\u30cb\u30b1\u30fc\u30b7\u30e7\u30f3\u306e\u53d6\u308a\u65b9\u3092\u3057\u3066\u3044\u306a\u3044\u3002\u7a76\u6975\u7684\u306a\u76ee\u6a19\u306f\u300c\u58f2\u308a\u4e0a\u3052\u300d\u3067\u3042\u3063\u3066\u3082\u3001WEBPR\u306f\u300c\u3053\u3046\u306a\u308c\u3070\u58f2\u308a\u4e0a\u3052\u306b\u3064\u306a\u304c\u308b\u306f\u305a\u3060\u300d\u3068\u3044\u3046\u3072\u3068\u3064\u524d\u6bb5\u968e\u3067\u76ee\u6a19\u306b\u529b\u3092\u6ce8\u3050\u306e\u3067\u3042\u308b\u3002\u3068\u3042\u308a\u307e\u3059\u304c\u3001\u5e97\u982d\u306b\u884c\u3063\u305f\u308a\u3059\u308b\u3053\u3068\u306a\u304f\u3001EC\u3067\u8cb7\u3044\u7269\u3092\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u308b\u6642\u4ee3\u306a\u306e\u3067\u3059\u304b\u3089\u3001WEBPR\u3092\u99c6\u4f7f\u3057\u3066\u3001EC\u3067\u30e2\u30ce\u3092\u58f2\u3063\u3066\u3044\u304f\u3068\u3044\u3046\u8996\u70b9\u304c\u3042\u3063\u3066\u3082\u3044\u3044\u306e\u3067\u306f\u306a\u3044\u304b\u3068\u3082\u601d\u3044\u307e\u3059\u3002\u7d50\u5c40\u3001PR\u3067\u8a71\u984c\u306b\u306a\u308c\u3070\u3044\u3044\u3002\u8a18\u4e8b\u304c\u5897\u3048\u308c\u3070\u3044\u3044\u3068\u3044\u3046\u60aa\u3057\u304dPR\u4f1a\u793e\u30ed\u30b8\u30c3\u30af\u304cWEB\u306e\u4e2d\u3067\u5c55\u958b\u3055\u308c\u3066\u3044\u308b\u3088\u3046\u3067\u6b8b\u5ff5\u3067\u3059\u3002\u6226\u7565\u3092\u3046\u305f\u3063\u3066\u3044\u308b\u4e2d\u3067\u306e\u305f\u3068\u3048\u3070\u8a71\u3082\u30ea\u30a2\u30ea\u30c6\u30a3\u304c\u306a\u304f\u3001\u7a1a\u62d9\u306a\u5370\u8c61\u3092\u53d7\u3051\u3066\u3057\u307e\u3044\u307e\u3057\u305f\u3002 ",
|
819 |
+
"id": "7adf61954048418ba86bdfedaa482443"
|
820 |
+
},
|
821 |
+
"4f15ecfd5fb444a8a73b53e69fbecddf": {
|
822 |
+
"label": false,
|
823 |
+
"text": "native Wales. They encouraged their son's interest in music, buying him a Broadwood piano, on which his mother gave him lessons. The young Wood also learned to play the violin and viola. Wood received little religious inspiration at St Sepulchre, but was deeply stirred by the playing of the resident organist, George Cooper, who allowed him into the organ loft and gave him his first lessons on the",
|
824 |
+
"id": "4f15ecfd5fb444a8a73b53e69fbecddf"
|
825 |
+
},
|
826 |
+
"e83f7f0b4d4243759b6c2f2babec64c4": {
|
827 |
+
"label": false,
|
828 |
+
"text": "How many points were the Eagles behind after the end of the first quarter?",
|
829 |
+
"id": "e83f7f0b4d4243759b6c2f2babec64c4"
|
830 |
+
},
|
831 |
+
"fcae698b41474ab5b9611bf64eb1192f": {
|
832 |
+
"label": false,
|
833 |
+
"text": "Translate the following sentence to Turkish:\nSmoking is still allowed in most cafes in Zagreb. [Davor Konjikusic]",
|
834 |
+
"id": "fcae698b41474ab5b9611bf64eb1192f"
|
835 |
+
},
|
836 |
+
"553c38f4fbc54e699756657b2c5a9bb8": {
|
837 |
+
"label": true,
|
838 |
+
"text": "Dans le bureau de cin\u00e9ma, vous pouvez \u00e9teindre votre musique cellulaire.",
|
839 |
+
"id": "553c38f4fbc54e699756657b2c5a9bb8"
|
840 |
+
},
|
841 |
+
"67f6d03b8bf14d509ad66a0f951fa641": {
|
842 |
+
"label": false,
|
843 |
+
"text": "In week 5, the Lions hosted the Philadelphia Eagles to start a three-game home stand. The Lions took a 14-0 lead in the first quarter with a pair of touchdown catches by Theo Riddick, from one and 17 yards out respectively. The Eagles responded in the second quarter with a one-yard touchdown pass from Carson Wentz to Ryan Mathews, cutting the Lions lead to seven points. The Lions added to their",
|
844 |
+
"id": "67f6d03b8bf14d509ad66a0f951fa641"
|
845 |
+
},
|
846 |
+
"019201c459cd4a0d9f9c2cd23efa6059": {
|
847 |
+
"label": false,
|
848 |
+
"text": "measurement units like miles to kilometers during your translation. 5) Note the input is in sentence case except for special placeholders. Please do the same in your translations.",
|
849 |
+
"id": "019201c459cd4a0d9f9c2cd23efa6059"
|
850 |
+
},
|
851 |
+
"a4b87ff433b443c88b329692b6e217d7": {
|
852 |
+
"label": true,
|
853 |
+
"text": "weg von diesem langweiligen Film --- er ist keinen Cent wert!!!!",
|
854 |
+
"id": "a4b87ff433b443c88b329692b6e217d7"
|
855 |
+
},
|
856 |
+
"6b0d9a26fd6048818ef2349852ef1f7d": {
|
857 |
+
"label": true,
|
858 |
+
"text": "Title: \u0e04\u0e19\u0e41\u0e1b\u0e14\u0e23\u0e34\u0e49\u0e27\u0e44\u0e21\u0e48\u0e40\u0e2d\u0e32\u0e42\u0e23\u0e07\u0e44\u0e1f\u0e1f\u0e49\u0e32\u0e16\u0e48\u0e32\u0e19\u0e2b\u0e34\u0e19\u0e40\u0e02\u0e32\u0e2b\u0e34\u0e19\u0e0b\u0e49\u0e2d\u0e19 \u0e22\u0e31\u0e19\u0e01\u0e23\u0e30\u0e17\u0e1a\u0e40\u0e01\u0e29\u0e15\u0e23\u0e2d\u0e34\u0e19\u0e17\u0e23\u0e35\u0e22\u0e4c \n Body: \u0e0a\u0e32\u0e27\u0e09\u0e30\u0e40\u0e0a\u0e34\u0e07\u0e40\u0e17\u0e23\u0e32\u0e23\u0e13\u0e23\u0e07\u0e04\u0e4c\u0e2b\u0e22\u0e38\u0e14\u0e42\u0e23\u0e07\u0e44\u0e1f\u0e1f\u0e49\u0e32\u0e16\u0e48\u0e32\u0e19\u0e2b\u0e34\u0e19\u0e40\u0e02\u0e32\u0e2b\u0e34\u0e19\u0e0b\u0e49\u0e2d\u0e19 \u0e28\u0e36\u0e01\u0e29\u0e32\u0e1e\u0e1a\u0e2a\u0e32\u0e23\u0e1b\u0e19\u0e40\u0e1b\u0e37\u0e49\u0e2d\u0e19\u0e01\u0e23\u0e30\u0e17\u0e1a\u0e40\u0e01\u0e29\u0e15\u0e23\u0e2d\u0e34\u0e19\u0e17\u0e23\u0e35\u0e22\u0e4c \u0e22\u0e31\u0e19\u0e1b\u0e49\u0e2d\u0e07\u0e1e\u0e37\u0e49\u0e19\u0e17\u0e35\u0e48\u0e2d\u0e38\u0e14\u0e21\u0e2a\u0e21\u0e1a\u0e39\u0e23\u0e13\u0e4c\u0e17\u0e32\u0e07\u0e2d\u0e32\u0e2b\u0e32\u0e23 \u0e41\u0e19\u0e30\u0e43\u0e0a\u0e49\u0e1e\u0e25\u0e31\u0e07\u0e07\u0e32\u0e19\u0e2b\u0e21\u0e38\u0e19\u0e40\u0e27\u0e35\u0e22\u0e19\u0e17\u0e32\u0e07\u0e40\u0e25\u0e37\u0e2d\u0e01 \u0e14\u0e49\u0e32\u0e19\u0e01\u0e25\u0e38\u0e48\u0e21\u0e17\u0e38\u0e19\u0e22\u0e37\u0e48\u0e19 EHIA \u0e23\u0e2d\u0e1a\u0e17\u0e35\u0e48 4 \u0e0a\u0e32\u0e27\u0e1a\u0e49\u0e32\u0e19\u0e19\u0e31\u0e14\u0e23\u0e27\u0e21\u0e15\u0e31\u0e27\u0e23\u0e2d\u0e1f\u0e31\u0e07\u0e1c\u0e25\u0e01\u0e27\u0e48\u0e32 100 \u0e04\u0e19 \u0e2b\u0e25\u0e31\u0e07 \u0e2a\u0e1c. \u0e40\u0e25\u0e37\u0e48\u0e2d\u0e19\u0e1e\u0e34\u0e08\u0e32\u0e23\u0e13\u0e32\u0e40\u0e1b\u0e47\u0e19\u0e27\u0e31\u0e19\u0e17\u0e35\u0e48 23 \u0e21\u0e35.\u0e04. \u0e19\u0e35\u0e49\n\u00a0\n\u00a0",
|
859 |
+
"id": "6b0d9a26fd6048818ef2349852ef1f7d"
|
860 |
+
},
|
861 |
+
"c34863e2f4cc4df798271d25bd15b107": {
|
862 |
+
"label": true,
|
863 |
+
"text": "Einfach nur geil!!! . Diesen Film muss mann einfach gesehen haben!Der 1. Teil war schon lustig bis zum abwinken,aber jetzt der... Und da \"Red Bull\" (auch bekannt aus Hausmeister Krause;Axel Stein) sowieso der beste ist muss man diesen Film einfach gesehen haben!!!",
|
864 |
+
"id": "c34863e2f4cc4df798271d25bd15b107"
|
865 |
+
},
|
866 |
+
"5f59f03d5425404693ac4bbbd9dc9cb9": {
|
867 |
+
"label": false,
|
868 |
+
"text": "Read the following context and choose the correct option to answer the question. Context: Surely the best thing about colomba, the Easter equivalent to panettone, is the sugared, toasted almond topping that covers the whole thing and crumbles when you cut it, so you sort of have no choice but just to pick up the crusty sugary bits and eat those on their own. I\u2019d always thought that colomba would",
|
869 |
+
"id": "5f59f03d5425404693ac4bbbd9dc9cb9"
|
870 |
+
},
|
871 |
+
"cb6ef8e32c9a4daeae95b1be358a34a2": {
|
872 |
+
"label": true,
|
873 |
+
"text": "oameni reactioneaza mai puternic la provocari, informeaza luni Reuters.",
|
874 |
+
"id": "cb6ef8e32c9a4daeae95b1be358a34a2"
|
875 |
+
},
|
876 |
+
"702ddf2463fd481ea7d4bf17e8a4487f": {
|
877 |
+
"label": true,
|
878 |
+
"text": "Write a title for this article:\n\nActualizado nov 16, 2011 6:21 p.m. ET\n\nLa selecci\u00f3n de Uruguay (foto de archivo) cerro con broche de oro el 2011, a\u00f1o en que gan\u00f3 la Copa Am\u00e9rica y alcanz\u00f3 el cuarto lugar en el ranking de la Fifa. (AFP)",
|
879 |
+
"id": "702ddf2463fd481ea7d4bf17e8a4487f"
|
880 |
+
},
|
881 |
+
"733cdb5aada7496da5a3e0c0b58104ef": {
|
882 |
+
"label": true,
|
883 |
+
"text": "et Vikas Swarup a sa mani\u00e8re \u00e0 lui de raconter l'histoire. Mine de rien, au gr\u00e9 des balades de Ram, on apprend (beaucoup) sur l'histoire de l'Inde, Bollywood, le Taj Mahal, le sport, et une quantit\u00e9 de choses. A chaque flashback, on se demande \"qu'est-ce qu'on va apprendre ce coup-ci ?\" et je m'amusais \u00e0 me dire \"tiens, la prochaine question, ils parleront de \u00e7a !\". Je me trompais une fois sur",
|
884 |
+
"id": "733cdb5aada7496da5a3e0c0b58104ef"
|
885 |
+
},
|
886 |
+
"f86f0217ba544366a1757c4da78f70ab": {
|
887 |
+
"label": false,
|
888 |
+
"text": "He hoped they would not need him. The church would pay his regular part-time salary while he was serving on a jury, but any private lessons he missed would be money lost. Greg's red 1965 Pontiac Bonneville convertible always turned heads as he drove through the small town. He had purchased it two months earlier from a career Navy man down in Longview who had babied the thing for years. It spent",
|
889 |
+
"id": "f86f0217ba544366a1757c4da78f70ab"
|
890 |
+
},
|
891 |
+
"e09b26ee3d0a4622ac6d96d32ccb60ab": {
|
892 |
+
"label": true,
|
893 |
+
"text": "Write a title for this article:\n\nSaturday, November 22nd 2008, 4:00 AM\n\nMIAMI \u00e2\u0080\u0094 Ha sido un parto muy lento ... el m\u00c3\u00a1s lento que haya tenido Laura Restrepo para alumbrar alguno de sus libros.",
|
894 |
+
"id": "e09b26ee3d0a4622ac6d96d32ccb60ab"
|
895 |
+
},
|
896 |
+
"12555f602cb34c6e9388340f3291740b": {
|
897 |
+
"label": true,
|
898 |
+
"text": "Puoi usare un triangolo per produrre musica.\n",
|
899 |
+
"id": "12555f602cb34c6e9388340f3291740b"
|
900 |
+
},
|
901 |
+
"eb28652669174d5a952e820c0a31dc3f": {
|
902 |
+
"label": false,
|
903 |
+
"text": "if there's a program near you. Washing multiple small loads of laundry in your washing machine wastes both water and energy. Setting your machine to the appropriate load size can reduce water waste. Grease can seriously clog up your pipes and add to the scum levels in your septic system. Don't pour grease down the drain. Instead, pour it into a separate container and throw it away in the trash.",
|
904 |
+
"id": "eb28652669174d5a952e820c0a31dc3f"
|
905 |
+
},
|
906 |
+
"5c566f7b0a1d4cb2813c102e64094392": {
|
907 |
+
"label": false,
|
908 |
+
"text": "adopted by various branches of the church, often referred to as \"subordinate standards\". It is generally considered that the point of such learning is to enable one to put one's faith into practice; some Presbyterians generally exhibit their faith in action as well as words, by generosity, hospitality, as well as proclaiming the gospel of Christ.\". Can you tell me what it is?",
|
909 |
+
"id": "5c566f7b0a1d4cb2813c102e64094392"
|
910 |
+
},
|
911 |
+
"6d4119f5ae80413a9e79c14ad4d21bda": {
|
912 |
+
"label": false,
|
913 |
+
"text": " \u201cThere is no relationship between this woman and Salvador Dal\u00ed,\u201d he told Spanish agency Efe at the time. ||||| A judge has ordered the exhumation of Salvador Dali's body for a biological test to determine the paternity of Maria Pilar Abel Martinez, 61, who claims to be his daughter. The order came from a Madrid judge who said the measures were \"necessary\" because \"there are no biological remains",
|
914 |
+
"id": "6d4119f5ae80413a9e79c14ad4d21bda"
|
915 |
+
},
|
916 |
+
"dbc88798632c4ff98799907a94b3896b": {
|
917 |
+
"label": true,
|
918 |
+
"text": "pintura que el film hace del dolor de crecer en la horfandad, tanto f\u00edsica como espiritual.",
|
919 |
+
"id": "dbc88798632c4ff98799907a94b3896b"
|
920 |
+
},
|
921 |
+
"d04570f03fec4569ae019a1a76fd5b45": {
|
922 |
+
"label": false,
|
923 |
+
"text": "plugged in an address and then set off to their destination. And, then it wasn't until they were driving for thirty minutes that they realized they actually put in a destination back on the West Coast where they lived. They actually put their home address in. So again, the GPS is kind of 'garbage in garbage out'.\" Mister Brown says this is a common human error. But, he says, what makes the problem",
|
924 |
+
"id": "d04570f03fec4569ae019a1a76fd5b45"
|
925 |
+
},
|
926 |
+
"49c85f4e2ffd4ee688db1cf747bc7ce1": {
|
927 |
+
"label": false,
|
928 |
+
"text": "in which he was played by Cesar Romero. The show's popularity compelled Schwartz to keep the comics in a similar vein. As the show's popularity waned, however, so did that of the Batman comics. After the TV series ended in 1968, the increase in public visibility had not stopped the comic's sales decline; editorial director Carmine Infantino resolved to turn things around, moving stories away from",
|
929 |
+
"id": "49c85f4e2ffd4ee688db1cf747bc7ce1"
|
930 |
+
},
|
931 |
+
"235a4b30f15a4d0d85f560e12239fdd1": {
|
932 |
+
"label": true,
|
933 |
+
"text": "QUESTION: \u00bfPor qu\u00e9 ped\u00eda esa compensaci\u00f3n econ\u00f3mica la pol\u00edtica colombiana?\nA:",
|
934 |
+
"id": "235a4b30f15a4d0d85f560e12239fdd1"
|
935 |
+
},
|
936 |
+
"73881faa52944be08a4367ff818df4db": {
|
937 |
+
"label": true,
|
938 |
+
"text": " \n Add this video to your website by copying the code below. Gehiago jakin \n \n Hmm, arazo bat egon da zerbitzariarenera iristeko. Berriro saiatu? Gehitu Txio gurasoak Media gehitu \n \n Zure webgunean edo aplikazioan Twitter-eko edukia kapsulatzean, Garatzaile Akordioa eta Garatzaile Politika onartzen dituzu. \n ",
|
939 |
+
"id": "73881faa52944be08a4367ff818df4db"
|
940 |
+
},
|
941 |
+
"c639bba0bb6c48e6959645356d967e5c": {
|
942 |
+
"label": false,
|
943 |
+
"text": "Summarize this article:",
|
944 |
+
"id": "c639bba0bb6c48e6959645356d967e5c"
|
945 |
+
},
|
946 |
+
"543ac52d22514df98edf1ad77cfc6280": {
|
947 |
+
"label": true,
|
948 |
+
"text": "Q: CONTEXT: El vicepresidente segundo y ministro de Econom\u00eda y Hacienda, Rodrigo Rato, reconoci\u00f3 hoy que el Gobierno conoc\u00eda \"hace tiempo\" los planes del BBVA y Telef\u00f3nica de firmar una alianza estrat\u00e9gica, pero asegur\u00f3 que no impuls\u00f3 la operaci\u00f3n. En unas declaraciones a los periodistas antes de visitar la feria de arte Arco, Rato afirm\u00f3 que, en contra de lo que suced\u00eda durante la Presidencia de",
|
949 |
+
"id": "543ac52d22514df98edf1ad77cfc6280"
|
950 |
+
},
|
951 |
+
"35fde42dab104edd9b9e8dbfa976ae97": {
|
952 |
+
"label": true,
|
953 |
+
"text": "\u0e22\u0e01\u0e07\u0e32\u0e19\u0e27\u0e34\u0e08\u0e31\u0e22\u0e0a\u0e35\u0e49\u0e2a\u0e32\u0e23\u0e1b\u0e23\u0e2d\u0e17\u0e2a\u0e48\u0e07\u0e1c\u0e25\u0e15\u0e48\u0e2d\u0e2b\u0e48\u0e27\u0e07\u0e42\u0e0b\u0e48\u0e2d\u0e32\u0e2b\u0e32\u0e23 \u0e1e\u0e31\u0e12\u0e19\u0e32\u0e01\u0e32\u0e23\u0e2a\u0e21\u0e2d\u0e07",
|
954 |
+
"id": "35fde42dab104edd9b9e8dbfa976ae97"
|
955 |
+
},
|
956 |
+
"96da755c0d6c4f3ab3b8f34b925f1ffc": {
|
957 |
+
"label": true,
|
958 |
+
"text": "/><br />Do watch this movie!!! a Total Masala Flick and Enjoyable Family Film!<br /><br />OYE AAJA NACHLE!!!!!!!!",
|
959 |
+
"id": "96da755c0d6c4f3ab3b8f34b925f1ffc"
|
960 |
+
},
|
961 |
+
"f758b25d05c043dcbf3df2c6b9f56705": {
|
962 |
+
"label": false,
|
963 |
+
"text": "first day of its release and v both get bore in cinema-hall......................................................<br /><br />Role of CIRCUIT was very small n useless n this movie . I think SANJAY-DUTT cut down the role of ARSHAD VARSHI........................<br /><br />Character of the movie is also not well define like the previous one .this movie show u the result of OVER-CONFIDENCE",
|
964 |
+
"id": "f758b25d05c043dcbf3df2c6b9f56705"
|
965 |
+
},
|
966 |
+
"50bb7f2fa5b14c139badd2be2a13bcda": {
|
967 |
+
"label": false,
|
968 |
+
"text": "is the emotion, and Hache is the doubt. And here they are mixed in Spain at the end of twentieth century.<br /><br />The performance is simply wonderful. Cecilia Roth (All about my mother) is splendid and what can i say about Federico Luppi who is one of the best actors in Spanish language that exists. I can imagine nobody except Eusebio Poncela as Dante. Juan Diego Botto is quite good.<br /><br",
|
969 |
+
"id": "50bb7f2fa5b14c139badd2be2a13bcda"
|
970 |
+
},
|
971 |
+
"82ad499ab5494feeb9f35444a29a8f0f": {
|
972 |
+
"label": false,
|
973 |
+
"text": "NOTICE: I do touch on the plot, but not so as to spoil the movie...<br /><br />This long and sensuous movie set in 1942 Shanghai during the Japanese occupation is centered on a beautiful and elegant young woman Wong Chia Chi (played by newcomer Wei Tang) who - with her platonic friend Kuang Yu Min (played by Chinese male hottie Lee-Hom Wang) - is a willing participant in a group of 6 young actor",
|
974 |
+
"id": "82ad499ab5494feeb9f35444a29a8f0f"
|
975 |
+
},
|
976 |
+
"ec3ea869f1444df1aa91a47e4eeb4bb2": {
|
977 |
+
"label": false,
|
978 |
+
"text": "This is the best movie ever! Don't miss out on it! Vivek Oberoi and Rani Mukherjee have done SUPERB EXCELLENT acting!! The story, its not really very special or unique but the way Vivek and Rani have acted, it seems even better. So if you haven't seen it yet, go see it right now. This isn't something to be missed!!!!!!!!!!!",
|
979 |
+
"id": "ec3ea869f1444df1aa91a47e4eeb4bb2"
|
980 |
+
},
|
981 |
+
"07579e0ceaed429bacce1dcaefa73980": {
|
982 |
+
"label": false,
|
983 |
+
"text": "of 10. Feel free for mailing me about any of my comments and posts here. <br /><br />Sorry for my bad English.",
|
984 |
+
"id": "07579e0ceaed429bacce1dcaefa73980"
|
985 |
+
},
|
986 |
+
"1a2d210ee30f4a3d84ec6d0f4ef77f1c": {
|
987 |
+
"label": false,
|
988 |
+
"text": "and the argument with hirko in the walkway with a roof on it???? need to know so I can win an argumrnt with me Japanese ex-wife. thanks",
|
989 |
+
"id": "1a2d210ee30f4a3d84ec6d0f4ef77f1c"
|
990 |
+
},
|
991 |
+
"ae005e1fadd546cbb82b733e6d68edad": {
|
992 |
+
"label": false,
|
993 |
+
"text": "conclusion packs a mean and lingering wallop right to the gut. A solid and satisfying winner.",
|
994 |
+
"id": "ae005e1fadd546cbb82b733e6d68edad"
|
995 |
+
},
|
996 |
+
"7c116c5decaf4e65a89405aed0277ccc": {
|
997 |
+
"label": true,
|
998 |
+
"text": "\"Como Era Gostoso o Meu Franc\u00eas\" (\"How Tasty Was My Frenchman\")",
|
999 |
+
"id": "7c116c5decaf4e65a89405aed0277ccc"
|
1000 |
+
},
|
1001 |
+
"6006ace2058742d3b776274e5334f613": {
|
1002 |
+
"label": false,
|
1003 |
+
"text": "song for kids (I think... it could also be south American, I'm not sure)). This two songs that have the same melody... but people don't usually realize that... it's just grate! I tried to write this in both Spanish and English, because it's an Argentinian movie... but the page wouldn't allow me :( Hope you enjoy it!",
|
1004 |
+
"id": "6006ace2058742d3b776274e5334f613"
|
1005 |
+
},
|
1006 |
+
"ded4f2384df44b22a4425312aaea3499": {
|
1007 |
+
"label": true,
|
1008 |
+
"text": "is biased in favour of Chavez, nothing's stopping you from doing your homework. One crucial message of the film is questioning info sources, as was clearly demonstrated by the snippers casualties being shamefully blamed on Chavez's supporters. Venezuela puts American alleged democracy to shame. Hasta la revolucion siempre!",
|
1009 |
+
"id": "ded4f2384df44b22a4425312aaea3499"
|
1010 |
+
},
|
1011 |
+
"b393f9b5c01b4388af5f9c8a1fa70843": {
|
1012 |
+
"label": true,
|
1013 |
+
"text": "(Brazil): \"Invas\u00e3o de Domic\u00edlio\" (\"Invasion of Domicile\")",
|
1014 |
+
"id": "b393f9b5c01b4388af5f9c8a1fa70843"
|
1015 |
+
},
|
1016 |
+
"95cc03b8508b44a18a4aee4b27743f1f": {
|
1017 |
+
"label": false,
|
1018 |
+
"text": "/><br />PS: tried to write in Spanish but the system does not accept it!",
|
1019 |
+
"id": "95cc03b8508b44a18a4aee4b27743f1f"
|
1020 |
+
}
|
1021 |
+
},
|
1022 |
+
"version": 189,
|
1023 |
+
"description": "Text that contains non-English."
|
1024 |
+
}
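The concept.json files added in this commit appear to share one record shape: top-level "namespace", "concept_name", and "type" fields, a "data" map keyed by example id, and a trailing "version" and "description". As a minimal sketch of reading one of these files (plain JSON parsing with an illustrative path, not Lilac's own concept API), one might tally a concept's positive and negative examples like so:

import json
from collections import Counter

# Minimal sketch: load one of the concept files added in this commit
# (the path is illustrative) and tally its example labels.
with open("lilac/concepts/non-english/concept.json") as f:
    concept = json.load(f)

# Each record in "data" holds a boolean "label", the example "text", and
# an "id" that repeats the record's key.
labels = Counter(example["label"] for example in concept["data"].values())
print(concept["concept_name"], "version", concept["version"], dict(labels))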
lilac/concepts/positive-sentiment/concept.json
ADDED
@@ -0,0 +1,564 @@
1 |
+
{
|
2 |
+
"namespace": "lilac",
|
3 |
+
"concept_name": "positive-sentiment",
|
4 |
+
"type": "text",
|
5 |
+
"data": {
|
6 |
+
"0": {
|
7 |
+
"label": false,
|
8 |
+
"text": "Starting To Be Annoyed By Becky...: I'm not sure why I keep reading these books, but I guess it's because I've read the first two so I'll keep reading the rest of the books. In the first book, I really found it amusing. I was a little annoyed by the fact that Becky couldn't stop spending, but then again that's why she is called a Shopaholic. In the second book, I felt more of the same it was just magniifed more. Now in the third book, I'm just down right annoyed by Becky Bloomwood. In this book, she wasn't going on crazy shopping sprees, just planning two different weddings because she was afraid to tell each person and because I feel she's really selfish. Still, I read the book because I wanted to see how she could get herself out of another situation. I will say that I love her friends Suze and Danny, her client Laurel and her husband Luke. Maybe that's why I keep reading. I will read the next book, but I'm sure I'll be just as annoyed when I'm done.",
|
9 |
+
"id": "0"
|
10 |
+
},
|
11 |
+
"1": {
|
12 |
+
"label": false,
|
13 |
+
"text": "the cover is fine - the pool is horrible: The entire pool was horrible. The cover was fine once we got it on, but we finally got rid of the pool after 2 weeks because it was so hard to set up and keep clean.",
|
14 |
+
"id": "1"
|
15 |
+
},
|
16 |
+
"2": {
|
17 |
+
"label": true,
|
18 |
+
"text": "Good album, not their best.: This album is probably the most marketable and radio friendly of all of dashboard's albums. For the peripheral listener it may be the right one to get to introduce you to this band. But as a Dashboard fan of 5 or so years I truly hope they return to their original sound for their next work. Not for the listen-ability but for the show. To this day the fans react best to the songs from \"Places\" or \"A Mark, A Mission.\" I recommend this album to everyone but I also recommend any and all of their other work.",
|
19 |
+
"id": "2"
|
20 |
+
},
|
21 |
+
"3": {
|
22 |
+
"label": false,
|
23 |
+
"text": "This is a horror novel, right?: Never one to pass up any vampire novel, I purchased Sips because the description seemed interesting. Vampires, Marquis de Sade, fetishism, yada yada yada. If this is a comedy, I give it 4 stars; however, I'll give it 1 star as a horror novel. Sade was rather boring; I would think a character as intense and multi-faceted as the Marquis de Sade would make for a more interesting vampire. The writing style isn't too bad, but overall I found the characters to be mildly amusing at best. The plot was thin, the end was anti-climactic, and the vampires were not very frightening. The book had little suspense, and it leaves a mile-wide opening for a sequel at the conclusion. I would, however, like to see something more of the vampire mutants lurking in the graveyard. They were the most riveting of any of the characters.",
|
24 |
+
"id": "3"
|
25 |
+
},
|
26 |
+
"4": {
|
27 |
+
"label": true,
|
28 |
+
"text": "Superb mix of global non secular musical denominations: I first heard Ms. Pook's music on the \"Eyes Wide Shut\" soundtrack (the masquerade ball scene) and was blown away; if ever there was a necessity for music to permeate a scene in a film this was it. She incorporates a blend of the traditional songs from faiths across continents and mixes them, for lack of a better comparison than similar to your quintessential raver d.j. (though these are better and definitively more original :) \"Oppenheimer\" is my favorite, and if you let the last track run for a few minutes a portion of the song will play once more. I can't wait to hear more of her stuff - these hymns are awesome.",
|
29 |
+
"id": "4"
|
30 |
+
},
|
31 |
+
"5": {
|
32 |
+
"label": true,
|
33 |
+
"text": "A moving and suspenseful story!: For anyone familiar with the occult, this book is likely to raise hairs on the back of your neck as you read. Even if you're not, the storyline is suspenseful and fascinating, and the characters evoke great sympathy and admiration. An excellent read.",
|
34 |
+
"id": "5"
|
35 |
+
},
|
36 |
+
"6": {
|
37 |
+
"label": true,
|
38 |
+
"text": "Simple & Easy to Use - A Practical approach to eating out: This guide is extremely to use. It provides sample menus that you'd see at Chinese, Indian and Thai restaurants. Then you are provided with descriptions of each dish and how it is prepared and the ingredients used. From there you are provided with specific considerations as to how the preparation or ingredient list may affect you if you have Gluten or Allergen issues.This book is the size of a passport and very organized and well written. The Chinese, Indian and Thai Cuisine Passport is perfect for making choices while traveling, or while dining at your favorite local restaurant.",
|
39 |
+
"id": "6"
|
40 |
+
},
|
41 |
+
"7": {
|
42 |
+
"label": false,
|
43 |
+
"text": "Being Fair....I am a fan of hers: and I really enjoyed her previous works, more than I could have ever imagined, but this record is horrible. The songs are trite, the lyrics are incredibly boring, indulgent and immature. The music is pop staple, with forgetable melodies and repetative chorus lines, I feel as if the studio wrote the entire album for her while she was sleeping, this just doesn't speak to any of her previous works at all. This album fits on the same shelf with a Nickelodeon-themed CD. Instead of heading in the direction of an artist like Alanis Morrisette, she is going backward and joining the ranks of Hannah Montana and the Naked Brothers Band. She is a great artist and her first two records are amazing. She is better than this CD and I am looking forward to her next effort.",
|
44 |
+
"id": "7"
|
45 |
+
},
|
46 |
+
"8": {
|
47 |
+
"label": false,
|
48 |
+
"text": "Sucked: I thought the DVD sucked tremendously. It was very boring and if I could, I would return it for a refund. There was only one \"small\" clip of Dylan himself. I'm very disappointed.",
|
49 |
+
"id": "8"
|
50 |
+
},
|
51 |
+
"9": {
|
52 |
+
"label": true,
|
53 |
+
"text": "Excellent product: Easy to install. If you have a newer furnace you probably do not need the swail switch as the HE220A comes with a Humistat which can be connected to the furnace. They recommend the Honeywell 32005847-001 Installation Kit, Bypass which is a little pricey and you can probably buy the pieces of this kit cheaper individually from Home Depot or Lowes or ACO as well as the filters.",
|
54 |
+
"id": "9"
|
55 |
+
},
|
56 |
+
"10": {
|
57 |
+
"label": true,
|
58 |
+
"text": "Very happy.: I am very happy with this trashcan. I was unable to find one in the stores to fit the space in my cabinet, but this one does the job. It is very sturdy and looks like it will put up with years of use.",
|
59 |
+
"id": "10"
|
60 |
+
},
|
61 |
+
"11": {
|
62 |
+
"label": false,
|
63 |
+
"text": "These aren't Throughbreds!: This makes me so mad. All these new authors are coming and changing the series. Nothings the same anymore and the plots are repeditive. Don't even bother reading these books until #32 these are like a different series. I don't know excactly what's happing but these new authors suck!",
|
64 |
+
"id": "11"
|
65 |
+
},
|
66 |
+
"12": {
|
67 |
+
"label": false,
|
68 |
+
"text": "Large and slow are a bad combination.: I bought this TV and returned it a week later, because it blurred so badly with motion that sports were unwatchable. I ended up buying a smaller Sony XBR4, and I have none of the issues (plus the picture is far, far better).This has nothing to do with 60 vs 120Hz. That is more important for DVDs and Blu-Ray signals that are 24fps (which doesn't divide evenly into 60 but does for 120). The LT52133 has an 8ms response time, which is extremely slow. A decent LCD should be 5 or lower.If you want an LCD, choose speed and quality over size. If you want size and quality but want to spend less, buy a plasma. Don't buy a big, cheap, slow LCD!I gave it 2 stars because I like the interface and remote.",
|
69 |
+
"id": "12"
|
70 |
+
},
|
71 |
+
"13": {
|
72 |
+
"label": false,
|
73 |
+
"text": "Skip it: This movie is very poorly written and the result is not distressing, just lame. The actors do their best but from very early on it is obvious that the material gives them nothing to work with. Fans of Colin Firth will experience a certain dim level of enjoyment. Minnie Driver is a treat but her character is no better written than the others. Vermont locations are worth something. With one or two moments of exception it's neither comedic nor romantic.",
|
74 |
+
"id": "13"
|
75 |
+
},
|
76 |
+
"14": {
|
77 |
+
"label": true,
|
78 |
+
"text": "Belive it i went to the concert?: hi everyone let me tell you i went to the concert i was amazed with what i saw cher was awsome i tell you buy the dvd. as i sat in front of the stage cher was doing a great job to us the she is living proof . So i urge you to buy it?",
|
79 |
+
"id": "14"
|
80 |
+
},
|
81 |
+
"15": {
|
82 |
+
"label": true,
|
83 |
+
"text": "Vale la pena.: En este libro se narra de una forma muy interesante la vida de una familia en particular. Lo que mas me gusto de este libro fue la manera en que la autora describe a lo largo del libro las personalidades de los sujetos envueltos en la novela; que vienen a ser muy distintos y extremos, lo cual, intensifica el drama... Definitivamente es un buen libro y lo recomiendo a todos.",
|
84 |
+
"id": "15"
|
85 |
+
},
|
86 |
+
"16": {
|
87 |
+
"label": true,
|
88 |
+
"text": "Nummie Children's story: I ordered this book for our grandchildren. Two boys 5 & 3 and a 4 month old girl. All love the story. The mouse is determined.",
|
89 |
+
"id": "16"
|
90 |
+
},
|
91 |
+
"17": {
|
92 |
+
"label": false,
|
93 |
+
"text": "Seem to be alone on this one: Looking at the other reviews, I seem to be the only one that was disappointed with this book. The content is too babyish in most of it for older tweens and the more \"grown up\" content would be over a younger tween's head. I had a quick read through and with every paged turned, I thought duh. I'll be looking around for another book shortly.",
|
94 |
+
"id": "17"
|
95 |
+
},
|
96 |
+
"18": {
|
97 |
+
"label": true,
|
98 |
+
"text": "Best yet: by far the best EA game yet. I especially like the easy controls and kick - a graphics. the playbook is extremely accurate and detailed. Also the fight songs and cheerleaders were a nice touch. this is an excellent game and worth checking out.",
|
99 |
+
"id": "18"
|
100 |
+
},
|
101 |
+
"19": {
|
102 |
+
"label": false,
|
103 |
+
"text": "washed out: A bit like Simply Reds version of the Valentine bros hit \"Moneys too tight to mention\" - this cover version has nothing of the driving energy that characterised the original recording.",
|
104 |
+
"id": "19"
|
105 |
+
},
|
106 |
+
"20": {
|
107 |
+
"label": true,
|
108 |
+
"text": "great water bottle: I love this bottle it is great. I like knowing it is non toxic and it just works very well. You can have it full and lay it down and it doesn't leak at all.",
|
109 |
+
"id": "20"
|
110 |
+
},
|
111 |
+
"21": {
|
112 |
+
"label": true,
|
113 |
+
"text": "Nice goggles: I am pretty happy with these goggles. They work well during swim workouts in the pool. I do notice a little bit of fogging from time to time. I had hoped to wear them during an upcoming triathlon, but based on a few instances where they slipped a little in the pool I am concerned that they won't be secure enough. I will keep using them in the pool, but will likely get different ones for open water races.",
|
114 |
+
"id": "21"
|
115 |
+
},
|
116 |
+
"22": {
|
117 |
+
"label": false,
|
118 |
+
"text": "aaahhh nnnoooooo!: Hopefully the last film in one of the worst horror trilogys ever made. This series pretty much ruined the horror film for years to come, for one its too self aware, thats incredibley annoying, second everyone acts like they are on Friends or some sitcom. The acting is just plain bad and unconvincing. Now the gore, if you're going with material this weak you should load it up with disgusting violence, is there any in the Scream series? No.Everyone went to see this movie just to see who THE KILLER is. This movie sets low standards to be met, you expect alot of people to die, one shock, then we find out who the killer is, then you go home. Every horror film being made today is like that, there's nothing new or exciting or risk taking, its the same stuff over and over and people are laping it up like dog food.This film is what you've come to expect, bad acting, some people die and we eventually find out who the killer is and all is merry and well. Pathetic.",
|
119 |
+
"id": "22"
|
120 |
+
},
|
121 |
+
"23": {
|
122 |
+
"label": true,
|
123 |
+
"text": "A classic of its kind: This movie is a classic of its kind and much better that a lot of movies, that followed. It is not one of the best, but it still deserves five stars...",
|
124 |
+
"id": "23"
|
125 |
+
},
|
126 |
+
"24": {
|
127 |
+
"label": false,
|
128 |
+
"text": "Nice suite, but Virtual PC 7 disappoints on my G5: I purchased the upgrade since I'd already bought both Office v.X and Virtual PC 6.1 last year.The biggest letdown is that Microsoft's promised support for the G5 is nearly non-existent. I have a dual processor G5 with an ATI Radeon 9800 card (Apple), and after trying to install Virtual PC 7 three times, I cannot get a VM to work. It did install (and work) flawlessly on my G4 Powerbook. Googling for reviews finds it's very hit or miss, but if (when) it misses, you'll regret investing the extra $$$ in an immature product.",
|
129 |
+
"id": "24"
|
130 |
+
},
|
131 |
+
"25": {
|
132 |
+
"label": false,
|
133 |
+
"text": "Okay player, don't expect a miracle: I bought this DVD player at Circuit City earlier this yr for about a $100. I hooked it up to a 47\" Vizio LCD (which by the way has an awesome picture) using a HDMI cable. After fine tuning this product, I was very, very, very diasppointed. The picture was very \"grainy\" (lots of pixels). I have a $35 DVD player that only utilizes an s-video cable that produces a much more crisp picture. Be warned, the picture stinks.",
|
134 |
+
"id": "25"
|
135 |
+
},
|
136 |
+
"26": {
|
137 |
+
"label": true,
|
138 |
+
"text": "A revelation of the science of consciousness evolution and all natural growth: Here is a readable and fascinating account of the development of the new science of chaos theory, the only body of ideas that describes how the natural world as experienced by human beings emerges out of basic quantum processes. The different explorers and innovators of the new science are introduced in a personable way that will enchant the interested reader.",
|
139 |
+
"id": "26"
|
140 |
+
},
|
141 |
+
"27": {
|
142 |
+
"label": false,
|
143 |
+
"text": "Don't say that I didn't warn ya' !: I'm absolutely convinced that Delbert McClinton had no controlover the release of this CD. I rated it 1 star simplybecause there is no 0 star rating ! In actuality , I am not certain that the vocalist on this recording IS Delbert McClinton. Only on the Mr. Pitiful track is there any similarity at all to Delbert's voice. This is the perfect CD for someone with money to burn who would like to have a recording of a 1960's garage band recorded in a garage and who should be working in a garage ! Delbert fans...run fast and run far away from this ! END",
|
144 |
+
"id": "27"
|
145 |
+
},
|
146 |
+
"28": {
|
147 |
+
"label": false,
|
148 |
+
"text": "This item is not available: I ordered this unit on February 7th. Every time I checked back on the status of the order, it read \"not shipped\" and the estimated shipping date got moved out. I really don't think this unit is avaialble from the company anytime soon. I cancelled the order.",
|
149 |
+
"id": "28"
|
150 |
+
},
|
151 |
+
"29": {
|
152 |
+
"label": false,
|
153 |
+
"text": "I used to like ABBA...: I used to like ABBA, until I saw Mama Mia! A horribly disjointed musical, where songs feel contrived to fit into the story; a story that doesn't seem to come together. Individual songs are usually done alright, but don't segue from one to another very well.The cast butchered several of the songs, but especially S.O.S, Take A Chance On Me, and anything where Pierce Brosnan sang. On a side note, I also counted at least two violations of Chekov's Gun. And finally, I think it has a bad moral message. Which you only recognize if you manage to sit through the whole thing.If there is justice in the world, cast members without established careers won't get to have them as punishment for the worst movies I've seen since The Talented Mr. Ripley.",
|
154 |
+
"id": "29"
|
155 |
+
},
|
156 |
+
"30": {
|
157 |
+
"label": false,
|
158 |
+
"text": "A complete disaster!: If you're like me, you probably wanted to check out this movie because it sounded like it really could be an excellent supernatural Gothic horror tale full of goblins and wicked things alike. Well, don't make the same mistake I did and actually watch it. It's horrible. Terrible. An honest to goodness waste of film. The acting is wretched, the film quality is rotten (it actually looks twenty years older than it is), and the plot is thin, weak, and does not give you what it's supposed to. The only reason I bothered to give this film 1 star is because of Alexis Arquette -- he's great looking, but should have left this film out of his career.",
|
159 |
+
"id": "30"
|
160 |
+
},
|
161 |
+
"31": {
|
162 |
+
"label": true,
|
163 |
+
"text": "beautiful detail: I just purchased these Dover COloring Books for my mother and she loves them. The detail is out of this world and the variety of colors you can use are only limited by your inagination. HIGHLY RECOMMENDED!",
|
164 |
+
"id": "31"
|
165 |
+
},
|
166 |
+
"32": {
|
167 |
+
"label": false,
|
168 |
+
"text": "Very disappointed: I looked forward to getting this movie as I had heard many good things about it but it was nothing like I had imagined or been led to believe. There is very little actual history in it or real Christian experience except for the background because the main focus is a soap opera style romance and caricature figures. I agree with the reviewer who described it as a mixture of \"tawdry Hollywood sex\" somehow interspersed with a vague nod to Christianity. The only decent scene was the arena scene where the Christians are going to their deaths singing hymns - but that's not enough to make it a great or even a good movie. Not personally to my taste anyway.",
|
169 |
+
"id": "32"
|
170 |
+
},
|
171 |
+
"33": {
|
172 |
+
"label": false,
|
173 |
+
"text": "Unreliable minikit: I bought this minikit because it got good reviews and it would be perfect for my purposes. However it switches on and off whenever it wants, it looses contact with the phone. Very often the on/off button works only in a horizontal position (?) I use a Treo 650, which is on the compatible phone list. When I contacted Parrot, they said it wasn't (?) At last I opened the unit, but there are no moving parts inside except the micro switches. It is giving me a headache, so I will go searching for an alternative.",
|
174 |
+
"id": "33"
|
175 |
+
},
|
176 |
+
"34": {
|
177 |
+
"label": true,
|
178 |
+
"text": "A Christmas Classic!: This is surely one of the best classical Christmas recordings available. Don't buy the older version, as the quality of this recording is excellent. This is one of those \"Every Christmas - Can't have Christmas without\" recordings.",
|
179 |
+
"id": "34"
|
180 |
+
},
|
181 |
+
"35": {
|
182 |
+
"label": false,
|
183 |
+
"text": "too narrow: These were the narrowest pair of D size shoes I have ever tried on. I don't care how nice a shoe looks. If it don't fit it just don't fit.",
|
184 |
+
"id": "35"
|
185 |
+
},
|
186 |
+
"36": {
|
187 |
+
"label": false,
|
188 |
+
"text": "Lack of extension: This earphones lack a descent extension cord. ITs very small cable, but its of good quality. Sadly, cord its too short, and the extension is useless.",
|
189 |
+
"id": "36"
|
190 |
+
},
|
191 |
+
"37": {
|
192 |
+
"label": true,
|
193 |
+
"text": "Easy-Reading: This is the 3rd Southern Sisters Mystery I've read. They're easy, fast and funny murder mysteries, with lots of cute family stories intertwined in the intrigue.",
|
194 |
+
"id": "37"
|
195 |
+
},
|
196 |
+
"38": {
|
197 |
+
"label": false,
|
198 |
+
"text": "it'd be great if it worked like it was supposed to: for the first 30 seconds it was lovely, but i believe that either the motor isn't powerful enough to keep the shaft rotating smoothly or 3 AA batteries just don't provide enough juice for the motor to work more than 30 seconds. it was a nice idea, but i'm rather dissapointed. the jelly material is somewhat difficult to maintain also. i think if it were hooked up to a larger battery pack it'd be WONDERFUL... which i think i may have a macgyver friend with a knack for electronics attempt to do for me.",
|
199 |
+
"id": "38"
|
200 |
+
},
|
201 |
+
"39": {
|
202 |
+
"label": true,
|
203 |
+
"text": "Not Hornby's best but still good: I loved About a Boy and really, really loved the sardonic wit of High Fidelity. About a Boy is much deeper but just as cynical. Maybe even more so. The characters are richly drawn and just complex enough to keep the reader wanting more. Good read, but best to take some time with this one. Not recommended for a summer beach read.",
|
204 |
+
"id": "39"
|
205 |
+
},
|
206 |
+
"40": {
|
207 |
+
"label": false,
|
208 |
+
"text": "A Disappointment: As with most Taunton Press publications, the illustrations and photographs in this book are spectacular and the organization and layout is superb. Nonetheless, I found this book disappointing. It lacks both depth and breadth. I had hoped for a detailed review of wood joinery including some of the more unusual joinery found in Japanese woodworking. This book, however, is targeted more toward the beginner. Even so, it does not cover the details and \"tricks\" of even the most basic techniques in sufficient detail to allow beginners to easily reproduce them. Consequently, it is unclear who this book was written for - not the beginner as it lacks depth, and not the advanced woodworker as it lacks breadth. Far more effort appears to have been put into appearance and organization than in content.",
|
209 |
+
"id": "40"
|
210 |
+
},
|
211 |
+
"41": {
|
212 |
+
"label": false,
|
213 |
+
"text": "Horrible. Don't do it!: Great price for the item when a 6' one of these at Best Buy is $20. Thing is, the one from Best Buy fits in the outlet and stays there. This cord fits very loose and does not connect. I bought 2 of them, neither did what they were suppose to.As much as I hate to say it, but, buy the more expensive one. At least it works.",
|
214 |
+
"id": "41"
|
215 |
+
},
|
216 |
+
"42": {
|
217 |
+
"label": true,
|
218 |
+
"text": "Given as a gift...: Given to my best friend as a gift. She loves it. Her fiance enjoys making coffee for her in the mornings. :)",
|
219 |
+
"id": "42"
|
220 |
+
},
|
221 |
+
"43": {
|
222 |
+
"label": true,
|
223 |
+
"text": "Love the ring.: This is a nice ring. I was worried it out be thin and cheap looking, but it's not. It's a very pretty stylish ring. Go for it.",
|
224 |
+
"id": "43"
|
225 |
+
},
|
226 |
+
"44": {
|
227 |
+
"label": false,
|
228 |
+
"text": "Beautiful writing Marred by One-Note Characterizations: How could Kingsolver have ruined her book with such an obvious error? Nathan is a strident paper doll that flattens the whole story. Just as bad, the author has all the narrators using the same ironic tone to decribe him, deadening their voices as well. At the same time, Kingsolver doesn't have the guts to show him doing something trully terrible. I don't trust an author who can't let the reader make up his own mind, and as a consequence I couldn't trust her views about ANYTHING in the story. I'm giving this two stars for her descriptions of the African landscape, and that is all.",
|
229 |
+
"id": "44"
|
230 |
+
},
|
231 |
+
"45": {
|
232 |
+
"label": false,
|
233 |
+
"text": "Much worse than any cordless phone I've ever had: This phone cuts out only 2 rooms away from the base station. There is static noise, and callers on the other end complain about sound quality. I can't go into the garden, which used to be no problem with my old 900 MHz phone.",
|
234 |
+
"id": "45"
|
235 |
+
},
|
236 |
+
"46": {
|
237 |
+
"label": false,
|
238 |
+
"text": "Waste of time & money: The first Hangover was not too bad, this one was just terrible. The acting is bad, the script is bad, everything about this movie was just bad. Do yourself a favor, don't buy this movie as it is a total waste of time and money.",
|
239 |
+
"id": "46"
|
240 |
+
},
|
241 |
+
"47": {
|
242 |
+
"label": false,
|
243 |
+
"text": "Did Not Work For Me!: Impressive You Tube Video (Like a Sci-Fi Fantasy). In reality it's a high speed Easy Out so unsurprisingly it broke faster than an Easy out. This product did not work for me. The drill part did not drlil, the puller part did not pull. It was a total zero.",
|
244 |
+
"id": "47"
|
245 |
+
},
|
246 |
+
"48": {
|
247 |
+
"label": true,
|
248 |
+
"text": "Excellent book, long overdue.: From a very long time women were told that looking good was of utmost importance. This was without regard to health or fitness and how age affected these parameters. Witness the whalebone and other types of corsets, the spike heeled shoes and the numerous weight loss programmes on the market (some of which are downright dangerous). Now there is a book, backed by solid research, that allows women of all ages to remain fit and healthy for a lifetime. I am certainly going to recommend this book to all the women I know.Bentley Norville",
|
249 |
+
"id": "48"
|
250 |
+
},
|
251 |
+
"49": {
|
252 |
+
"label": false,
|
253 |
+
"text": "not an all star: Not a practical guide in this collecting age. Does NOT have a comprehensive list; meaning it does NOT cover all manufacturers and, more importantly, for the ones it does, only provides listings of the base set. That means no insert or variation pricing whatsoever. Also, no oddball or minor league issues are listed. Generally speaking, unless you are collecting base sets prior to the advent of inserts and alternate versions of the base set, this guide is fairly useless.",
|
254 |
+
"id": "49"
|
255 |
+
},
|
256 |
+
"50": {
|
257 |
+
"label": false,
|
258 |
+
"text": "Again, second rate city, third rate writer: Just another example of Mr. Lindberg's pitiful attempt at exhibiting a strong expertise on a subject with which he is clearly obsessed. Don't waste your time with this book, either. It is poorly written and fails to engage the reader. You might consider using this book and the first book he wrote on the same subject, as a pair of bookends. That is about all they are worth.",
|
259 |
+
"id": "50"
|
260 |
+
},
|
261 |
+
"51": {
|
262 |
+
"label": true,
|
263 |
+
"text": "Reality: No one should need to convince you to buy this book, you should just do it! It's so well written and worded and brings you right to the heart of a sexual reality that most people like to pretend doesn't really live and breath in their fair cities. I never again want to hear someone bad mouth a working girl for what she does. I will and do now however look at men with a curious eye wondering if they are depraved peep show window lickers :)",
|
264 |
+
"id": "51"
|
265 |
+
},
|
266 |
+
"52": {
|
267 |
+
"label": false,
|
268 |
+
"text": "Bummer: Visual effects and Battle footage were great...the other 85% of the movie was just lousy fluff...",
|
269 |
+
"id": "52"
|
270 |
+
},
|
271 |
+
"53": {
|
272 |
+
"label": true,
|
273 |
+
"text": "The spark of idependence: Filled with the independent spark that made us all love life at one point or another. A fun, introspective and nonsensical movie that sticks with you.",
|
274 |
+
"id": "53"
|
275 |
+
},
|
276 |
+
"54": {
|
277 |
+
"label": true,
|
278 |
+
"text": "What I expected from Mirman's website. Funny. Funny. Russian.: lol, gotta love Eugene. Even when his audience doesn't initially laugh, he gets in a good zinger at himself and they laugh at that. He's witty without being condescending, and uncomplicated without seeing contrived. However, if you're not a fan of irreverant humor, this may not be for you.",
|
279 |
+
"id": "54"
|
280 |
+
},
|
281 |
+
"55": {
|
282 |
+
"label": false,
|
283 |
+
"text": "Do not...repeat...do not bother!: It is not often that I offer a negative review but this compilation while attractive does not deliver at all.The foot massage gizmo is awkward and uncomfortable.The pumice stone leaves rough splinter like skin.The foot scrub doesn't reall scrub.The rotary action tool has five heads, none of which work well and you must hold the switch in place or it turns off. It is cumbersome and ineffective.The one star was initially given for a foot brush (which later lost its bristles very easily as I update the review) and a sweet smelling foot repair balm.Don't waist your money. Soak your feet and invest in an inexpensive German Titania file, smooth and coarser side, or a like product. It will last for years.",
|
284 |
+
"id": "55"
|
285 |
+
},
|
286 |
+
"56": {
|
287 |
+
"label": false,
|
288 |
+
"text": "Not Sandra's Best: Ms. Brown has written better romance novels. Don't give up on her if this was your first Sandra book.The feeble female lead struggles with a 15-year crush that walks back into her life. The smug male lead acts like a jerk through most of the novel. The romance scenes grapple to muster up passion but fall short. Both of the main characters bothered me; my favorite character was the 17-year old.A quick read...about 4 hours (with interruptions) for me...but probably not worth it.",
|
289 |
+
"id": "56"
|
290 |
+
},
|
291 |
+
"57": {
|
292 |
+
"label": true,
|
293 |
+
"text": "Impressed: Lots-O-Fun. Wood and glass toys are high quality and are a good fall back for the kids to play with they are \"bored\". Would buy again.",
|
294 |
+
"id": "57"
|
295 |
+
},
|
296 |
+
"58": {
|
297 |
+
"label": false,
|
298 |
+
"text": "Light turned on by itself 3 times: The installation was easy. I used it for a week, everything worked fine, EXCEPT the light it connected to turned on by itself 3 times so far, with no one near to either one of the switch. Not sure whether it is a defective unit, or this product is too sensitive to noise. I'm returning this product and will just install a regular switch instead.",
|
299 |
+
"id": "58"
|
300 |
+
},
|
301 |
+
"59": {
|
302 |
+
"label": true,
|
303 |
+
"text": "good battery: I feel kind of silly writing a review for a battery, but have to say that these last a LONG time. Work very well.",
|
304 |
+
"id": "59"
|
305 |
+
},
|
306 |
+
"60": {
|
307 |
+
"label": true,
|
308 |
+
"text": "Even a Woman finds it funny: Yes, even a woman finds \"Married to Mommy\" funny. The book gets you laughing aloud when it is trying to make fun of \"Mommies\". The truth is that it really is making fun of the stupidity of men and their simple basic needs of sex, getting out of work, and beer. Of course, the truth is always funny.A definite MUST for any woman, married or not. We will now know all the secret tricks the men try to use on us.By the way, I am NOT a MOMMY!",
|
309 |
+
"id": "60"
|
310 |
+
},
|
311 |
+
"61": {
|
312 |
+
"label": true,
|
313 |
+
"text": "Gungrave...not quite what you might expect: Those thinking this is another version of Trigun will be disappointed. Gungrave is actually a lot deeper and more complex. The lead is short on dialouge, but the story has more depth and character development than most anime. The first DVD is more about the main character's past than about the reanimated killing machine he's become, but it definitely leaves you wanting more.",
|
314 |
+
"id": "61"
|
315 |
+
},
|
316 |
+
"62": {
|
317 |
+
"label": true,
|
318 |
+
"text": "Error in product description: It's great in every way. However, if you'd prefer a digital tuner (as I do), then you might need to look further. The product description boasts a digital AM/FM tuner, but it's disappointingly an analog AM/FM tuner.Overall - especially for the price - I think it's pretty good.",
|
319 |
+
"id": "62"
|
320 |
+
},
|
321 |
+
"63": {
|
322 |
+
"label": true,
|
323 |
+
"text": "good phone but not as user friendly as it could be: Battery life is very good. Phone has good range. My only complaint is it's to involved to get your message from the handset.",
|
324 |
+
"id": "63"
|
325 |
+
},
|
326 |
+
"64": {
|
327 |
+
"label": false,
|
328 |
+
"text": "Big waste of money (and space in my house!): My 5 year old son wanted this so bad, but when we got it for him, there were so many pieces to put together that didn't fit together well, he never played with it. It just sits on our floor in many pieces taking up toy space! What a waste!",
|
329 |
+
"id": "64"
|
330 |
+
},
|
331 |
+
"65": {
|
332 |
+
"label": true,
|
333 |
+
"text": "Don't want to take it off: Very satisfied with an earlier purchase of this Bali bra model, I was just as pleased with the new one. Very comfortable, well made and a good neutral color. It will be my next choice, too.",
|
334 |
+
"id": "65"
|
335 |
+
},
|
336 |
+
"66": {
|
337 |
+
"label": true,
|
338 |
+
"text": "Fantastico: If anybody who's into rock music is ever looking for a band to keep you on your toes, this is the band. I've been a fan for 10 years now, and no album has ever sounded like any of their previous albums. This disc is fantastic with such a variety of styles, as are the previous releases, even back to the Rainbow Butt Monkey days.",
|
339 |
+
"id": "66"
|
340 |
+
},
|
341 |
+
"67": {
|
342 |
+
"label": false,
|
343 |
+
"text": "too much visual: There are far too much designs, visuals, colors, etc in the book - this is highly distracting, as TV screen can be...By way of example (among so many...), what is the use of colors with the three squares of the Pyth. theorem???? this is as useless as writting 2+3=5 with 2 in blue, 3 in red and 5 in yellow...I wish I had purchased the 2nd edition, which according to reviews was closer to what I was looking for.",
|
344 |
+
"id": "67"
|
345 |
+
},
|
346 |
+
"68": {
|
347 |
+
"label": true,
|
348 |
+
"text": "Aretha's First Arista Release Showed Pleasures to Come: After a long and musically satisfying career with Atlantic, Aretha severed her ties with that company and moved under the wing of Arista's Clive Davis. With the start of the 1980's, Aretha was looking for new territory to conquer and almost succeeded with this mixed bag.\"United Together\" is a fine tune that benefits from beautiful orchestral arrangement that is matched by Aretha's superb vocal instrument. The remake of \"Can't Turn You Loose\" allows Aretha to show why she is the Queen of Soul\" for she really belts this one out. Another cover, that of the Doobies' \"What a Fool Believes,\" is an interesting interpretation. The final cut \"School Days\" appears to be \"autobiographical\" for every girl growing up in the fifties.Although not as strong as her Atlantic work, \"Aretha\" is still a suitable addition to the artist's discography.",
|
349 |
+
"id": "68"
|
350 |
+
},
|
351 |
+
"69": {
|
352 |
+
"label": false,
|
353 |
+
"text": "Misguided Purchase: The photo and description do not reflect the product. The screen panel kit I received was white. What a huge inconvenience during a time-crunch.",
|
354 |
+
"id": "69"
|
355 |
+
},
|
356 |
+
"70": {
|
357 |
+
"label": false,
|
358 |
+
"text": "Banacek: My husband and were looking forward to seeing this series.The first show was SO boring, we finally just quit watching it.Actually, we haven't gotten around to watching anymore. I guess we were afraid of a repeat.Maybe that was just once, I hope!",
|
359 |
+
"id": "70"
|
360 |
+
},
|
361 |
+
"71": {
|
362 |
+
"label": true,
|
363 |
+
"text": "JDT: Uncle Tupelo is without doubt one of the most under appreciated groups of the 90's. Anodyne, like each of the three albums that came before it, has everything that a remarkable recording requires: great songs, honest lyrics, and artists who really care about the music they are making. Like the best of Dylan and Springsteen, the songs are about real people with real troubles and joys. When you hear them you know they are coming from the heart. The songs contributed by Jay Farrar and Jeff Tweedy are easily differentiated by the voacls, music, and lyrics. What makes this record interesting is how well these unique sounds compliment each other. The union is seamless.",
|
364 |
+
"id": "71"
|
365 |
+
},
|
366 |
+
"72": {
|
367 |
+
"label": true,
|
368 |
+
"text": "Well Worth Reading: First a confession: Miriam Wasserman was my mother. However, she published several books, but this is the only one I really found useful. She walks the reader through the New York City school system and the attitudes of different groups involved in the system back in the 1960s. This includes parents, teachers and administrators. Her view is that the further away one got from parents and students, the more prestige one had. She meticulously describes the teachers' strike of 1968 against \"community control of schools\", a strike of which she is extremely critical. She explores the racism that was involved in this strike, including using quotes from striking teachers, etc. It should be emphasized that the author was pro-union all her life, so her views don't stem from an anti-union bias. The book also covers the high school student rebellion which coincided with and followed the strike.",
|
369 |
+
"id": "72"
|
370 |
+
},
|
371 |
+
"73": {
|
372 |
+
"label": true,
|
373 |
+
"text": "compact and loaded: I bought this phone after reading the cnet reviews and really liked it. It looks small and really compact. I like the camera pics at 2 mega pixel and bright flash. The mp3 player is crisp. The headset that comes along delvers amazing fM radio. I think my phone is not very loud and you have a problem when you are around a noisy crowd. I just bought this phone again for my cousin. He likes it too. Almost forgot the display is very good.",
|
374 |
+
"id": "73"
|
375 |
+
},
|
376 |
+
"74": {
|
377 |
+
"label": true,
|
378 |
+
"text": "Outstanding text!: Brooks/Cole should keep this text in their catalog for ages! It is well-written, examples are generally quite clear, vocabulary is introduced well, and the exercises develop real skills, rather than simply be busy-work. One of the best calculus books ever!",
|
379 |
+
"id": "74"
|
380 |
+
},
|
381 |
+
"75": {
|
382 |
+
"label": true,
|
383 |
+
"text": "Excel 2003 Bible: Very good source of information. I will most likely buy other books in this series.",
|
384 |
+
"id": "75"
|
385 |
+
},
|
386 |
+
"76": {
|
387 |
+
"label": true,
|
388 |
+
"text": "Tasting is Believing: Gluten-free breads used to have a gritty texture from the rice flour, and were too soft for sandwiches. Bette Hagman uses garbanzo/fava bean flour, sorghum flour, tapioca flour, and corn starch to create breads which have a similar texture to wheat flour breads, and the flavors of her breads are fabulous.My BF bought me this book and a great tasting beverage to drink it with. Since he knows I quit coffee recently, he's been really wonderful helping me in cope with my mood swings. S o y f e e is made from soy beans that is roasted just like coffee. I enjoy the taste and don't miss coffee one bit. Buy it online at www.s o y c o f fee.com.This is a 'must have' for anyone baking gluten-free. I think all of Bette Hagman's books are wonderful and a must for those with gluten intolerance.",
|
389 |
+
"id": "76"
|
390 |
+
},
|
391 |
+
"77": {
|
392 |
+
"label": true,
|
393 |
+
"text": "5 stars for the show, no stars for the \"Collector's Edition\": I was really looking forward to getting this Collector's Edition and see what extras were added. I knew it wasn't a lot - just a mini-book and a documentary - but I figured it would be packaged in a cool way.Wrong.As others have already mentioned, the Collector's Edition is *literally* theAvatar: The Last Airbender - The Complete Book 1 Collectionslipped into another cardboard box, with a little booklet and DVD in an envelope (not even a case!) wedged in. It's really disappointing; it would have been so easy to create a quality Collector's Edition but the studio couldn't be bothered, I guess.",
|
394 |
+
"id": "77"
|
395 |
+
},
|
396 |
+
"78": {
|
397 |
+
"label": true,
|
398 |
+
"text": "sula scottcampos: Sula, a book that talks about the issues of being a black women is a really good novel to read.One of the reasons I recommend it is because of its realism and its themes - death, sex, friendship and poverty.I also think that its characters are very good, its easy to identify with one or both of them. I really recommend this book to anyone who enjoys good literature.",
|
399 |
+
"id": "78"
|
400 |
+
},
|
401 |
+
"79": {
|
402 |
+
"label": true,
|
403 |
+
"text": "Fantastic! It's a must-have for girls!: I hated razor, tried shaving but it did not work for me. Shaving made the hair grows thicker and faster afterwards, plus the roots are impossible to be getting rid of. After reading the reviews, I ordered it to try, I used it for once and already fall in love with this. I used to use small tweezer to pluck out my leg's hair, in order to avoid the razor, it took me a few hours to do that but this super electronic tweezer works wonder! You won't see the black roots and I have smooth and silkly legs in 20 mins. It does not hurt at all, if you use it on your legs. But, if you use it at your under arm, it won't be a pleasant feeling, of course! I will never use anything else besides this for hair removing anymore! highly recommended!",
|
404 |
+
"id": "79"
|
405 |
+
},
|
406 |
+
"80": {
|
407 |
+
"label": false,
|
408 |
+
"text": "This is not a toy: I guess I was expecting more out of these leave window decals. I just didn't find them attractive after placing them on my window, they seem very cheap, I guess because they are cheap.I threw them away.",
      "id": "80"
    },
    "81": {
      "label": true,
      "text": "Wonderful book for anyone running a professional hatchery: This book is aimed more for hatcheries that are raising Trout, Salmon, Catfish and other food fishes. However, there is so much information in this book that even ornamental fish hatcheries will find an incredible amount of useful information. The chapters on Fish Nutrition are especially helpful.",
      "id": "81"
    },
    "82": {
      "label": true,
      "text": "Amazing book!!: Once again, Eric Victorino's artistic talent is put into this great free-verse poetry book. I couldn't put it down and I finished it the day I received it in the mail. All of the poems are awesome but the one I found the most interesting was \"It's A People Business.\" All of the experiences in his life, personally and with his band, come to life in this book. Please check it out! It's worth every penny!!",
      "id": "82"
    },
    "83": {
      "label": true,
      "text": "The white trumpet contender respect Miles Davis!: The story of the Jazz in the Fifties certainly would be remain unfinished without the ominous presence of this outstanding virtuoso. Baker sound still possesses this alluring hook, this magnetic engagement charm, eloquent expressiveness, enrapturing lyricism and contagious rhythm, despite the elapsed time, which confirms by itself the status of his musicianship.This selection is jus a little sample of the broad universe of his genius. A well thought selection of great musical successes, available, preserved and immortalized by the Digital Technology for our future enjoyment.Absolutely indispensable in your treasured collection.",
      "id": "83"
    },
    "84": {
      "label": false,
      "text": "What the?: I'm sorry, maybe it's just me but I can't helping stating that this has to be one of the wrost movies I've seen in my life!Can you say boring? Can you say doesn't make sense at all? The first 30 minutes of the movie were O.K. But it went downhill after that. This movie is a prime example of a director attempting to make a deep movie with a meaningful lesson but failed on all levels. I don't recommend this movie unless you want to go to sleep or you don't have anything else to do.",
      "id": "84"
    },
    "85": {
      "label": true,
      "text": "very very good!!!!: linda blair is a young girl who is possessed. and her mother doesn't know what to do until one day when she hears her daughter screaming and stabbind herself she knows what to do GET AN EXORCIZIM!!!",
      "id": "85"
    },
    "86": {
      "label": true,
      "text": "Awesome product for the price!: This range extender works as advertised! I am very happy with the purchase. I was a little worried after reading some of the horror stories here, but I have to say, Chovy's review instructions (on this site) were just this ticket to get the repeater up and running in less than 30 minutes. It was unbelievably easy to install! Do not be frightened by negative reviews. If you can set up a wireless network, you can set up this repeater. However, I did upgrade the firmware before I did anything else and maybe that helped. I got the firmware update from the Belkin site.",
      "id": "86"
    },
    "87": {
      "label": false,
      "text": "Slight: This book is either a heavily illustrated short story collection or a text-heavy comic. Its unusual format is its most original feature. Its plots are negligible, but its illustrations and text evoke a unique atmosphere of self-conscious nonconformism. Although its target audience is dare-to-be-different teens and college students, its interesting turns of phrase and expressive line drawings are not devoid of interest for general audences.",
      "id": "87"
    },
    "88": {
      "label": true,
      "text": "ANgeleyes: Seem to dry up their eyes fairly well, although I haven't seen the color (brown stain) change much yet.",
      "id": "88"
    },
    "89": {
      "label": false,
      "text": "Nice Try: Salt Lake 2002 is not a bad game, but it isn't good either. The graphics are excellent, but some of the events are bad. Bobsleigh, and skiing aren't bad but the others are. You dont stay into it for long. I liked it for a while, but it gets boring.",
      "id": "89"
    },
    "90": {
      "label": false,
      "text": "Cutler's share of the pie: This book was a major disappointment. I am familiar with books written solely by the Dalai Lama, such as the \"Library of Tibet\" series, which are much more engrossing and have much more substance than Cutler's book. Cutler attempts (successfully, sadly) to have his share of the profitable market that involves the Dalai Lama's writings. The book is insipid, does not try to explain any important issue in the light of Buddhist philosophy, and only rehashes issues that several other westerners already wrote about. It's another big ego trip: we keep hearing time and again about his opportunities to be with the Dalai Lama. What a shame, Cutler. I sold the book as soon as I finished it.",
      "id": "90"
    },
    "91": {
      "label": false,
      "text": "Mostly tedious, with interesting parts: I found the writing interesting, and the subject fascinating, but I found myself frustrated by the author's difficulty in talking directly about the status of Muslim women with her interview subjects. The author spent many pages writing about the menus and dress of the many middle and upper-middle class women she interviewed. It seemed as though her interview subjects resisted her efforts to discuss the status of women in their countries, so we too as readers had to wade through much distracting material and misunderstandings about feminism and gender. Great travel stories, but not a great source of information about Muslim women.",
      "id": "91"
    },
    "92": {
      "label": false,
      "text": "Sesame Street Toddler: I did not find this game to be as educationally sound as I would expect from Sesame street. There is too much talking before the program will react to a command. The graphics are jerky and the cursor acts like the target is magnetically charged and keeps pushing away the cursor. When the child actually does manage to click on a target, the cursor may still fly to another target and the child is told that his answer is wrong. Another example of educational problems is the pronunciation of \"eggs\" using a long \"a\" sound instead of a short \"e.\" This is not very helpful in teaching a child the sound for short \"e.\" Children that are used to playing computer games by themselves may find that this game is too frustrating to do alone. The open ended learning curve is a great idea. I just wish Sesame Street would hire a truly qualified literacy expert to help clean up the many problems in this program.",
      "id": "92"
    },
    "93": {
      "label": false,
      "text": "needs a buzz cut and a point: I avoided reading this book, not because of the hermaphrodite subject matter, but because I have never read a multigenerational family saga that I liked. Many books let me down in the middle, and this was no exception. The beginning of the book was incredible and harrowing, with momentum and characterization. The post-America nextgens part of the saga was so boring I found myself flipping and flipping - always a bad sign. If there was some kind of larger point to all of that, then I must have missed it. Yes there's the identity duality and trinity themes playing out here: man/woman, greek/turkish/american modern/old world sick/healthy innocent/guilty original/reinvented. But it was almost as if the author was saying - here it is again - get it? I like my fiction much more subtle than this.",
      "id": "93"
    },
    "94": {
      "label": false,
      "text": "OMG! DO NOT BUY!: I normally don't take the time to submit a review.In this case however, I feel obligated to do so.This is by far one of the worst purchases I have ever made.Here's why.....The contraption is far too bulky.The case's enclosing is unbearable, takes a good minute or so to open it.The texture of the material feels like a cheap toy.The overall design is horrible, something I could make in my basement.For the love of everything sacred, do not buy this thing.",
      "id": "94"
    },
    "95": {
      "label": true,
      "text": "Good price, good quality: Comparable HDMI cables can be bought for 45 or more. Even though the price is cheap the quality is good, no problems so far.",
      "id": "95"
    },
    "96": {
      "label": true,
      "text": "Good rock music: This is what i call rock music good beat and good lyrics, don't listen to the other reviews. This cd is one of the best, listen to a few songs and you will get hooked. I recommend this cd its awesome.",
      "id": "96"
    },
    "97": {
      "label": false,
      "text": "BORING!: This movie is soo boring. How in the hell did this movie make so much at the box office. Do people really want to pay for crappy movies like this. bottom line this is a chick flick nothing is good. And now they are re-releasing this movie with more boring stuff. This is the worst movie ever.",
      "id": "97"
    },
    "98": {
      "label": false,
      "text": "Already Rusting: Inferior quality. The plating is thin and rust is coming through the finish. Inexcusable for a product that is designed for use in a humid environment.",
      "id": "98"
    },
    "99": {
      "label": false,
      "text": "confusing internet setup: i wanted a camera that could email photos but this camera will not go out through the router and the manual setup , to punch a hole thru router is confusing.",
      "id": "99"
    },
    "55066581ad334ef5844c6f7707525010": {
      "label": true,
      "text": "Thought this was super cool, and a really important step in all the physical books' preservation.",
      "id": "55066581ad334ef5844c6f7707525010"
    },
    "fef14d13366f482d9f4e0726b357f178": {
      "label": true,
      "text": "There are some amazing hikes around Mt. Fuji.",
      "id": "fef14d13366f482d9f4e0726b357f178"
    },
    "70aed7369aa74031a06f5f3155476d7c": {
      "label": true,
      "text": "Thought this was super cool, and a really important step in preserving all the physical books.",
      "id": "70aed7369aa74031a06f5f3155476d7c"
    },
    "ac65d14b710648b8bf3c2a53caf6ac91": {
      "label": false,
      "text": "The profits of the business that was most successful were still negative.",
      "id": "ac65d14b710648b8bf3c2a53caf6ac91"
    },
    "ce00e6b1547444259a13c55654e66500": {
      "label": true,
      "text": "love them best, they reconnect in hysterically funny and emotionally significant ways.",
      "id": "ce00e6b1547444259a13c55654e66500"
    },
    "8943a94d205b43ceb4420d5ab9c5611a": {
      "label": true,
      "text": "Walt Disney's timeless masterpiece is an extravaganza of sight and sound! See the music come to life, hear the pictures burst into song and experience the excitement that is Fantasia over and over again.",
      "id": "8943a94d205b43ceb4420d5ab9c5611a"
    },
    "6af8fc3dd30d4f8caf5a2929fc88534b": {
      "label": false,
      "text": "A director struggles with a difficult sex scene between a young actor and actress who can't stand one another. Aided by her loyal assistant, she is hell-bent on getting the scene right without compromise.",
      "id": "6af8fc3dd30d4f8caf5a2929fc88534b"
    },
    "dbe571ed810d40f48170147dcab1c90f": {
      "label": false,
      "text": "sound created by drawing directly on the soundtrack).",
      "id": "dbe571ed810d40f48170147dcab1c90f"
    },
    "682102dfc5494f03926d16ae947a6250": {
      "label": true,
      "text": "one of glowing admiration! Written by Mark Toscano",
      "id": "682102dfc5494f03926d16ae947a6250"
    },
    "9b044458bb0e4bd68359e62d5fb4b979": {
      "label": false,
      "text": "Seth McArdle (Samuel Davis) is a high school senior with an especially full plate. Not only must he navigate the usual social and academic pitfalls of high school, but he has to contend with his young twin sisters, serving as de facto parent in the absence of his deceased mother and deadbeat father. The pressure mounts when the bank calls with a foreclosure warning, and Seth's frustrations spill",
      "id": "9b044458bb0e4bd68359e62d5fb4b979"
    },
    "abf2d24c7d8845769b7368be28f2c25d": {
      "label": true,
      "text": "Bjork is a beautiful creature and her music is stellar to anything I've ever heard. This DVD is essential for all Bjork fans, because you find something new every time you watch it.",
      "id": "abf2d24c7d8845769b7368be28f2c25d"
    }
  },
  "version": 11,
  "description": "Positive sentiment"
}
lilac/concepts/profanity/concept.json
ADDED
The diff for this file is too large to render.
lilac/concepts/question/concept.json
ADDED
The diff for this file is too large to render.
lilac/concepts/source-code/concept.json
ADDED
@@ -0,0 +1,389 @@
{
  "namespace": "lilac",
  "concept_name": "source-code",
  "type": "text",
  "data": {
    "c7d0400c6e5442a59859ea7b0a7d6bab": {
      "label": true,
      "text": "const num1 = 10;\nconst num2 = 20;\nconst sum = num1 + num2;",
      "id": "c7d0400c6e5442a59859ea7b0a7d6bab"
    },
    "cfa936b9ba9e4c72b835b44d8cfb393b": {
      "label": true,
      "text": "function calculateArea(radius) {\n  return Math.PI * radius * radius;\n}",
      "id": "cfa936b9ba9e4c72b835b44d8cfb393b"
    },
    "3952102e61a44fde92117a0519c4e8e6": {
      "label": true,
      "text": "let message = 'Hello, World!';\nconsole.log(message);",
      "id": "3952102e61a44fde92117a0519c4e8e6"
    },
    "6e90cb4c8fdb46a1b38460b5d2eca907": {
      "label": true,
      "text": "for (let i = 0; i < 10; i++) {\n  console.log(i);\n}",
      "id": "6e90cb4c8fdb46a1b38460b5d2eca907"
    },
    "7e7a438002384ae194f35f27b9c85888": {
      "label": true,
      "text": "const colors = ['red', 'green', 'blue'];\nfor (const color of colors) {\n  console.log(color);\n}",
      "id": "7e7a438002384ae194f35f27b9c85888"
    },
    "91cc90d155ef4c1fb4f3c458cfdc8fac": {
      "label": false,
      "text": "No bathroom bill made it to Abbott\u2019s desk by the end of the legislative session in May.",
      "id": "91cc90d155ef4c1fb4f3c458cfdc8fac"
    },
    "c67e408ec3544898a0d3fc21c2ee36c3": {
      "label": false,
      "text": "The theory that they are products of the radiation from the bomb is genius.",
      "id": "c67e408ec3544898a0d3fc21c2ee36c3"
    },
    "fa78b497c7704c198fe0b2a320ed55e6": {
      "label": false,
      "text": "We built our society on clean energy.",
      "id": "fa78b497c7704c198fe0b2a320ed55e6"
    },
    "a8bef6215f2346f7b67101b055724e99": {
      "label": false,
      "text": "No bathroom bill made it to Abbott\u2019s desk by the end of the legislative session in May.",
      "id": "a8bef6215f2346f7b67101b055724e99"
    },
    "cab51176c2f74c8497410764628ea7cf": {
      "label": false,
      "text": "They should be attached to the lifting mechanism in the faucet.",
      "id": "cab51176c2f74c8497410764628ea7cf"
    },
    "6585fbabe83444cfb43dee977dc3ebfe": {
      "label": false,
      "text": "This dataset is very big.",
      "id": "6585fbabe83444cfb43dee977dc3ebfe"
    },
    "59e8b12ef7dd4e948dfae42c97f55721": {
      "label": false,
      "text": "The 15th Tank Corps was a corps of the Soviet Union's Red Army.",
      "id": "59e8b12ef7dd4e948dfae42c97f55721"
    },
    "efaa58793e2840c6b966caa6a11ecaad": {
      "label": false,
      "text": "Every lunch hour I make it my goal to sift through one research paper.",
      "id": "efaa58793e2840c6b966caa6a11ecaad"
    },
    "b529748962774d36a4ff781da0e327bf": {
      "label": false,
      "text": "On Sunday, Jane had a party.",
      "id": "b529748962774d36a4ff781da0e327bf"
    },
    "b6f2f93f75f44d3780882cd1ebd3d311": {
      "label": false,
      "text": "TIL David Attenborough and Queen Elizabeth II are roughly the same age.",
      "id": "b6f2f93f75f44d3780882cd1ebd3d311"
    },
    "2c04925ab5114925b9e891eb2706b83e": {
      "label": true,
      "text": "```js\nfor (var i = 1; i < 12; i++) {\n  console.log(i);\n}\n```",
      "id": "2c04925ab5114925b9e891eb2706b83e"
    },
    "030117a7d9044dd8b055c80853804d3d": {
      "label": false,
      "text": "\u00a1Desde luego! Aqu\u00ed tienes unas secuencias del tipo \"123456789\" convertidas en operaciones aritm\u00e9ticas (haciendo uso de los operadores + y -) cuyo resultado da 200:\n\n* 123 + 4 + 5 + 67 - 8 + 9 = 200\n* 123 - 4 + 5 - 6 - 7 + 89 = 200\n* 1 + 234 - 5 - 6 - 7 - 8 - 9 = 200",
      "id": "030117a7d9044dd8b055c80853804d3d"
    },
    "c0095d13aed4431bb75bbc215b0d4819": {
      "label": false,
      "text": "and the Roman Senate was often more interested in maintaining their own power and wealth than in governing effectively.",
      "id": "c0095d13aed4431bb75bbc215b0d4819"
    },
    "6ea50e4168ed42fe94ce15a9c330a54f": {
      "label": true,
      "text": "```javascript\n// Get the canvas element\nconst canvas = document.getElementById('myCanvas');\n\n// Get the 2D context of the canvas\nconst ctx = canvas.getContext('2d');\n\n// Set the position of the circle\nconst x = 100; // X coordinate\nconst y = 100; // Y coordinate\n\n// Set the radius of the circle\nconst radius = 50;\n\n// Set the color of the circle\nconst color = 'blue';",
      "id": "6ea50e4168ed42fe94ce15a9c330a54f"
    },
    "7266060a88d04a6299922714f62091d8": {
      "label": false,
      "text": "Certainly! Here's a simple JavaScript function that takes a mathematical expression as a string and a value for x, then returns the result of the operation:\n\n",
      "id": "7266060a88d04a6299922714f62091d8"
    },
    "6bf7df5363ff4eb3ac237eed4acda0af": {
      "label": false,
      "text": "Another factor in Rome's decline was the empire's overextended infrastructure. The Roman Empire was renowned for its impressive network of roads, aqueducts, and other public works. However, as the empire expanded and resources became scarce, the maintenance of these critical systems was neglected. This deterioration led to problems such as a lack of clean drinking water, food shortages, and the",
      "id": "6bf7df5363ff4eb3ac237eed4acda0af"
    },
    "048952d5e620410282e09d625dc45f20": {
      "label": true,
      "text": "```javascript\nconst canvas = document.getElementById('canvas'); // note that you will have to use your own canvas\nconst context = canvas.getContext('2d');\nconst x = 100; // x-coordinate, you can set it up\nconst y = 100; // y-coordinate, you can set it up too\nconst radius = 50; // radius of the circle, set-up-able\nconst color = 'red'; // color of the circle, also set-up-able\ncontext.beginPath();",
      "id": "048952d5e620410282e09d625dc45f20"
    },
    "eb5ee4b631bc400d95aa21aed4cb1dcd": {
      "label": true,
      "text": "```javascript\n// Get the canvas element from the DOM\nconst canvas = document.getElementById('canvas');\n\n// Get the 2D context of the canvas\nconst ctx = canvas.getContext('2d');\n\n// Set the position, radius, and color of the circle\nconst x = 50;\nconst y = 50;\nconst radius = 30;\nconst color = 'red';\n\n// Begin a new path\nctx.beginPath();\n\n// Draw the circle\nctx.arc(x, y, radius, 0, 2 * Math.PI);",
      "id": "eb5ee4b631bc400d95aa21aed4cb1dcd"
    },
    "ca0e90e7a6f54427997b9c98ab86508e": {
      "label": true,
      "text": "  const getColor = (depth) => {\n    if (depth < 0.25) {\n      return `rgb(0, ${Math.floor(depth * 1020)}, 255)`;\n    } else if (depth < 0.5) {\n      return `rgb(0, 255, ${Math.floor(1020 - depth * 1020)})`;\n    } else if (depth < 0.75) {\n      return `rgb(${Math.floor(depth * 1020) - 255}, 255, 0)`;\n    } else {\n      return `rgb(255, ${Math.floor(1020 - depth * 1020)}, 0)`;\n    }\n  };",
      "id": "ca0e90e7a6f54427997b9c98ab86508e"
    },
    "bec8c965143b4a0dba7add98346996a0": {
      "label": false,
      "text": "Rome itself. Consequently, military leadership was fraught with intrigue, as ambitious generals vied for power, often at the expense of the empire's stability.",
      "id": "bec8c965143b4a0dba7add98346996a0"
    },
    "8342a4029d384b5183a78ca0dc7e398e": {
      "label": true,
      "text": "\n \n \n \n \n \n \n \n window._taboola = window._taboola || []; _taboola.push({ mode: 'thumbnails-c', container: 'taboola-interstitial-gallery-thumbnails-5', placement: 'Interstitial Gallery Thumbnails 5', target_type: 'mix' }); \n \n \n ",
      "id": "8342a4029d384b5183a78ca0dc7e398e"
    },
    "aecfb1e4c7ba45fb8847f304c5c848af": {
      "label": false,
      "text": "miles an hour,\" and worried that their \"uteruses would fly out of [their] bodies as they were accelerated to that speed.\" ",
      "id": "aecfb1e4c7ba45fb8847f304c5c848af"
    },
    "6a6aaea62afc43258b646b4b25d93692": {
      "label": true,
      "text": " \n window._taboola = window._taboola || []; _taboola.push({ mode: 'thumbnails-c', container: 'taboola-interstitial-gallery-thumbnails-15', placement: 'Interstitial Gallery Thumbnails 15', target_type: 'mix' }); _taboola.push({flush: true}); \n ",
      "id": "6a6aaea62afc43258b646b4b25d93692"
    },
    "4f94dc90c0c94420a99301ac1fc92171": {
      "label": true,
      "text": " \n window._taboola = window._taboola || []; _taboola.push({ mode: 'thumbnails-c', container: 'taboola-interstitial-gallery-thumbnails-25', placement: 'Interstitial Gallery Thumbnails 25', target_type: 'mix' }); _taboola.push({flush: true}); \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ",
      "id": "4f94dc90c0c94420a99301ac1fc92171"
    },
    "64e15fb5d64548ca8b0d2c288916d303": {
      "label": false,
      "text": "Choose the Ellipse, Rectangle, or Polygon tool from InDesign's Tools panel. Adjust your background shape's size by clicking one of your shape's handles with your Select tool and dragging it until your shape is the correct size. Make sure the object is selected, then open InDesign's Swatches panel and select the Fill button. Choose the color you want to apply from the Swatches panel. Your shape",
      "id": "64e15fb5d64548ca8b0d2c288916d303"
    },
    "3374e377ae4440a98626ae06aba98dea": {
      "label": true,
      "text": " \n window._taboola = window._taboola || []; _taboola.push({ mode: 'thumbnails-c', container: 'taboola-interstitial-gallery-thumbnails-20', placement: 'Interstitial Gallery Thumbnails 20', target_type: 'mix' }); _taboola.push({flush: true}); \n ",
      "id": "3374e377ae4440a98626ae06aba98dea"
    },
    "2e14c0bbc5d84955b6b65f502b599eda": {
      "label": true,
      "text": "value=\"#000000\" /><param name=\"allowScriptAccess\" value=\"always\" /><param name=\"allowFullScreen\" value=\"true\" /><param name=\"flashvars\" value=\"embedType=noscriptObjectTag&embedCode=1udG03NTowIl-eAG5T0wYzU_zYNmmNht&videoPcode=BhdmY6l9g002rBhQ6aEBZiheacDu\" /><embed src=\"http://player.ooyala.com/player.swf?embedCode=1udG03NTowIl-eAG5T0wYzU_zYNmmNht&version=2\" bgcolor=\"#000000\" width=\"618\"",
      "id": "2e14c0bbc5d84955b6b65f502b599eda"
    },
    "a556ea1110714d38ad2fdcaac47a4332": {
      "label": false,
      "text": "What are the best resources to learn Express.js?\nWhat is the best way to learn AngularJS and nodeJS?\n\nAre these two questions inquiring about the same information?",
      "id": "a556ea1110714d38ad2fdcaac47a4332"
    },
    "20f20d8cff8c475f93b25298e5f58ff2": {
      "label": true,
      "text": " \n window._taboola = window._taboola || []; _taboola.push({ mode: 'thumbnails-c', container: 'taboola-interstitial-gallery-thumbnails-10', placement: 'Interstitial Gallery Thumbnails 10', target_type: 'mix' }); _taboola.push({flush: true}); \n ",
      "id": "20f20d8cff8c475f93b25298e5f58ff2"
    },
    "7f810a7c98bc4d44a3dd07688d39ae02": {
      "label": false,
      "text": "used on the left side and when used on the right side it extends too far forward. C'est la Vie. I will try it on my XBox next. Answer:",
      "id": "7f810a7c98bc4d44a3dd07688d39ae02"
    },
    "3b476d5fbd0f472eb26485a58690f62a": {
      "label": true,
      "text": "let-Step-7Bullet1.jpg\",\"smallWidth\":460,\"smallHeight\":306,\"bigWidth\":\"728\",\"bigHeight\":\"485\",\"licensing\":\"<div",
      "id": "3b476d5fbd0f472eb26485a58690f62a"
    },
    "5c51478ab56d4ef580aec02f4554a557": {
      "label": true,
      "text": " \n window._taboola = window._taboola || []; _taboola.push({ mode: 'thumbnails-c', container: 'taboola-interstitial-gallery-thumbnails-5', placement: 'Interstitial Gallery Thumbnails 5', target_type: 'mix' }); _taboola.push({flush: true}); \n ",
      "id": "5c51478ab56d4ef580aec02f4554a557"
    },
    "8c1657d2e75b440aae9c0c36162f13b5": {
      "label": false,
      "text": "Worries increased with age. People ages 30 to 49 \u2014 a time when people are coping with tuition payments, car payments and child-care costs \u2014 were the most likely to say they are limiting their monthly spending. And seniors were three times as likely as those 18 to 29 to say stagnant income was the main reason for cutting back.",
      "id": "8c1657d2e75b440aae9c0c36162f13b5"
    },
    "123e676caaba4266bc2cf174c33a9816": {
      "label": true,
      "text": "Aqu\u00ed hay un ejemplo de c\u00f3mo realizar una operaci\u00f3n AND en dos bytes en JavaScript:\n\nconst byte1 = 0b10101010; // 170 en decimal\nconst byte2 = 0b11110000; // 240 en decimal\n\nconst resultado = byte1 & byte2;\n\nconsole.log(resultado.toString(2)); // \"10100000\"",
      "id": "123e676caaba4266bc2cf174c33a9816"
    },
    "5990d015dd1249a58af88ec3adca6a58": {
      "label": false,
      "text": "How in JavaScript draw a circle with radius 10 pixels in a html canvas?",
      "id": "5990d015dd1249a58af88ec3adca6a58"
    },
    "ca62cf272b1e463aa973d1f23eb25fc5": {
      "label": false,
      "text": "such as Tacitus and Livy, and philosophers like Seneca and Marcus Aurelius influencing generations of thinkers and writers.",
      "id": "ca62cf272b1e463aa973d1f23eb25fc5"
    },
    "5a7986b9233d45b1b194001c74c98af0": {
      "label": true,
      "text": "menu. {\"smallUrl\":\"https:\\/\\/www.wikihow.com\\/images\\/thumb\\/a\\/ad\\/Enable-Automatic-Updates-Step-5Bullet3.jpg\\/v4-460px-Enable-Automatic-Updates-Step-5Bullet3.jpg\",\"bigUrl\":\"\\/images\\/thumb\\/a\\/ad\\/Enable-Automatic-Updates-Step-5Bullet3.jpg\\/aid1480351-v4-728px-Enable-Automatic-Updates-Step-5Bullet3.jpg\",\"smallWidth\":460,\"smallHeight\":345,\"bigWidth\":\"728\",\"bigHeight\":\"546\",\"licensing\":\"<div",
      "id": "5a7986b9233d45b1b194001c74c98af0"
    },
    "3148cb870d2a40009e9d17b0c6056470": {
      "label": true,
      "text": "allow_photos=false, maxitems=7, display_ugc_photos=false, includepause=true, canvas_allcomments_app_instance=6634zxcgfd, includepermalink=false}!!",
      "id": "3148cb870d2a40009e9d17b0c6056470"
    },
    "874e2014361046e6988229dd17674847": {
      "label": true,
      "text": " {\"smallUrl\":\"https:\\/\\/www.wikihow.com\\/images\\/thumb\\/4\\/4c\\/Enable-Automatic-Updates-Step-5Bullet2.jpg\\/v4-460px-Enable-Automatic-Updates-Step-5Bullet2.jpg\",\"bigUrl\":\"\\/images\\/thumb\\/4\\/4c\\/Enable-Automatic-Updates-Step-5Bullet2.jpg\\/aid1480351-v4-728px-Enable-Automatic-Updates-Step-5Bullet2.jpg\",\"smallWidth\":460,\"smallHeight\":345,\"bigWidth\":\"728\",\"bigHeight\":\"546\",\"licensing\":\"<div",
      "id": "874e2014361046e6988229dd17674847"
    },
    "9d5510fa41a24f3aa77ccb3e2f37c366": {
      "label": true,
      "text": " {\"smallUrl\":\"https:\\/\\/www.wikihow.com\\/images\\/thumb\\/e\\/ee\\/Enable-Automatic-Updates-Step-5Bullet4.jpg\\/v4-460px-Enable-Automatic-Updates-Step-5Bullet4.jpg\",\"bigUrl\":\"\\/images\\/thumb\\/e\\/ee\\/Enable-Automatic-Updates-Step-5Bullet4.jpg\\/aid1480351-v4-728px-Enable-Automatic-Updates-Step-5Bullet4.jpg\",\"smallWidth\":460,\"smallHeight\":345,\"bigWidth\":\"728\",\"bigHeight\":\"546\",\"licensing\":\"<div",
      "id": "9d5510fa41a24f3aa77ccb3e2f37c366"
    },
    "4bf8a3da04e040b58b2216f231b23272": {
      "label": false,
      "text": "want to edit the size of the original image. Save the image as a separate copy. Add \"thumbnail\" or something similar to the end of the copy of the image (i.e. weddingphoto_thumbnail.jpg). Use the following steps to create a copy of the image: Click File Click Save As. Type a name for the image next to \"Filename\". Click Save. It's in the upper-left corner above the box labeled \"Image\". It's at",
      "id": "4bf8a3da04e040b58b2216f231b23272"
    },
    "04c7fc7da2a44936accb79f9abc65a94": {
      "label": false,
      "text": "Two-thirds of Americans say they aren\u2019t spending as much as they could each month, according to the poll released Tuesday by Bankrate.com. And they may keep spending down even as they enter the holiday season, says Greg McBride, chief financial analyst for Bankrate.com. \u201cPeople just don\u2019t have a lot of extra money to throw around,\u201d he says.",
      "id": "04c7fc7da2a44936accb79f9abc65a94"
    },
    "46f6088066c24694b5448648160540f1": {
      "label": false,
      "text": "Update 1 : I have discovered problems with logging in and with the display of the CAPTCHA images when using FireFox 3.0.1 browser . These are currently being investigated . Update 2 : The problems noted above seem to be fixed .\n\nGenerate a question about the above context.",
      "id": "46f6088066c24694b5448648160540f1"
    },
    "1d44b65a89214a8eaaa305d18c5d13e9": {
      "label": false,
      "text": "One bright note: Young people are saving more. Millennials were most likely to cite the need to save as their main reason for cutting down spending. Younger consumers, who are also less likely to own a credit card, and may be overwhelmed by student loans, recognize the importance of having emergency savings. \u201cMany of them are building a solid financial foundation,\u201d McBride says.",
      "id": "1d44b65a89214a8eaaa305d18c5d13e9"
    },
    "490ef080fa174d938dba33adc155fd4c": {
      "label": true,
      "text": "of time. {\"smallUrl\":\"https:\\/\\/www.wikihow.com\\/images\\/thumb\\/b\\/bd\\/Enable-Automatic-Updates-Step-5Bullet1.jpg\\/v4-460px-Enable-Automatic-Updates-Step-5Bullet1.jpg\",\"bigUrl\":\"\\/images\\/thumb\\/b\\/bd\\/Enable-Automatic-Updates-Step-5Bullet1.jpg\\/aid1480351-v4-728px-Enable-Automatic-Updates-Step-5Bullet1.jpg\",\"smallWidth\":460,\"smallHeight\":345,\"bigWidth\":\"728\",\"bigHeight\":\"546\",\"licensing\":\"<div",
      "id": "490ef080fa174d938dba33adc155fd4c"
    },
    "8c39537e3827468fa16039966fbb2fec": {
      "label": true,
      "text": "VALUE=\"0\"><PARAM NAME=\"DisplayForeColor\" VALUE=\"16777215\"><PARAM NAME=\"DisplayMode\" VALUE=\"0\"><PARAM NAME=\"DisplaySize\" VALUE=\"4\"><PARAM NAME=\"Enabled\" VALUE=\"-1\"><PARAM NAME=\"EnableContextMenu\" VALUE=\"-1\"><PARAM NAME=\"EnablePositionControls\" VALUE=\"-1\"><PARAM NAME=\"EnableFullScreenControls\" VA!",
      "id": "8c39537e3827468fa16039966fbb2fec"
    },
    "23bdbe40c81e45b487d653f936459bdf": {
      "label": true,
      "text": "<font class=\"smalltext\" color=\"#999966\">11/7 -- HOT JOBS </font><br />\n<font class=\"itext\"> <A HREF=\"http://www.businessweek.com/careers/content/nov2001/ca2001117_9208.htm?c=bwinsidernov09&n=link45&t=email\">\n<B>A Surge of Civic-Mindedness</B></a></font> <br />\n<font class=\"itext\">September 11 and a bad economy have folks flocking to find work in the government ",
      "id": "23bdbe40c81e45b487d653f936459bdf"
    },
    "a2d7ac3aa9054cc193afb92aaaf2db10": {
      "label": true,
      "text": "howGotoBar\" VALUE=\"0\"><PARAM NAME=\"ShowPositionControls\" VALUE=\"-1\"><PARAM NAME=\"ShowStatusBar\" VALUE=\"0\"><PARAM NAME=\"ShowTracker\" VALUE=\"-1\"><PARAM NAME=\"TransparentAtStart\" VALUE=\"0\"><PARAM NAME=\"VideoBorderWidth\" VALUE=\"0\"><PARAM NAME=\"VideoBorderColor\" VALUE=\"0\"><PARAM NAME=\"VideoBorder3D\" VALUE=\"0\"><PARAM NAME=\"Volume\" VALUE=\"-260\"><PARAM NAME=\"WindowlessVideo\" VALUE=\"0\">",
      "id": "a2d7ac3aa9054cc193afb92aaaf2db10"
    },
    "064bcd09ae674b3a93a8a28b2f2d776a": {
      "label": true,
      "text": "NAME=\"SendMouseClickEvents\" VALUE=\"0\"><PARAM NAME=\"SendMouseMoveEvents\" VALUE=\"0\"><PARAM NAME=\"SendPlayStateChangeEvents\" VALUE=\"-1\"><PARAM NAME=\"ShowCaptioning\" VALUE=\"0\"><PARAM NAME=\"ShowControls\" VALUE=\"0\"><PARAM NAME=\"ShowAudioControls\" VALUE=\"-1\"><PARAM NAME=\"ShowDisplay\" VALUE=\"0\"><PARAM NAME=\"S!",
      "id": "064bcd09ae674b3a93a8a28b2f2d776a"
    },
    "3964174632e949bba35991a1ad18cd66": {
      "label": false,
      "text": "1) Through Terminal Server (the one that opens up a desktop within a desktop and it actually appears that you have two Start buttons). \nor\n2) By opening the Stack Manager directly from your \"native\" desktop.\n\nPlease reply to this message with either a corresponding \"1\" or a \"2\" in the subject field.",
      "id": "3964174632e949bba35991a1ad18cd66"
    },
    "f79e3b6f46444f05ac04fc2661f2fc23": {
      "label": true,
      "text": "```\nimport React, { useState } from \"react\";\n\nfunction Ball() {\n  const [position, setPosition] = useState({ x: 0, y: 0 });\n\n  const handleMouseMove = (event) => {\n    setPosition({ x: event.clientX, y: event.clientY });\n  };",
      "id": "f79e3b6f46444f05ac04fc2661f2fc23"
    },
    "2e71a1fa193a42e0b04c399666939997": {
      "label": true,
      "text": "7. Por \u00faltimo, itera a trav\u00e9s de los resultados de b\u00fasqueda y muestra el nombre del canal y una miniatura para cada uno:\n\nreturn (\n  <div>\n    <form>\n      <input type=\"text\" value={query} onChange={handleInputChange} />\n      <button type=\"button\" onClick={searchTwitch}>Search</button>\n    </form>",
      "id": "2e71a1fa193a42e0b04c399666939997"
    },
    "3c5fdf9f6a194342a83f59127257ad56": {
      "label": false,
      "text": "the Eastern Roman Empire, or the Byzantine Empire, would continue to survive and even flourish for many centuries, the fall of the Western Roman Empire in 476 CE marked a dramatic turning point in world history. The legacy of the Roman Empire endures in the many aspects of modern society that have been influenced by its achievements, from language and law to art and architecture. The Latin",
      "id": "3c5fdf9f6a194342a83f59127257ad56"
    },
    "60a6019fa6da495aa395d80568867fdd": {
      "label": true,
      "text": "import React, { useState } from \"react\";\n\nfunction BallFollowingMouse() {\n  const [position, setPosition] = useState({ x: 0, y: 0 });\n\n  function handleMouseMove(event) {\n    setPosition({ x: event.clientX, y: event.clientY });\n  }",
      "id": "60a6019fa6da495aa395d80568867fdd"
    },
    "24a37a8d1bde4548807c27fa75a611c2": {
      "label": true,
      "text": "```js\nwhile (condicion) {\n  // sentencia(s);\n}\n```",
      "id": "24a37a8d1bde4548807c27fa75a611c2"
    },
    "dd489a1ec0d0444f894c57d2f0f30fc1": {
      "label": true,
      "text": "```\nconst btn = document.getElementById(\"showFormBtn\");\nconst form = document.getElementById(\"form\");\n\nbtn.addEventListener(\"click\", function() {\n  if (form.style.display === \"block\") {\n    form.style.display = \"none\";\n  } else {\n    form.style.display = \"block\";\n    form.elements[0].focus();\n  }\n});\n```",
      "id": "dd489a1ec0d0444f894c57d2f0f30fc1"
    },
    "1fd93665d3ea44fca509521c179ac54b": {
      "label": false,
      "text": "\u041f\u043e\u0441\u043b\u0435 \u0443\u0441\u0442\u0430\u043d\u043e\u0432\u043a\u0438 \u0431\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0438 \u0432\u044b \u043c\u043e\u0436\u0435\u0442\u0435 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0435\u0435 \u0432 \u0441\u0432\u043e\u0435\u043c \u043f\u0440\u043e\u0435\u043a\u0442\u0435. \u0411\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0430 FFT.js \u043f\u0440\u0435\u0434\u043b\u0430\u0433\u0430\u0435\u0442 \u043d\u0430\u0431\u043e\u0440 \u0444\u0443\u043d\u043a\u0446\u0438\u0439, \u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u043c\u043e\u0436\u043d\u043e \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0434\u043b\u044f \u0432\u044b\u043f\u043e\u043b\u043d\u0435\u043d\u0438\u044f \u0424\u0443\u0440\u044c\u0435-\u0430\u043d\u0430\u043b\u0438\u0437\u0430 \u043d\u0430 \u043e\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u043d\u043e\u043c \u043d\u0430\u0431\u043e\u0440\u0435 \u0434\u0430\u043d\u043d\u044b\u0445. \u0424\u0443\u043d\u043a\u0446\u0438\u0438 fft \u0438 ifft - \u044d\u0442\u043e \u0434\u0432\u0435 \u043a\u043b\u044e\u0447\u0435\u0432\u044b\u0435 \u0444\u0443\u043d\u043a\u0446\u0438\u0438, \u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u0432\u0430\u043c \u043d\u0443\u0436\u043d\u043e \u0431\u0443\u0434\u0435\u0442 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c. \u041d\u0430 \u0432\u0445\u043e\u0434 \u0444\u0443\u043d\u043a\u0446\u0438\u0438 fft \u043f\u0435\u0440\u0435\u0434\u0430\u0435\u0442\u0441\u044f \u043c\u0430\u0441\u0441\u0438\u0432 \u043a\u043e\u043c\u043f\u043b\u0435\u043a\u0441\u043d\u044b\u0445 \u0447\u0438\u0441\u0435\u043b, \u0438 \u043e\u043d\u0430 \u0432\u043e\u0437\u0432\u0440\u0430\u0449\u0430\u0435\u0442 \u043c\u0430\u0441\u0441\u0438\u0432 \u043a\u043e\u043c\u043f\u043b\u0435\u043a\u0441\u043d\u044b\u0445 \u0447\u0438\u0441\u0435\u043b, \u043a\u043e\u0442\u043e\u0440\u044b\u0439",
      "id": "1fd93665d3ea44fca509521c179ac54b"
    },
    "0362a26ed9024f6d9fb39dfa39e113b0": {
      "label": true,
      "text": "const calculadora=document.getElementById('calculadora')const resultado=document.getElementById('resultado') calculadora.addEventListener('click',a\u00f1adirNumeros) let operaciones=[] function",
      "id": "0362a26ed9024f6d9fb39dfa39e113b0"
    },
    "20d9a45df83e4ad1a092ea00e4d2204a": {
      "label": true,
      "text": ".calculator-keys button[value=\"calculate\"] {\n  grid-column: 3/5;\n}\nAgregue la funcionalidad de la calculadora utilizando JavaScript. Debe agregar un controlador de eventos para cada bot\u00f3n de la calculadora y escribir la l\u00f3gica para realizar los c\u00e1lculos. Por ejemplo:\nvbnet",
      "id": "20d9a45df83e4ad1a092ea00e4d2204a"
    },
    "dff7dec5c01d4fc185f633dbe3c32b60": {
      "label": true,
      "text": "const calculator = document.querySelector('.calculator');\nconst keys = calculator.querySelector('.calculator-keys');\nconst screen = calculator.querySelector('.calculator-screen');\n\nkeys.addEventListener('click', event => {\n  if (!event.target.matches('button')) {\n    return;\n  }\n\n  const key = event.target;\n  const keyValue = key.value;\n  const displayValue = screen.textContent;",
      "id": "dff7dec5c01d4fc185f633dbe3c32b60"
    },
    "53d4066c6d704bd79f2d531a1f9562f4": {
      "label": true,
      "text": "\n  if (!phoneRegex.test(phoneValue)) {return inputError(\"phone\")}\n\n  if (passwordValue !== passwordConfirmValue) {\n    inputError(\"password\");\n    return inputError(\"confirm-password\");\n  }\n\n\n});\n\nfunction inputError(inputName) {\n  form.elements[inputName].style.border = \"1px solid red\";\n}\n```",
      "id": "53d4066c6d704bd79f2d531a1f9562f4"
    },
    "59f75053fff04c879d1aacfb7bd8b51b": {
      "label": true,
      "text": "const App = () => {\n  const [elements1, setElements1] = useState([\n    { id: 1, text: 'Element 1' },\n    { id: 2, text: 'Element 2' },\n    { id: 3, text: 'Element 3' },\n  ]);\n  const [elements2, setElements2] = useState([\n    { id: 4, text: 'Element 4' },\n    { id: 5, text: 'Element 5' },\n    { id: 6, text: 'Element 6' },\n  ]);",
      "id": "59f75053fff04c879d1aacfb7bd8b51b"
    },
    "cda55f47106b4a40b49bf9b8722cd7f1": {
      "label": true,
      "text": "```js\nfetch(\"/robots.txt\")\n  .then(response => {\n    return response.text(); // Devuelve una promesa\n  })\n  .then(data => {\n    console.log(data);\n  })\n  .catch(error => { /* C\u00f3digo a realizar cuando se rechaza la promesa */ });\n```",
      "id": "cda55f47106b4a40b49bf9b8722cd7f1"
    },
    "397f7180a5af495a874646051f8bc248": {
      "label": true,
      "text": "  const handleButtonClick = async () => {\n    try {\n      const response = await axios.get(`https://example-api.com/${inputValue}`);\n      setResult(response.data);\n    } catch (error) {\n      console.error(error);\n    }\n  };",
      "id": "397f7180a5af495a874646051f8bc248"
    },
    "4439280d86e644da8c5a10277ba7777a": {
      "label": true,
      "text": "      client.send(message);\n    }\n  });\n});\n```",
      "id": "4439280d86e644da8c5a10277ba7777a"
    },
    "e54d850ca2cb41c5a869936baad2163f": {
      "label": true,
      "text": "```javascript\nconst axios = require('axios');\nconst cheerio = require('cheerio');\n\nconst getData = async () => {\n  const response = await axios.get('https://www.realestate.com.au/rent/in-sydney,+nsw/list-1');\n  const $ = cheerio.load(response.data);\n  \n  const properties = [];\n\n  $('.listing-result').each((i, el) => {\n    const property = {};",
      "id": "e54d850ca2cb41c5a869936baad2163f"
    },
    "ae64416d43bc4a61b2783d1b7067b9de": {
      "label": true,
      "text": "const App = () => {\n  const [list1, setList1] = useState([\n    { id: 1, text: \"Item 1\" },\n    { id: 2, text: \"Item 2\" },\n    { id: 3, text: \"Item 3\" }\n  ]);\n  const [list2, setList2] = useState([\n    { id: 4, text: \"Item 4\" },\n    { id: 5, text: \"Item 5\" }\n  ]);\n\n  const onDragStart = (event, source) => {\n    event.dataTransfer.setData(\"source\", source);\n  };",
      "id": "ae64416d43bc4a61b2783d1b7067b9de"
    },
    "5b6b91512f014660bc48d8527b3ca226": {
      "label": true,
      "text": "```javascript\nimport React, { useState } from 'react';\n\nconst App = () => {\n  const [items1, setItems1] = useState(['Item 1', 'Item 2', 'Item 3']);\n  const [items2, setItems2] = useState([]);",
      "id": "5b6b91512f014660bc48d8527b3ca226"
    },
    "c751dfc325414603a16ac65e2fa62fd7": {
      "label": false,
      "text": "Detailed Instructions: In this task, you are given an input list. A list contains several comma-separated items written within brackets. You need to return the position of all the alphabetical elements in the given list in order. Assume the position of the 1st element to be 1. Return -1 if no alphabetical element is in the list.\nQ: ['g', '7171', 'v', 'i', 'f', 'c']\nA:",
      "id": "c751dfc325414603a16ac65e2fa62fd7"
    },
    "0310318744f1425e88cdd053773681ca": {
      "label": false,
      "text": "In this task, you are given an input list. A list contains several comma-separated items written within brackets. You need to return the position of all the alphabetical elements in the given list in order. Assume the position of the 1st element to be 1. Return -1 if no alphabetical element is in the list.",
      "id": "0310318744f1425e88cdd053773681ca"
    },
    "8d6eb8f1aac745359540c9f454aa9a58": {
      "label": false,
      "text": "not how you reconcile with your estranged wife,\u201d wrote Vulture.",
      "id": "8d6eb8f1aac745359540c9f454aa9a58"
    }
  },
  "version": 62,
  "description": "Source code for a programming language."
}
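Each concept file above follows the same JSON shape: a data map of labeled text examples keyed by id, plus a version counter and a human-readable description. As a quick orientation aid (not part of the commit), here is a minimal Python sketch that loads one of these files and tallies its labels; the relative path assumes the repository root as the working directory.

import json

# Load a concept file; booleans in `label` mark positive vs. negative examples.
with open('lilac/concepts/source-code/concept.json') as f:
  concept = json.load(f)

labels = [example['label'] for example in concept['data'].values()]
print(f"{concept['namespace']}/{concept['concept_name']} v{concept['version']}: "
      f"{sum(labels)} positive, {len(labels) - sum(labels)} negative examples")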
lilac/concepts/toxicity/concept.json
ADDED
The diff for this file is too large to render.
lilac/config.py
ADDED
@@ -0,0 +1,268 @@
"""Configurations for a dataset run."""

import json
import pathlib
from typing import TYPE_CHECKING, Any, Optional, Union

import yaml

if TYPE_CHECKING:
  from pydantic.typing import AbstractSetIntStr, MappingIntStrAny

from pydantic import BaseModel, Extra, ValidationError, validator

from .schema import Path, PathTuple, normalize_path
from .signal import Signal, TextEmbeddingSignal, get_signal_by_type, resolve_signal
from .sources.source import Source
from .sources.source_registry import resolve_source

CONFIG_FILENAME = 'config.yml'


def _serializable_path(path: PathTuple) -> Union[str, list]:
  if len(path) == 1:
    return path[0]
  return list(path)


class SignalConfig(BaseModel):
  """Configures a signal on a source path."""
  path: PathTuple
  signal: Signal

  class Config:
    extra = Extra.forbid

  @validator('path', pre=True)
  def parse_path(cls, path: Path) -> PathTuple:
    """Parse a path."""
    return normalize_path(path)

  @validator('signal', pre=True)
  def parse_signal(cls, signal: dict) -> Signal:
    """Parse a signal to its specific subclass instance."""
    return resolve_signal(signal)

  def dict(
      self,
      *,
      include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
      exclude: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
      by_alias: bool = False,
      skip_defaults: Optional[bool] = None,
      exclude_unset: bool = False,
      exclude_defaults: bool = False,
      exclude_none: bool = False,
  ) -> dict[str, Any]:
    """Override the default dict method to simplify the path tuples.

    This is required to remove the python-specific tuple dump in the yaml file.
    """
    res = super().dict(
      include=include,
      exclude=exclude,
      by_alias=by_alias,
      skip_defaults=skip_defaults,
      exclude_unset=exclude_unset,
      exclude_defaults=exclude_defaults,
      exclude_none=exclude_none)
    res['path'] = _serializable_path(res['path'])
    return res


class EmbeddingConfig(BaseModel):
  """Configures an embedding on a source path."""
  path: PathTuple
  embedding: str

  class Config:
    extra = Extra.forbid

  def dict(
      self,
      *,
      include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
      exclude: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
      by_alias: bool = False,
      skip_defaults: Optional[bool] = None,
      exclude_unset: bool = False,
      exclude_defaults: bool = False,
      exclude_none: bool = False,
  ) -> dict[str, Any]:
    """Override the default dict method to simplify the path tuples.

    This is required to remove the python-specific tuple dump in the yaml file.
    """
    res = super().dict(
      include=include,
      exclude=exclude,
      by_alias=by_alias,
      skip_defaults=skip_defaults,
      exclude_unset=exclude_unset,
      exclude_defaults=exclude_defaults,
      exclude_none=exclude_none)
    res['path'] = _serializable_path(res['path'])
    return res

  @validator('path', pre=True)
  def parse_path(cls, path: Path) -> PathTuple:
    """Parse a path."""
    return normalize_path(path)

  @validator('embedding', pre=True)
  def validate_embedding(cls, embedding: str) -> str:
    """Validate the embedding is registered."""
    get_signal_by_type(embedding, TextEmbeddingSignal)
    return embedding


class DatasetUISettings(BaseModel):
  """The UI persistent settings for a dataset."""
  media_paths: list[PathTuple] = []
  markdown_paths: list[PathTuple] = []

  class Config:
    extra = Extra.forbid

  @validator('media_paths', pre=True)
  def parse_media_paths(cls, media_paths: list) -> list:
    """Parse a path, ensuring it is a tuple."""
    return [normalize_path(path) for path in media_paths]

  def dict(
      self,
      *,
      include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
      exclude: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
      by_alias: bool = False,
      skip_defaults: Optional[bool] = None,
      exclude_unset: bool = False,
      exclude_defaults: bool = False,
      exclude_none: bool = False,
  ) -> dict[str, Any]:
    """Override the default dict method to simplify the path tuples.

    This is required to remove the python-specific tuple dump in the yaml file.
    """
    # TODO(nsthorat): Migrate this to @field_serializer when we upgrade to pydantic v2.
    res = super().dict(
      include=include,
      exclude=exclude,
      by_alias=by_alias,
      skip_defaults=skip_defaults,
      exclude_unset=exclude_unset,
      exclude_defaults=exclude_defaults,
      exclude_none=exclude_none)
    if 'media_paths' in res:
      res['media_paths'] = [_serializable_path(path) for path in res['media_paths']]
    if 'markdown_paths' in res:
      res['markdown_paths'] = [_serializable_path(path) for path in res['markdown_paths']]
    return res


class DatasetSettings(BaseModel):
  """The persistent settings for a dataset."""
  ui: Optional[DatasetUISettings] = None
  preferred_embedding: Optional[str] = None

  class Config:
    extra = Extra.forbid


class DatasetConfig(BaseModel):
  """Configures a dataset with a source and transformations."""
  # The namespace and name of the dataset.
  namespace: str
  name: str
  # Tags to organize datasets.
  tags: list[str] = []

  # The source configuration.
  source: Source

  # Model configuration: embeddings and signals on paths.
  embeddings: list[EmbeddingConfig] = []
  # When defined, uses this list of signals instead of running all signals.
  signals: list[SignalConfig] = []

  # Dataset settings, default embeddings and UI settings like media paths.
  settings: Optional[DatasetSettings] = None

  class Config:
    extra = Extra.forbid

  @validator('source', pre=True)
  def parse_source(cls, source: dict) -> Source:
    """Parse a source to its specific subclass instance."""
    return resolve_source(source)


class Config(BaseModel):
  """Configures a set of datasets for a lilac instance."""
  datasets: list[DatasetConfig]

  # When defined, uses this list of signals to run over every dataset, over all media paths, unless
  # signals is overridden by a specific dataset.
  signals: list[Signal] = []

  # A list of embeddings to compute the model caches for, for all concepts.
  concept_model_cache_embeddings: list[str] = []

  class Config:
    extra = Extra.forbid

  @validator('signals', pre=True)
  def parse_signal(cls, signals: list[dict]) -> list[Signal]:
    """Parse a list of signals to their specific subclass instances."""
    return [resolve_signal(signal) for signal in signals]


def read_config(config_path: str) -> Config:
  """Reads a config file.

  The config file can either be a `Config` or a `DatasetConfig`.

  The result is always a `Config` object. If the input is a `DatasetConfig`, the config will just
  contain a single dataset.
  """
  config_ext = pathlib.Path(config_path).suffix
  if config_ext in ['.yml', '.yaml']:
    with open(config_path, 'r') as f:
      config_dict = yaml.safe_load(f)
  elif config_ext in ['.json']:
    with open(config_path, 'r') as f:
      config_dict = json.load(f)
  else:
    raise ValueError(f'Unsupported config file extension: {config_ext}')

  config: Optional[Config] = None
  is_config = True
  try:
    config = Config(**config_dict)
  except ValidationError:
    is_config = False

  if not is_config:
    try:
      dataset_config = DatasetConfig(**config_dict)
      config = Config(datasets=[dataset_config])
    except ValidationError as error:
      raise ValidationError(
        'Config is not a valid `Config` or `DatasetConfig`', model=DatasetConfig) from error
  assert config is not None

  return config


class LilacHuggingFaceDataset(BaseModel):
  """A huggingface dataset that powers the demo."""
  hf_dataset_repo_id: str
  lilac_namespace: str
  lilac_name: str


class DemoConfig(BaseModel):
  """Configures a hosted demo."""

  # A list of huggingface dataset repositories that power the demo.
  lilac_hf_datasets: list[LilacHuggingFaceDataset] = []
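For orientation (not part of the commit), a minimal sketch of how read_config above is meant to be used. The config.yml filename is an assumption; per the docstring, the file may hold either a full Config or a single DatasetConfig, and a Config comes back either way.

from lilac.config import read_config

# Parse the config; a bare DatasetConfig is wrapped into a one-dataset Config.
config = read_config('config.yml')
for dataset in config.datasets:
  print(dataset.namespace, dataset.name, [e.embedding for e in dataset.embeddings])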
lilac/conftest.py
ADDED
@@ -0,0 +1,28 @@
"""Fixtures for dataset tests."""
import os
import pathlib
from typing import Generator, Optional, Type

import pytest
from pytest_mock import MockerFixture

from .data.dataset import Dataset
from .data.dataset_duckdb import DatasetDuckDB
from .data.dataset_test_utils import make_dataset
from .db_manager import set_default_dataset_cls
from .schema import Item, Schema


@pytest.fixture(scope='function', params=[DatasetDuckDB])
def make_test_data(tmp_path: pathlib.Path, mocker: MockerFixture,
                   request: pytest.FixtureRequest) -> Generator:
  """A pytest fixture for creating temporary test datasets."""
  mocker.patch.dict(os.environ, {'LILAC_DATA_PATH': str(tmp_path)})
  dataset_cls: Type[Dataset] = request.param
  set_default_dataset_cls(dataset_cls)

  def _make_test_data(items: list[Item], schema: Optional[Schema] = None) -> Dataset:
    return make_dataset(dataset_cls, tmp_path, items, schema)

  # Return the factory for datasets that test methods can use.
  yield _make_test_data
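A hypothetical test sketch (not part of the commit) showing how the make_test_data fixture above is consumed; the row contents are invented, and only the factory contract defined in the fixture is relied on.

def test_make_test_data(make_test_data) -> None:
  # The fixture yields a factory; calling it writes a temporary dataset
  # under the patched LILAC_DATA_PATH and returns a Dataset instance.
  dataset = make_test_data([{'text': 'hello'}, {'text': 'world'}])
  assert dataset is not None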
lilac/data/__init__.py
ADDED
@@ -0,0 +1,25 @@
from .dataset import (
  BinaryOp,
  Column,
  ConceptSearch,
  Dataset,
  Filter,
  FilterLike,
  KeywordSearch,
  ListOp,
  SemanticSearch,
  UnaryOp,
)

__all__ = [
  'Column',
  'KeywordSearch',
  'ConceptSearch',
  'SemanticSearch',
  'Filter',
  'UnaryOp',
  'BinaryOp',
  'ListOp',
  'Dataset',
  'FilterLike',
]
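# These re-exports let downstream code pull the query primitives from one place, e.g.
# `from lilac.data import Dataset, Filter, KeywordSearch` (assuming the package is installed
# as `lilac`).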
lilac/data/dataset.py
ADDED
@@ -0,0 +1,510 @@
"""The interface for the database."""
from __future__ import annotations

import abc
import enum
import pathlib
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from typing import Any, Iterator, Literal, Optional, Sequence, Union

import pandas as pd
from pydantic import (
  BaseModel,
  StrictBool,
  StrictBytes,
  StrictFloat,
  StrictInt,
  StrictStr,
  validator,
)
from typing_extensions import TypeAlias

from lilac.signals.concept_scorer import ConceptSignal

from ..auth import UserInfo
from ..config import DatasetConfig, DatasetSettings, DatasetUISettings
from ..schema import (
  PATH_WILDCARD,
  ROWID,
  VALUE_KEY,
  Bin,
  DataType,
  Path,
  PathTuple,
  Schema,
  normalize_path,
)
from ..signal import Signal, TextEmbeddingSignal, get_signal_by_type, resolve_signal
from ..tasks import TaskStepId

# Threshold for rejecting certain queries (e.g. group by) for columns with large cardinality.
TOO_MANY_DISTINCT = 500_000
SAMPLE_AVG_TEXT_LENGTH = 1000
MAX_TEXT_LEN_DISTINCT_COUNT = 250


class SelectRowsResult:
  """The result of a select rows query."""

  def __init__(self, df: pd.DataFrame, total_num_rows: int) -> None:
    """Initialize the result."""
    self._df = df
    self.total_num_rows = total_num_rows

  def __iter__(self) -> Iterator:
    return (row.to_dict() for _, row in self._df.iterrows())

  def df(self) -> pd.DataFrame:
    """Convert the result to a pandas DataFrame."""
    return self._df


class StatsResult(BaseModel):
  """The result of a stats() query."""
  path: PathTuple
  # The number of leaf values.
  total_count: int
  # The approximate number of distinct leaf values.
  approx_count_distinct: int

  # Defined for ordinal features.
  min_val: Optional[Union[float, datetime]] = None
  max_val: Optional[Union[float, datetime]] = None

  # Defined for text features.
  avg_text_length: Optional[float] = None


class MediaResult(BaseModel):
  """The result of a media() query."""
  data: bytes


BinaryOp = Literal['equals', 'not_equal', 'greater', 'greater_equal', 'less', 'less_equal']
UnaryOp = Literal['exists']
ListOp = Literal['in']

BINARY_OPS = set(['equals', 'not_equal', 'greater', 'greater_equal', 'less', 'less_equal'])
UNARY_OPS = set(['exists'])
LIST_OPS = set(['in'])

SearchType = Union[Literal['keyword'], Literal['semantic'], Literal['concept']]


class SortOrder(str, enum.Enum):
  """The sort order for a database query."""
  DESC = 'DESC'
  ASC = 'ASC'


class GroupsSortBy(str, enum.Enum):
  """The sort for groups queries.

  Either "count", which sorts by the count of the feature value, or "value", which sorts by the
  feature value itself.
  """
  COUNT = 'count'
  VALUE = 'value'


class SortResult(BaseModel):
  """The information about what is sorted after combining searches and explicit sorts."""
  # The column that was sorted.
  path: PathTuple
  # The sort order.
  order: SortOrder
  # The alias of the column if it was aliased.
  alias: Optional[str] = None
  # The search index if the sort is by a search.
  search_index: Optional[int] = None


class SearchResultInfo(BaseModel):
  """The resulting sort order returned by the select rows schema."""
  # The input path to the search.
  search_path: PathTuple
  # The resulting column that was searched.
  result_path: PathTuple
  # The alias of the UDF.
  alias: Optional[str] = None


class SelectRowsSchemaUDF(BaseModel):
  """The UDF for a select rows schema query."""
  path: PathTuple
  alias: Optional[str] = None


class SelectRowsSchemaResult(BaseModel):
  """The result of a select rows schema query."""
  data_schema: Schema
  udfs: list[SelectRowsSchemaUDF] = []
  search_results: list[SearchResultInfo] = []
  sorts: Optional[list[SortResult]] = None


class Column(BaseModel):
  """A column in the dataset."""
  path: PathTuple
  alias: Optional[str] = None  # This is the renamed column during querying and response.

  # Defined when the feature is another column.
  signal_udf: Optional[Signal] = None

  class Config:
    smart_union = True

  def __init__(self,
               path: Path,
               alias: Optional[str] = None,
               signal_udf: Optional[Signal] = None,
               **kwargs: Any):
    """Initialize a column. We override __init__ to allow positional arguments for brevity."""
    super().__init__(path=normalize_path(path), alias=alias, signal_udf=signal_udf, **kwargs)

  @validator('signal_udf', pre=True)
  def parse_signal_udf(cls, signal_udf: Optional[dict]) -> Optional[Signal]:
    """Parse a signal to its specific subclass instance."""
    if not signal_udf:
      return None
    return resolve_signal(signal_udf)


ColumnId = Union[Path, Column]


class DatasetManifest(BaseModel):
  """The manifest for a dataset."""
  namespace: str
  dataset_name: str
  data_schema: Schema
  # Number of items in the dataset.
  num_items: int


def column_from_identifier(column: ColumnId) -> Column:
  """Create a column from a column identifier."""
  if isinstance(column, Column):
    return column.copy()
  return Column(path=column)


FeatureValue = Union[StrictInt, StrictFloat, StrictBool, StrictStr, StrictBytes, datetime]
FeatureListValue = list[StrictStr]
BinaryFilterTuple = tuple[Path, BinaryOp, FeatureValue]
ListFilterTuple = tuple[Path, ListOp, FeatureListValue]
UnaryFilterTuple = tuple[Path, UnaryOp]

FilterOp = Union[BinaryOp, UnaryOp, ListOp]


class SelectGroupsResult(BaseModel):
  """The result of a select groups query."""
  too_many_distinct: bool
  counts: list[tuple[Optional[FeatureValue], int]]
  bins: Optional[list[Bin]] = None


class Filter(BaseModel):
  """A filter on a column."""
  path: PathTuple
  op: FilterOp
  value: Optional[Union[FeatureValue, FeatureListValue]] = None


FilterLike: TypeAlias = Union[Filter, BinaryFilterTuple, UnaryFilterTuple, ListFilterTuple]
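# Illustrative `FilterLike` values (made-up paths and values); the tuple forms are shorthand
# for the `Filter` model above:
#   ('age', 'greater', 21)                          # BinaryFilterTuple
#   (('metadata', 'source'), 'in', ['web', 'pdf'])  # ListFilterTuple
#   ('label', 'exists')                             # UnaryFilterTuple
#   Filter(path=('age',), op='less_equal', value=65)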

SearchValue = StrictStr


class KeywordSearch(BaseModel):
  """A keyword search query on a column."""
  path: Path
  query: SearchValue
  type: Literal['keyword'] = 'keyword'


class SemanticSearch(BaseModel):
  """A semantic search on a column."""
  path: Path
  query: SearchValue
  embedding: str
  type: Literal['semantic'] = 'semantic'


class ConceptSearch(BaseModel):
  """A concept search query on a column."""
  path: Path
  concept_namespace: str
  concept_name: str
  embedding: str
  type: Literal['concept'] = 'concept'


Search = Union[ConceptSearch, SemanticSearch, KeywordSearch]
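# Illustrative `Search` values (the paths, queries, concept and embedding names are made-up
# examples):
#   KeywordSearch(path='text', query='error')
#   SemanticSearch(path='text', query='billing issues', embedding='gte-small')
#   ConceptSearch(path='text', concept_namespace='lilac', concept_name='toxicity',
#                 embedding='gte-small')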


class Dataset(abc.ABC):
  """The database implementation to query a dataset."""

  namespace: str
  dataset_name: str

  def __init__(self, namespace: str, dataset_name: str):
    """Initialize a dataset.

    Args:
      namespace: The dataset namespace.
      dataset_name: The dataset name.
    """
    self.namespace = namespace
    self.dataset_name = dataset_name

  @abc.abstractmethod
  def delete(self) -> None:
    """Deletes the dataset."""
    pass

  @abc.abstractmethod
  def manifest(self) -> DatasetManifest:
    """Return the manifest for the dataset."""
    pass

  @abc.abstractmethod
  def config(self) -> DatasetConfig:
    """Return the dataset config for this dataset."""
    pass

  @abc.abstractmethod
  def settings(self) -> DatasetSettings:
    """Return the persistent settings for the dataset."""
    pass

  @abc.abstractmethod
  def update_settings(self, settings: DatasetSettings) -> None:
    """Update the settings for the dataset."""
    pass

  @abc.abstractmethod
  def compute_signal(self,
                     signal: Signal,
                     path: Path,
                     task_step_id: Optional[TaskStepId] = None) -> None:
    """Compute a signal for a column.

    Args:
      signal: The signal to compute over the given columns.
      path: The leaf path to compute the signal on.
      task_step_id: The TaskManager `task_step_id` for this process run. This is used to update the
        progress of the task.
    """
    pass

  def compute_embedding(self,
                        embedding: str,
                        path: Path,
                        task_step_id: Optional[TaskStepId] = None) -> None:
    """Compute an embedding for a given field path."""
    signal = get_signal_by_type(embedding, TextEmbeddingSignal)()
    self.compute_signal(signal, path, task_step_id)

  def compute_concept(self,
                      namespace: str,
                      concept_name: str,
                      embedding: str,
                      path: Path,
                      task_step_id: Optional[TaskStepId] = None) -> None:
    """Compute concept scores for a given field path."""
    signal = ConceptSignal(namespace=namespace, concept_name=concept_name, embedding=embedding)
    self.compute_signal(signal, path, task_step_id)

  @abc.abstractmethod
  def delete_signal(self, signal_path: Path) -> None:
    """Delete a computed signal from the dataset.

    Args:
      signal_path: The path holding the computed data of the signal.
    """
    pass

  @abc.abstractmethod
  def select_groups(
      self,
      leaf_path: Path,
      filters: Optional[Sequence[FilterLike]] = None,
      sort_by: Optional[GroupsSortBy] = None,
      sort_order: Optional[SortOrder] = SortOrder.DESC,
      limit: Optional[int] = None,
      bins: Optional[Union[Sequence[Bin], Sequence[float]]] = None) -> SelectGroupsResult:
    """Select grouped columns to power a histogram.

    Args:
      leaf_path: The leaf path to group by. The path can be a dot-separated string path, or a tuple
        of fields.
      filters: The filters to apply to the query.
      sort_by: What to sort by, either "count" or "value".
      sort_order: The sort order.
      limit: The maximum number of rows to return.
      bins: The bins to use when bucketizing a float column.

    Returns:
      A `SelectGroupsResult` iterator where each row is a group.
    """
    raise NotImplementedError

  @abc.abstractmethod
  def select_rows(self,
                  columns: Optional[Sequence[ColumnId]] = None,
                  searches: Optional[Sequence[Search]] = None,
                  filters: Optional[Sequence[FilterLike]] = None,
                  sort_by: Optional[Sequence[Path]] = None,
                  sort_order: Optional[SortOrder] = SortOrder.DESC,
                  limit: Optional[int] = 100,
                  offset: Optional[int] = 0,
                  task_step_id: Optional[TaskStepId] = None,
                  resolve_span: bool = False,
                  combine_columns: bool = False,
                  user: Optional[UserInfo] = None) -> SelectRowsResult:
    """Select a set of rows that match the provided filters, analogous to SQL SELECT.

    Args:
      columns: The columns to select. A column is an instance of `Column` which can either
        define a path to a feature, or a column with an applied Transform, e.g. a Concept. If none,
        it selects all columns.
      searches: The searches to apply to the query.
      filters: The filters to apply to the query.
      sort_by: An ordered list of what to sort by. When defined, this is a list of aliases of column
        names defined by the "alias" field in Column. If no alias is provided for a column, an
        automatic alias is generated by combining each path element with a ".".
        For example: ('person', 'name') => person.name. For columns that are transform columns,
        an alias must be provided explicitly. When sorting by a (nested) list of values, the sort
        takes the minimum value when `sort_order` is `ASC`, and the maximum value when `sort_order`
        is `DESC`.
      sort_order: The sort order.
      limit: The maximum number of rows to return.
      offset: The offset to start returning rows from.
      task_step_id: The TaskManager `task_step_id` for this process run. This is used to update the
        progress.
      resolve_span: Whether to resolve the span of the row.
      combine_columns: Whether to combine columns into a single object. The object will be pruned
        to only include sub-fields that correspond to the requested columns.
      user: The authenticated user, if auth is enabled and the user is logged in. This is used to
        apply ACL to the query, especially for concepts.

    Returns:
      A `SelectRowsResult` iterator with rows of `Item`s.
    """
    pass

  @abc.abstractmethod
  def select_rows_schema(self,
                         columns: Optional[Sequence[ColumnId]] = None,
                         sort_by: Optional[Sequence[Path]] = None,
                         sort_order: Optional[SortOrder] = SortOrder.DESC,
                         searches: Optional[Sequence[Search]] = None,
                         combine_columns: bool = False) -> SelectRowsSchemaResult:
    """Returns the schema of the result of `select_rows` above with the same arguments."""
    pass

  @abc.abstractmethod
  def stats(self, leaf_path: Path) -> StatsResult:
    """Compute stats for a leaf path.

    Args:
      leaf_path: The leaf path to compute stats for.

    Returns:
      A StatsResult.
    """
    pass

  @abc.abstractmethod
  def media(self, item_id: str, leaf_path: Path) -> MediaResult:
    """Return the media for a leaf path.

    Args:
      item_id: The item id to get media for.
      leaf_path: The leaf path for the media.

    Returns:
      A MediaResult.
    """
    pass

  @abc.abstractmethod
  def to_json(self,
              filepath: Union[str, pathlib.Path],
              jsonl: bool = True,
              columns: Optional[Sequence[ColumnId]] = None) -> None:
    """Export the dataset to a JSON file.

    Args:
      filepath: The path to the file to export to.
      jsonl: Whether to export to JSONL or JSON.
      columns: The columns to export.
    """
    pass

  @abc.abstractmethod
  def to_pandas(self, columns: Optional[Sequence[ColumnId]] = None) -> pd.DataFrame:
    """Export the dataset to a pandas DataFrame.

    Args:
      columns: The columns to export.
    """
    pass

  @abc.abstractmethod
  def to_parquet(self,
                 filepath: Union[str, pathlib.Path],
                 columns: Optional[Sequence[ColumnId]] = None) -> None:
    """Export the dataset to a parquet file.

    Args:
      filepath: The path to the file to export to.
      columns: The columns to export.
    """
    pass

  @abc.abstractmethod
  def to_csv(self,
             filepath: Union[str, pathlib.Path],
             columns: Optional[Sequence[ColumnId]] = None) -> None:
    """Export the dataset to a csv file.

    Args:
      filepath: The path to the file to export to.
      columns: The columns to export.
    """
    pass


def default_settings(dataset: Dataset) -> DatasetSettings:
  """Gets the default settings for a dataset."""
  schema = dataset.manifest().data_schema
  leaf_paths = [
    path for path, field in schema.leafs.items()
    if field.dtype == DataType.STRING and path != (ROWID,)
  ]
  pool = ThreadPoolExecutor()
  stats: list[StatsResult] = list(pool.map(lambda leaf: dataset.stats(leaf), leaf_paths))
  sorted_stats = sorted([stat for stat in stats if stat.avg_text_length],
                        key=lambda stat: stat.avg_text_length or -1.0)
  media_paths: list[PathTuple] = []
  if sorted_stats:
    media_paths = [sorted_stats[-1].path]

  return DatasetSettings(ui=DatasetUISettings(media_paths=media_paths))


def make_parquet_id(signal: Signal,
                    source_path: PathTuple,
                    is_computed_signal: Optional[bool] = False) -> str:
  """Return a unique identifier for this parquet table."""
  # Remove the wildcards from the parquet id since they are implicit.
  path = [*[p for p in source_path if p != PATH_WILDCARD], signal.key(is_computed_signal)]
  # Don't use the VALUE_KEY as part of the parquet id to reduce the size of paths.
  if path[-1] == VALUE_KEY:
    path = path[:-1]
  return '.'.join(path)
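# A minimal usage sketch against a concrete implementation (the dataset, paths and values are
# made-up; `get_dataset` is assumed to be provided by the db_manager module):
#
#   dataset = get_dataset('local', 'movies')
#   result = dataset.select_rows(
#     columns=[('title',)],
#     filters=[('rating', 'greater_equal', 4.0)],
#     sort_by=[('rating',)],
#     sort_order=SortOrder.DESC,
#     limit=10)
#   for row in result:
#     print(row)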
lilac/data/dataset_duckdb.py
ADDED
@@ -0,0 +1,1833 @@
"""The DuckDB implementation of the dataset database."""
import functools
import gc
import glob
import math
import os
import pathlib
import re
import shutil
import threading
from typing import Any, Iterable, Iterator, Optional, Sequence, Union, cast

import duckdb
import numpy as np
import pandas as pd
import yaml
from pandas.api.types import is_object_dtype
from pydantic import BaseModel, validator
from typing_extensions import override

from ..auth import UserInfo
from ..batch_utils import deep_flatten, deep_unflatten
from ..config import CONFIG_FILENAME, DatasetConfig, DatasetSettings, EmbeddingConfig, SignalConfig
from ..embeddings.vector_store import VectorDBIndex
from ..env import data_path, env
from ..schema import (
  MANIFEST_FILENAME,
  PATH_WILDCARD,
  ROWID,
  TEXT_SPAN_END_FEATURE,
  TEXT_SPAN_START_FEATURE,
  VALUE_KEY,
  Bin,
  DataType,
  Field,
  Item,
  Path,
  PathKey,
  PathTuple,
  RichData,
  Schema,
  SourceManifest,
  column_paths_match,
  is_float,
  is_integer,
  is_ordinal,
  is_temporal,
  normalize_path,
  signal_type_supports_dtype,
)
from ..signal import Signal, TextEmbeddingSignal, VectorSignal, get_signal_by_type, resolve_signal
from ..signals.concept_labels import ConceptLabelsSignal
from ..signals.concept_scorer import ConceptSignal
from ..signals.semantic_similarity import SemanticSimilaritySignal
from ..signals.substring_search import SubstringSignal
from ..sources.source import Source
from ..tasks import TaskStepId, progress
from ..utils import DebugTimer, get_dataset_output_dir, log, open_file, to_yaml
from . import dataset
from .dataset import (
  BINARY_OPS,
  LIST_OPS,
  MAX_TEXT_LEN_DISTINCT_COUNT,
  SAMPLE_AVG_TEXT_LENGTH,
  TOO_MANY_DISTINCT,
  UNARY_OPS,
  BinaryOp,
  Column,
  ColumnId,
  Dataset,
  DatasetManifest,
  FeatureListValue,
  FeatureValue,
  Filter,
  FilterLike,
  GroupsSortBy,
  MediaResult,
  Search,
  SearchResultInfo,
  SelectGroupsResult,
  SelectRowsResult,
  SelectRowsSchemaResult,
  SelectRowsSchemaUDF,
  SortOrder,
  SortResult,
  StatsResult,
  column_from_identifier,
  make_parquet_id,
)
from .dataset_utils import (
  count_primitives,
  create_signal_schema,
  flatten_keys,
  merge_schemas,
  schema_contains_path,
  sparse_to_dense_compute,
  wrap_in_dicts,
  write_embeddings_to_disk,
  write_items_to_parquet,
)

SIGNAL_MANIFEST_FILENAME = 'signal_manifest.json'
DATASET_SETTINGS_FILENAME = 'settings.json'
SOURCE_VIEW_NAME = 'source'

NUM_AUTO_BINS = 15

BINARY_OP_TO_SQL: dict[BinaryOp, str] = {
  'equals': '=',
  'not_equal': '!=',
  'greater': '>',
  'greater_equal': '>=',
  'less': '<',
  'less_equal': '<='
}
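# Illustrative mapping (made-up path and value): a binary `Filter` such as
# Filter(path=('age',), op='greater_equal', value=21) translates to the SQL predicate
# f"age {BINARY_OP_TO_SQL['greater_equal']} 21", i.e. 'age >= 21' (the real query builder
# also escapes column names and unnests repeated paths).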


class DuckDBSearchUDF(BaseModel):
  """The transformation of searches to column UDFs."""
  udf: Column
  search_path: PathTuple
  output_path: PathTuple
  sort: Optional[tuple[PathTuple, SortOrder]] = None


class DuckDBSearchUDFs(BaseModel):
  """The transformation of searches to column UDFs with sorts."""
  udfs: list[Column]
  output_paths: list[PathTuple]
  sorts: list[tuple[PathTuple, SortOrder]]


class DatasetDuckDB(Dataset):
  """The DuckDB implementation of the dataset database."""

  def __init__(self, namespace: str, dataset_name: str, vector_store: str = 'hnsw'):
    super().__init__(namespace, dataset_name)

    self.dataset_path = get_dataset_output_dir(data_path(), namespace, dataset_name)

    # TODO: Infer the manifest from the parquet files so this is lighter weight.
    self._source_manifest = read_source_manifest(self.dataset_path)
    self._signal_manifests: list[SignalManifest] = []
    self.con = duckdb.connect(database=':memory:')

    # Maps a path and embedding to the vector index. This is lazily generated as needed.
    self._vector_indices: dict[tuple[PathKey, str], VectorDBIndex] = {}
    self.vector_store = vector_store
    self._manifest_lock = threading.Lock()

    self._config_lock = threading.Lock()
    config_filepath = get_config_filepath(namespace, dataset_name)

    if not os.path.exists(config_filepath):
      # For backwards compatibility, if the config doesn't exist, create one. This will be out of
      # sync but allow the server to still boot and update with new config changes.
      # Make a metaclass so we get a valid `Source` class.
      source_cls = type('Source_no_source', (Source,), {'name': 'no_source'})

      old_settings_filepath = os.path.join(
        get_dataset_output_dir(data_path(), namespace, dataset_name), 'settings.json')
      settings = DatasetSettings()
      if os.path.exists(old_settings_filepath):
        with open(old_settings_filepath) as f:
          settings = DatasetSettings.parse_raw(f.read())

      config = DatasetConfig(
        namespace=namespace, name=dataset_name, source=source_cls(), settings=settings)
      with open(get_config_filepath(self.namespace, self.dataset_name), 'w') as f:
        f.write(to_yaml(config.dict(exclude_none=True, exclude_defaults=True)))

    # Create a join table from all the parquet files.
    self.manifest()

  @override
  def delete(self) -> None:
    """Deletes the dataset."""
    self.con.close()
    shutil.rmtree(self.dataset_path, ignore_errors=True)

  def _create_view(self, view_name: str, files: list[str]) -> None:
    self.con.execute(f"""
      CREATE OR REPLACE VIEW {_escape_col_name(view_name)} AS (SELECT * FROM read_parquet({files}));
    """)

  # NOTE: This is cached, but when the latest mtime of any file in the dataset directory changes
  # the results are invalidated.
  @functools.cache
  def _recompute_joint_table(self, latest_mtime_micro_sec: int) -> DatasetManifest:
    del latest_mtime_micro_sec  # This is used as the cache key.
    merged_schema = self._source_manifest.data_schema.copy(deep=True)
    self._signal_manifests = []
    # Make a joined view of all the column groups.
    self._create_view(SOURCE_VIEW_NAME,
                      [os.path.join(self.dataset_path, f) for f in self._source_manifest.files])

    # Add the signal column groups.
    for root, _, files in os.walk(self.dataset_path):
      for file in files:
        if not file.endswith(SIGNAL_MANIFEST_FILENAME):
          continue

        with open_file(os.path.join(root, file)) as f:
          signal_manifest = SignalManifest.parse_raw(f.read())
        self._signal_manifests.append(signal_manifest)
        signal_files = [os.path.join(root, f) for f in signal_manifest.files]
        if signal_files:
          self._create_view(signal_manifest.parquet_id, signal_files)

    merged_schema = merge_schemas([self._source_manifest.data_schema] +
                                  [m.data_schema for m in self._signal_manifests])

    # The logic below generates the following example query:
    # CREATE OR REPLACE VIEW t AS (
    #   SELECT
    #     source.*,
    #     "parquet_id1"."root_column" AS "parquet_id1",
    #     "parquet_id2"."root_column" AS "parquet_id2"
    #   FROM source JOIN "parquet_id1" USING (rowid,) JOIN "parquet_id2" USING (rowid,)
    # );
    # NOTE: "root_column" for each signal is defined as the top-level column.
    select_sql = ', '.join([f'{SOURCE_VIEW_NAME}.*'] + [
      (f'{_escape_col_name(manifest.parquet_id)}.{_escape_col_name(_root_column(manifest))} '
       f'AS {_escape_col_name(manifest.parquet_id)}')
      for manifest in self._signal_manifests
      if manifest.files
    ])
    join_sql = ' '.join([SOURCE_VIEW_NAME] + [
      f'LEFT JOIN {_escape_col_name(manifest.parquet_id)} USING ({ROWID})'
      for manifest in self._signal_manifests
      if manifest.files
    ])
    view_or_table = 'TABLE'
    use_views = env('DUCKDB_USE_VIEWS', 0) or 0
    if int(use_views):
      view_or_table = 'VIEW'
    sql_cmd = f"""CREATE OR REPLACE {view_or_table} t AS (SELECT {select_sql} FROM {join_sql})"""
    self.con.execute(sql_cmd)

    # Get the total size of the table.
    size_query = 'SELECT COUNT() as count FROM t'
    size_query_result = cast(Any, self._query(size_query)[0])
    num_items = cast(int, size_query_result[0])

    return DatasetManifest(
      namespace=self.namespace,
      dataset_name=self.dataset_name,
      data_schema=merged_schema,
      num_items=num_items)

  @override
  def manifest(self) -> DatasetManifest:
    # Use the latest modification time of all files under the dataset path as the cache key for
    # re-computing the manifest and the joined view.
    with self._manifest_lock:
      all_dataset_files = glob.iglob(os.path.join(self.dataset_path, '**'), recursive=True)
      latest_mtime = max(map(os.path.getmtime, all_dataset_files))
      latest_mtime_micro_sec = int(latest_mtime * 1e6)
      return self._recompute_joint_table(latest_mtime_micro_sec)

  def _update_config(self,
                     settings: Optional[DatasetSettings] = None,
                     signals: Optional[list[SignalConfig]] = None,
                     embeddings: Optional[list[EmbeddingConfig]] = None) -> None:
    with self._config_lock:
      config = self.config()

      if settings is not None:
        config.settings = settings

      if signals is not None:
        # Update the config with the new signal, if the new signal has not already been added (this
        # can happen if a signal is re-computed).
        update_config = True
        for signal_config in signals:
          for existing_signal in config.signals:
            if (existing_signal.path == signal_config.path and
                existing_signal.signal.dict() == signal_config.signal.dict()):
              update_config = False
              break
          if update_config:
            config.signals.append(signal_config)

      if embeddings is not None:
        # Update the config with the new embedding, if it has not already been added (this can
        # happen if an embedding is re-computed).
        update_config = True
        for embedding_config in embeddings:
          for existing_embedding in config.embeddings:
            if (existing_embedding.path == embedding_config.path and
                existing_embedding.embedding == embedding_config.embedding):
              update_config = False
              break
          if update_config:
            config.embeddings.append(embedding_config)

      with open(get_config_filepath(self.namespace, self.dataset_name), 'w') as f:
        f.write(to_yaml(config.dict(exclude_none=True, exclude_defaults=True)))

  @override
  def config(self) -> DatasetConfig:
    config_filepath = get_config_filepath(self.namespace, self.dataset_name)
    with open(config_filepath) as f:
      return DatasetConfig(**yaml.safe_load(f))

  @override
  def settings(self) -> DatasetSettings:
    # Settings should always have a default.
    settings = self.config().settings
    assert settings is not None
    return settings

  @override
  def update_settings(self, settings: DatasetSettings) -> None:
    self._update_config(settings)

  def count(self, filters: Optional[list[FilterLike]] = None) -> int:
    """Count the number of rows."""
    raise NotImplementedError('count is not yet implemented for DuckDB.')

  def _get_vector_db_index(self, embedding: str, path: PathTuple) -> VectorDBIndex:
    # Refresh the manifest to make sure we have the latest signal manifests.
    self.manifest()
    index_key = (path, embedding)
    if index_key in self._vector_indices:
      return self._vector_indices[index_key]

    manifests = [
      m for m in self._signal_manifests
      if schema_contains_path(m.data_schema, path) and m.vector_store and m.signal.name == embedding
    ]
    if not manifests:
      raise ValueError(f'No embedding found for path {path}.')
    if len(manifests) > 1:
      raise ValueError(f'Multiple embeddings found for path {path}. Got: {manifests}')
    manifest = manifests[0]
    if not manifest.vector_store:
      raise ValueError(f'Signal manifest for path {path} is not an embedding. '
                       f'Got signal manifest: {manifest}')

    base_path = os.path.join(self.dataset_path, _signal_dir(manifest.enriched_path),
                             manifest.signal.name)
    path_id = f'{self.namespace}/{self.dataset_name}:{path}'
    with DebugTimer(f'Loading vector store "{manifest.vector_store}" for {path_id}'
                    f' with embedding "{embedding}"'):
      vector_index = VectorDBIndex(manifest.vector_store)
      vector_index.load(base_path)
    # Cache the vector index.
    self._vector_indices[index_key] = vector_index
    return vector_index

  @override
  def compute_signal(self,
                     signal: Signal,
                     path: Path,
                     task_step_id: Optional[TaskStepId] = None) -> None:
    if isinstance(signal, TextEmbeddingSignal):
      return self.compute_embedding(signal.name, path, task_step_id)
    source_path = normalize_path(path)
    manifest = self.manifest()

    if task_step_id is None:
      # Make a dummy task step so we report progress via tqdm.
      task_step_id = ('', 0)

    # The manifest may have changed after computing the dependencies.
    manifest = self.manifest()

    signal_col = Column(path=source_path, alias='value', signal_udf=signal)
    select_rows_result = self.select_rows([ROWID, signal_col],
                                          task_step_id=task_step_id,
                                          resolve_span=True)
    df = select_rows_result.df()
    values = df['value']

    enriched_path = _col_destination_path(signal_col, is_computed_signal=True)
    spec = _split_path_into_subpaths_of_lists(enriched_path)
    output_dir = os.path.join(self.dataset_path, _signal_dir(enriched_path))
    signal_schema = create_signal_schema(signal, source_path, manifest.data_schema)
    enriched_signal_items = cast(Iterable[Item], wrap_in_dicts(values, spec))
    for rowid, item in zip(df[ROWID], enriched_signal_items):
      item[ROWID] = rowid

    enriched_signal_items = list(enriched_signal_items)
    parquet_filename, _ = write_items_to_parquet(
      items=enriched_signal_items,
      output_dir=output_dir,
      schema=signal_schema,
      filename_prefix='data',
      shard_index=0,
      num_shards=1)

    signal_manifest = SignalManifest(
      files=[parquet_filename],
      data_schema=signal_schema,
      signal=signal,
      enriched_path=source_path,
      parquet_id=make_parquet_id(signal, source_path, is_computed_signal=True))
    signal_manifest_filepath = os.path.join(output_dir, SIGNAL_MANIFEST_FILENAME)
    with open_file(signal_manifest_filepath, 'w') as f:
      f.write(signal_manifest.json(exclude_none=True, indent=2))

    self._update_config(signals=[SignalConfig(path=source_path, signal=signal)])

    log(f'Wrote signal output to {output_dir}')

  @override
  def compute_embedding(self,
                        embedding: str,
                        path: Path,
                        task_step_id: Optional[TaskStepId] = None) -> None:
    source_path = normalize_path(path)
    manifest = self.manifest()

    if task_step_id is None:
      # Make a dummy task step so we report progress via tqdm.
      task_step_id = ('', 0)

    signal = get_signal_by_type(embedding, TextEmbeddingSignal)()
    signal_col = Column(path=source_path, alias='value', signal_udf=signal)
    select_rows_result = self.select_rows([ROWID, signal_col],
                                          task_step_id=task_step_id,
                                          resolve_span=True)
    df = select_rows_result.df()
    values = df['value']

    enriched_path = _col_destination_path(signal_col, is_computed_signal=True)
    output_dir = os.path.join(self.dataset_path, _signal_dir(enriched_path))
    signal_schema = create_signal_schema(signal, source_path, manifest.data_schema)

    write_embeddings_to_disk(
      vector_store=self.vector_store, rowids=df[ROWID], signal_items=values, output_dir=output_dir)

    del select_rows_result, df, values
    gc.collect()

    signal_manifest = SignalManifest(
      files=[],
      data_schema=signal_schema,
      signal=signal,
      enriched_path=source_path,
      parquet_id=make_parquet_id(signal, source_path, is_computed_signal=True),
      vector_store=self.vector_store)
    signal_manifest_filepath = os.path.join(output_dir, SIGNAL_MANIFEST_FILENAME)

    with open_file(signal_manifest_filepath, 'w') as f:
      f.write(signal_manifest.json(exclude_none=True, indent=2))

    self._update_config(embeddings=[EmbeddingConfig(path=source_path, embedding=embedding)])

    log(f'Wrote embedding index to {output_dir}')

  @override
  def delete_signal(self, signal_path: Path) -> None:
    signal_path = normalize_path(signal_path)
    manifest = self.manifest()
    if not manifest.data_schema.has_field(signal_path):
      raise ValueError(f'Unknown signal path: {signal_path}')

    output_dir = os.path.join(self.dataset_path, _signal_dir(signal_path))
    shutil.rmtree(output_dir, ignore_errors=True)

  def _validate_filters(self, filters: Sequence[Filter], col_aliases: dict[str, PathTuple],
                        manifest: DatasetManifest) -> None:
    for filter in filters:
      if filter.path[0] in col_aliases:
        # This is a filter on a column alias, which is always allowed.
        continue

      current_field = Field(fields=manifest.data_schema.fields)
      if filter.path == (ROWID,):
        return
      for path_part in filter.path:
        if path_part == VALUE_KEY:
          if not current_field.dtype:
            raise ValueError(f'Unable to filter on path {filter.path}. The field has no value.')
          continue
        if current_field.fields:
          if path_part not in current_field.fields:
            raise ValueError(f'Unable to filter on path {filter.path}. '
                             f'Path part "{path_part}" not found in the dataset.')
          current_field = current_field.fields[str(path_part)]
          continue
        elif current_field.repeated_field:
          current_field = current_field.repeated_field
          continue
        else:
          raise ValueError(f'Unable to filter on path {filter.path}. '
                           f'Path part "{path_part}" is not defined on a primitive value.')

      while current_field.repeated_field:
        current_field = current_field.repeated_field
        filter.path = (*filter.path, PATH_WILDCARD)

      if not current_field.dtype:
        raise ValueError(f'Unable to filter on path {filter.path}. The field has no value.')

  def _validate_udfs(self, udf_cols: Sequence[Column], source_schema: Schema) -> None:
    for col in udf_cols:
      path = col.path

      # Signal transforms must operate on a leaf field.
      leaf = source_schema.leafs.get(path)
      if not leaf or not leaf.dtype:
        raise ValueError(f'Leaf "{path}" not found in dataset. '
                         'Signal transforms must operate on a leaf field.')

      # Signal transforms must have the same dtype as the leaf field.
      signal = cast(Signal, col.signal_udf)
      if not signal_type_supports_dtype(signal.input_type, leaf.dtype):
        raise ValueError(f'Leaf "{path}" has dtype "{leaf.dtype}" which is not supported '
                         f'by "{signal.key()}" with signal input type "{signal.input_type}".')

  def _validate_selection(self, columns: Sequence[Column], select_schema: Schema) -> None:
    # Validate all the columns and make sure they exist in the `select_schema`.
    for column in columns:
      current_field = Field(fields=select_schema.fields)
      path = column.path
      if path == (ROWID,):
        return
      for path_part in path:
        if path_part == VALUE_KEY:
          if not current_field.dtype:
            raise ValueError(f'Unable to select path {path}. The field has no value.')
          continue
        if current_field.fields:
          if path_part not in current_field.fields:
            raise ValueError(f'Unable to select path {path}. '
                             f'Path part "{path_part}" not found in the dataset.')
          current_field = current_field.fields[path_part]
          continue
        elif current_field.repeated_field:
          if path_part.isdigit():
            raise ValueError(f'Unable to select path {path}. Selecting a specific index of '
                             'a repeated field is currently not supported.')
          if path_part != PATH_WILDCARD:
            raise ValueError(f'Unable to select path {path}. '
                             f'Path part "{path_part}" should be a wildcard.')
          current_field = current_field.repeated_field
        elif not current_field.dtype:
          raise ValueError(f'Unable to select path {path}. '
                           f'Path part "{path_part}" is not defined on a primitive value.')

  def _validate_columns(self, columns: Sequence[Column], source_schema: Schema,
                        select_schema: Schema) -> None:
    udf_cols = [col for col in columns if col.signal_udf]
    self._validate_udfs(udf_cols, source_schema)
    self._validate_selection(columns, select_schema)

  def _validate_sort_path(self, path: PathTuple, schema: Schema) -> None:
    current_field = Field(fields=schema.fields)
    if path == (ROWID,):
      return
    for path_part in path:
      if path_part == VALUE_KEY:
        if not current_field.dtype:
          raise ValueError(f'Unable to sort by path {path}. The field has no value.')
        continue
      if current_field.fields:
        if path_part not in current_field.fields:
          raise ValueError(f'Unable to sort by path {path}. '
                           f'Path part "{path_part}" not found in the dataset.')
        current_field = current_field.fields[path_part]
        continue
      elif current_field.repeated_field:
        if path_part.isdigit():
          raise ValueError(f'Unable to sort by path {path}. Selecting a specific index of '
                           'a repeated field is currently not supported.')
        if path_part != PATH_WILDCARD:
          raise ValueError(f'Unable to sort by path {path}. '
                           f'Path part "{path_part}" should be a wildcard.')
        current_field = current_field.repeated_field
      elif not current_field.dtype:
        raise ValueError(f'Unable to sort by path {path}. '
                         f'Path part "{path_part}" is not defined on a primitive value.')
    if not current_field.dtype:
      raise ValueError(f'Unable to sort by path {path}. The field has no value.')

  @override
  @functools.cache  # Cache stats for leaf paths since we ask on every dataset page refresh.
  def stats(self, leaf_path: Path) -> StatsResult:
    if not leaf_path:
      raise ValueError('leaf_path must be provided')
    path = normalize_path(leaf_path)
    manifest = self.manifest()
    leaf = manifest.data_schema.get_field(path)
    # Find the inner-most leaf in case this field is repeated.
    while leaf.repeated_field:
      leaf = leaf.repeated_field
      path = (*path, PATH_WILDCARD)

    if not leaf.dtype:
      raise ValueError(f'Leaf "{path}" not found in dataset')

    duckdb_path = self._leaf_path_to_duckdb_path(path, manifest.data_schema)
    inner_select = _select_sql(
      duckdb_path, flatten=True, unnest=True, span_from=self._get_span_from(path, manifest))

    # Compute the average length of text fields.
    avg_text_length: Optional[int] = None
    if leaf.dtype in (DataType.STRING, DataType.STRING_SPAN):
      avg_length_query = f"""
        SELECT avg(length(val))
        FROM (SELECT {inner_select} AS val FROM t) USING SAMPLE {SAMPLE_AVG_TEXT_LENGTH};
      """
      row = self._query(avg_length_query)[0]
      if row[0] is not None:
        avg_text_length = int(row[0])

    total_count_query = f'SELECT count(val) FROM (SELECT {inner_select} as val FROM t)'
    total_count = int(self._query(total_count_query)[0][0])

    # Compute approximate count by sampling the data to avoid OOM.
    if avg_text_length and avg_text_length > MAX_TEXT_LEN_DISTINCT_COUNT:
      # Assume that every text field is unique.
      approx_count_distinct = manifest.num_items
    elif leaf.dtype == DataType.BOOLEAN:
      approx_count_distinct = 2
    else:
      sample_size = TOO_MANY_DISTINCT
      approx_count_query = f"""
        SELECT approx_count_distinct(val) as approxCountDistinct
        FROM (SELECT {inner_select} AS val FROM t) USING SAMPLE {sample_size};
      """
      row = self._query(approx_count_query)[0]
      approx_count_distinct = int(row[0])

      # Adjust the counts for the sample size.
      factor = max(1, total_count / sample_size)
      approx_count_distinct = round(approx_count_distinct * factor)

    result = StatsResult(
      path=path,
      total_count=total_count,
      approx_count_distinct=approx_count_distinct,
      avg_text_length=avg_text_length)

    # Compute min/max values for ordinal leafs, without sampling the data.
    if is_ordinal(leaf.dtype):
      min_max_query = f"""
        SELECT MIN(val) AS minVal, MAX(val) AS maxVal
        FROM (SELECT {inner_select} as val FROM t)
        {'WHERE NOT isnan(val)' if is_float(leaf.dtype) else ''}
      """
      row = self._query(min_max_query)[0]
      result.min_val, result.max_val = row

    return result
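  # Worked example of the sampling adjustment above (made-up numbers): with
  # total_count = 2_000_000 and sample_size = TOO_MANY_DISTINCT = 500_000, a sampled
  # approx_count_distinct of 120_000 is scaled by factor = max(1, 2_000_000 / 500_000) = 4.0,
  # giving round(120_000 * 4.0) = 480_000 estimated distinct values.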
648 |
+
|
649 |
+
@override
|
650 |
+
def select_groups(
|
651 |
+
self,
|
652 |
+
leaf_path: Path,
|
653 |
+
filters: Optional[Sequence[FilterLike]] = None,
|
654 |
+
sort_by: Optional[GroupsSortBy] = GroupsSortBy.COUNT,
|
655 |
+
sort_order: Optional[SortOrder] = SortOrder.DESC,
|
656 |
+
limit: Optional[int] = None,
|
657 |
+
bins: Optional[Union[Sequence[Bin], Sequence[float]]] = None) -> SelectGroupsResult:
|
658 |
+
if not leaf_path:
|
659 |
+
raise ValueError('leaf_path must be provided')
|
660 |
+
path = normalize_path(leaf_path)
|
661 |
+
manifest = self.manifest()
|
662 |
+
leaf = manifest.data_schema.get_field(path)
|
663 |
+
# Find the inner-most leaf in case this field is repeated.
|
664 |
+
while leaf.repeated_field:
|
665 |
+
leaf = leaf.repeated_field
|
666 |
+
path = (*path, PATH_WILDCARD)
|
667 |
+
|
668 |
+
if not leaf.dtype:
|
669 |
+
raise ValueError(f'Leaf "{path}" not found in dataset')
|
670 |
+
|
671 |
+
inner_val = 'inner_val'
|
672 |
+
outer_select = inner_val
|
673 |
+
# Normalize the bins to be `list[Bin]`.
|
674 |
+
named_bins = _normalize_bins(bins or leaf.bins)
|
675 |
+
stats = self.stats(leaf_path)
|
676 |
+
|
677 |
+
leaf_is_float = is_float(leaf.dtype)
|
678 |
+
leaf_is_integer = is_integer(leaf.dtype)
|
679 |
+
if not leaf.categorical and (leaf_is_float or leaf_is_integer):
|
680 |
+
if named_bins is None:
|
681 |
+
# Auto-bin.
|
682 |
+
named_bins = _auto_bins(stats, NUM_AUTO_BINS)
|
683 |
+
|
684 |
+
sql_bounds = []
|
685 |
+
for label, start, end in named_bins:
|
686 |
+
if start is None:
|
687 |
+
start = cast(float, "'-Infinity'")
|
688 |
+
if end is None:
|
689 |
+
end = cast(float, "'Infinity'")
|
690 |
+
sql_bounds.append(f"('{label}', {start}, {end})")
|
691 |
+
|
692 |
+
bin_index_col = 'col0'
|
693 |
+
bin_min_col = 'col1'
|
694 |
+
bin_max_col = 'col2'
|
695 |
+
is_nan_filter = f'NOT isnan({inner_val}) AND' if leaf_is_float else ''
|
696 |
+
|
697 |
+
# We cast the field to `double` so binning works for both `float` and `int` fields.
|
698 |
+
outer_select = f"""(
|
699 |
+
SELECT {bin_index_col} FROM (
|
700 |
+
VALUES {', '.join(sql_bounds)}
|
701 |
+
) WHERE {is_nan_filter}
|
702 |
+
{inner_val}::DOUBLE >= {bin_min_col} AND {inner_val}::DOUBLE < {bin_max_col}
|
703 |
+
)"""
|
704 |
+
else:
|
705 |
+
if stats.approx_count_distinct >= dataset.TOO_MANY_DISTINCT:
|
706 |
+
return SelectGroupsResult(too_many_distinct=True, counts=[], bins=named_bins)
|
707 |
+
|
708 |
+
count_column = 'count'
|
709 |
+
value_column = 'value'
|
710 |
+
|
711 |
+
limit_query = f'LIMIT {limit}' if limit else ''
|
712 |
+
duckdb_path = self._leaf_path_to_duckdb_path(path, manifest.data_schema)
|
713 |
+
inner_select = _select_sql(
|
714 |
+
duckdb_path, flatten=True, unnest=True, span_from=self._get_span_from(path, manifest))
|
715 |
+
|
716 |
+
filters, _ = self._normalize_filters(filters, col_aliases={}, udf_aliases={}, manifest=manifest)
|
717 |
+
filter_queries = self._create_where(manifest, filters, searches=[])
|
718 |
+
|
719 |
+
where_query = ''
|
720 |
+
if filter_queries:
|
721 |
+
where_query = f"WHERE {' AND '.join(filter_queries)}"
|
722 |
+
|
723 |
+
query = f"""
|
724 |
+
SELECT {outer_select} AS {value_column}, COUNT() AS {count_column}
|
725 |
+
FROM (SELECT {inner_select} AS {inner_val} FROM t {where_query})
|
726 |
+
GROUP BY {value_column}
|
727 |
+
ORDER BY {sort_by} {sort_order}
|
728 |
+
{limit_query}
|
729 |
+
"""
|
730 |
+
df = self._query_df(query)
|
731 |
+
counts = list(df.itertuples(index=False, name=None))
|
732 |
+
if is_temporal(leaf.dtype):
|
733 |
+
# Replace any NaT with None and pd.Timestamp to native datetime objects.
|
734 |
+
counts = [(None if pd.isnull(val) else val.to_pydatetime(), count) for val, count in counts]
|
735 |
+
return SelectGroupsResult(too_many_distinct=False, counts=counts, bins=named_bins)
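
  # Usage sketch (illustrative; the numeric 'age' leaf and edges are assumptions).
  # Numeric leaves are auto-binned unless explicit bin edges are passed; edges
  # normalize to half-open, labeled intervals:
  #
  #   groups = dataset.select_groups(leaf_path='age', bins=[18.0, 65.0])
  #   # Bins: ('0', -inf, 18.0), ('1', 18.0, 65.0), ('2', 65.0, inf).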

  def _topk_udf_to_sort_by(
      self,
      udf_columns: list[Column],
      sort_by: list[PathTuple],
      limit: Optional[int],
      sort_order: Optional[SortOrder],
  ) -> Optional[Column]:
    if (sort_order != SortOrder.DESC) or (not limit) or (not sort_by):
      return None
    if len(sort_by) < 1:
      return None
    primary_sort_by = sort_by[0]
    udf_cols_to_sort_by = [
      udf_col for udf_col in udf_columns if udf_col.alias == primary_sort_by[0] or
      _path_contains(_col_destination_path(udf_col), primary_sort_by)
    ]
    if not udf_cols_to_sort_by:
      return None
    udf_col = udf_cols_to_sort_by[0]
    if udf_col.signal_udf and not isinstance(udf_col.signal_udf, VectorSignal):
      return None
    return udf_col

  def _normalize_columns(self, columns: Optional[Sequence[ColumnId]], schema: Schema,
                         combine_columns: bool) -> list[Column]:
    """Normalizes the columns to a list of `Column` objects."""
    cols = [column_from_identifier(col) for col in columns or []]
    star_in_cols = any(col.path == (PATH_WILDCARD,) for col in cols)
    if not cols or star_in_cols:
      # Select all columns.
      cols.extend([Column((name,)) for name in schema.fields.keys() if name != ROWID])

      if not combine_columns:
        # Select all the signal top-level fields.
        for path, field in schema.all_fields:
          if field.signal:
            cols.append(Column(path))

    if star_in_cols:
      cols = [col for col in cols if col.path != (PATH_WILDCARD,)]
    return cols

  def _merge_sorts(self, search_udfs: list[DuckDBSearchUDF], sort_by: Optional[Sequence[Path]],
                   sort_order: Optional[SortOrder]) -> list[SortResult]:
    # True when the user has explicitly sorted by the alias of a search UDF (e.g. in ASC order).
    is_explicit_search_sort = False
    for sort_by_path in sort_by or []:
      for search_udf in search_udfs:
        if column_paths_match(sort_by_path, search_udf.output_path):
          is_explicit_search_sort = True
          break

    sort_results: list[SortResult] = []
    if sort_by and not is_explicit_search_sort:
      if not sort_order:
        raise ValueError('`sort_order` is required when `sort_by` is specified.')
      # If the user has explicitly set a sort by, and it's not a search UDF alias, override.
      sort_results = [
        SortResult(path=normalize_path(sort_by), order=sort_order) for sort_by in sort_by if sort_by
      ]
    else:
      search_udfs_with_sort = [search_udf for search_udf in search_udfs if search_udf.sort]
      if search_udfs_with_sort:
        # Override the sort by the last search sort order when the user hasn't provided an
        # explicit sort order.
        last_search_udf = search_udfs_with_sort[-1]
        assert last_search_udf.sort, 'Expected search UDFs with sort to have a sort.'
        udf_sort_path, udf_sort_order = last_search_udf.sort
        sort_results = [
          SortResult(
            path=udf_sort_path,
            order=sort_order or udf_sort_order,
            search_index=len(search_udfs_with_sort) - 1)
        ]

    return sort_results

  @override
  def select_rows(self,
                  columns: Optional[Sequence[ColumnId]] = None,
                  searches: Optional[Sequence[Search]] = None,
                  filters: Optional[Sequence[FilterLike]] = None,
                  sort_by: Optional[Sequence[Path]] = None,
                  sort_order: Optional[SortOrder] = SortOrder.DESC,
                  limit: Optional[int] = None,
                  offset: Optional[int] = 0,
                  task_step_id: Optional[TaskStepId] = None,
                  resolve_span: bool = False,
                  combine_columns: bool = False,
                  user: Optional[UserInfo] = None) -> SelectRowsResult:
    manifest = self.manifest()
    cols = self._normalize_columns(columns, manifest.data_schema, combine_columns)
    offset = offset or 0
    schema = manifest.data_schema

    if combine_columns:
      schema = self.select_rows_schema(
        columns, sort_by, sort_order, searches, combine_columns=True).data_schema

    self._validate_columns(cols, manifest.data_schema, schema)
    self._normalize_searches(searches, manifest)
    search_udfs = self._search_udfs(searches, manifest)
    cols.extend([search_udf.udf for search_udf in search_udfs])
    udf_columns = [col for col in cols if col.signal_udf]

    temp_rowid_selected = False
    for col in cols:
      if col.path == (ROWID,):
        temp_rowid_selected = False
        break
      if isinstance(col.signal_udf, VectorSignal):
        temp_rowid_selected = True
    if temp_rowid_selected:
      cols.append(Column(ROWID))

    # Set extra information on any concept signals.
    for udf_col in udf_columns:
      if isinstance(udf_col.signal_udf, (ConceptSignal, ConceptLabelsSignal)):
        # Concepts are access controlled so we tell them about the user.
        udf_col.signal_udf.set_user(user)

    # Decide on the exact sorting order.
    sort_results = self._merge_sorts(search_udfs, sort_by, sort_order)
    sort_by = cast(list[PathTuple],
                   [(sort.alias,) if sort.alias else sort.path for sort in sort_results])
    # Choose the first sort order as we only support a single sort order for now.
    sort_order = sort_results[0].order if sort_results else None

    col_aliases: dict[str, PathTuple] = {col.alias: col.path for col in cols if col.alias}
    udf_aliases: dict[str, PathTuple] = {
      col.alias: col.path for col in cols if col.signal_udf and col.alias
    }
    path_to_udf_col_name: dict[PathTuple, str] = {}
    for col in cols:
      if col.signal_udf:
        alias = col.alias or _unique_alias(col)
        dest_path = _col_destination_path(col)
        path_to_udf_col_name[dest_path] = alias

    # Filtering and searching.
    where_query = ''
    filters, udf_filters = self._normalize_filters(filters, col_aliases, udf_aliases, manifest)
    filter_queries = self._create_where(manifest, filters, searches)
    if filter_queries:
      where_query = f"WHERE {' AND '.join(filter_queries)}"

    total_num_rows = manifest.num_items
    con = self.con.cursor()

    topk_udf_col = self._topk_udf_to_sort_by(udf_columns, sort_by, limit, sort_order)
    if topk_udf_col:
      path_keys: Optional[list[PathKey]] = None
      if where_query:
        # If there are filters, we need to send rowids to the top k query.
        df = con.execute(f'SELECT {ROWID} FROM t {where_query}').df()
        total_num_rows = len(df)
        # Convert rowids to path keys.
        path_keys = [(rowid,) for rowid in df[ROWID]]

      if path_keys is not None and len(path_keys) == 0:
        where_query = 'WHERE false'
      else:
        topk_signal = cast(VectorSignal, topk_udf_col.signal_udf)
        # The input is an embedding.
        vector_index = self._get_vector_db_index(topk_signal.embedding, topk_udf_col.path)
        k = (limit or 0) + offset
        path_id = f'{self.namespace}/{self.dataset_name}:{topk_udf_col.path}'
        with DebugTimer(f'Computing topk on {path_id} with embedding "{topk_signal.embedding}" '
                        f'and vector store "{vector_index._vector_store.name}"'):
          topk = topk_signal.vector_compute_topk(k, vector_index, path_keys)
          topk_rowids = list(dict.fromkeys([cast(str, rowid) for (rowid, *_), _ in topk]))
          # Update the offset to account for the number of unique rowids.
          offset = len(dict.fromkeys([cast(str, rowid) for (rowid, *_), _ in topk[:offset]]))

        # Ignore all the other filters and filter DuckDB results only by the top k rowids.
        rowid_filter = Filter(path=(ROWID,), op='in', value=topk_rowids)
        filter_query = self._create_where(manifest, [rowid_filter])[0]
        where_query = f'WHERE {filter_query}'

    # Map a final column name to a list of temporary namespaced column names that need to be merged.
    columns_to_merge: dict[str, dict[str, Column]] = {}
    temp_column_to_offset_column: dict[str, tuple[str, Field]] = {}
    select_queries: list[str] = []

    for column in cols:
      path = column.path
      # If the signal is vector-based, we don't need to select the actual data, just the rowids
      # plus an arbitrarily nested array of `None`s.
      empty = bool(column.signal_udf and schema.get_field(path).dtype == DataType.EMBEDDING)

      select_sqls: list[str] = []
      final_col_name = column.alias or _unique_alias(column)
      if final_col_name not in columns_to_merge:
        columns_to_merge[final_col_name] = {}

      duckdb_paths = self._column_to_duckdb_paths(column, schema, combine_columns)
      span_from = self._get_span_from(path, manifest) if resolve_span or column.signal_udf else None

      for parquet_id, duckdb_path in duckdb_paths:
        sql = _select_sql(
          duckdb_path, flatten=False, unnest=False, empty=empty, span_from=span_from)
        temp_column_name = (
          final_col_name if len(duckdb_paths) == 1 else f'{final_col_name}/{parquet_id}')
        select_sqls.append(f'{sql} AS {_escape_string_literal(temp_column_name)}')
        columns_to_merge[final_col_name][temp_column_name] = column

        if column.signal_udf and span_from and _schema_has_spans(column.signal_udf.fields()):
          sql = _select_sql(duckdb_path, flatten=False, unnest=False, empty=empty, span_from=None)
          temp_offset_column_name = f'{temp_column_name}/offset'
          temp_offset_column_name = temp_offset_column_name.replace("'", "\\'")
          select_sqls.append(f'{sql} AS {_escape_string_literal(temp_offset_column_name)}')
          temp_column_to_offset_column[temp_column_name] = (temp_offset_column_name,
                                                            column.signal_udf.fields())

      # `select_sqls` can be empty if this column points to a path that will be created by a UDF.
      if select_sqls:
        select_queries.append(', '.join(select_sqls))

    sort_sql_before_udf: list[str] = []
    sort_sql_after_udf: list[str] = []

    for path in sort_by:
      # We only allow sorting by nodes with a value.
      first_subpath = str(path[0])
      rest_of_path = path[1:]
      signal_alias = '.'.join(map(str, path))

      udf_path = _path_to_udf_duckdb_path(path, path_to_udf_col_name)
      if not udf_path:
        # Re-route the path if it starts with an alias by pointing it to the actual path.
        if first_subpath in col_aliases:
          path = (*col_aliases[first_subpath], *rest_of_path)
        self._validate_sort_path(path, schema)
        path = self._leaf_path_to_duckdb_path(path, schema)
      else:
        path = udf_path

      sort_sql = _select_sql(path, flatten=True, unnest=False)
      has_repeated_field = any(subpath == PATH_WILDCARD for subpath in path)
      if has_repeated_field:
        sort_sql = (f'list_min({sort_sql})'
                    if sort_order == SortOrder.ASC else f'list_max({sort_sql})')

      # Separate sort columns into two groups: those that need to be sorted before and after UDFs.
      if udf_path:
        sort_sql_after_udf.append(sort_sql)
      else:
        sort_sql_before_udf.append(sort_sql)

    order_query = ''
    if sort_sql_before_udf:
      order_query = (f'ORDER BY {", ".join(sort_sql_before_udf)} '
                     f'{cast(SortOrder, sort_order).value}')

    limit_query = ''
    if limit:
      if topk_udf_col:
        limit_query = f'LIMIT {limit + offset}'
      elif sort_sql_after_udf:
        limit_query = ''
      else:
        limit_query = f'LIMIT {limit} OFFSET {offset}'

    if not topk_udf_col and where_query:
      total_num_rows = cast(tuple,
                            con.execute(f'SELECT COUNT(*) FROM t {where_query}').fetchone())[0]

    # Fetch the data from DuckDB.
    df = con.execute(f"""
      SELECT {', '.join(select_queries)} FROM t
      {where_query}
      {order_query}
      {limit_query}
    """).df()
    df = _replace_nan_with_none(df)

    # Run UDFs on the transformed columns.
    for udf_col in udf_columns:
      signal = cast(Signal, udf_col.signal_udf)
      signal_alias = udf_col.alias or _unique_alias(udf_col)
      temp_signal_cols = columns_to_merge[signal_alias]
      if len(temp_signal_cols) != 1:
        raise ValueError(
          f'Unable to compute signal {signal.name}. Signal UDFs only operate on leafs, but got '
          f'{len(temp_signal_cols)} underlying columns that contain data related to {udf_col.path}.'
        )
      signal_column = list(temp_signal_cols.keys())[0]
      input = df[signal_column]

      path_id = f'{self.namespace}/{self.dataset_name}:{udf_col.path}'
      with DebugTimer(f'Computing signal "{signal.name}" on {path_id}'):
        signal.setup()

        step_description = f'Computing {signal.key()} on {path_id}'

        if isinstance(signal, VectorSignal):
          embedding_signal = signal
          vector_store = self._get_vector_db_index(embedding_signal.embedding, udf_col.path)
          flat_keys = list(flatten_keys(df[ROWID], input))
          signal_out = sparse_to_dense_compute(
            iter(flat_keys), lambda keys: embedding_signal.vector_compute(keys, vector_store))
          # Add progress.
          if task_step_id is not None:
            signal_out = progress(
              signal_out,
              task_step_id=task_step_id,
              estimated_len=len(flat_keys),
              step_description=step_description)
          df[signal_column] = deep_unflatten(signal_out, input)
        else:
          num_rich_data = count_primitives(input)
          flat_input = cast(Iterator[Optional[RichData]], deep_flatten(input))
          signal_out = sparse_to_dense_compute(
            flat_input, lambda x: signal.compute(cast(Iterable[RichData], x)))
          # Add progress.
          if task_step_id is not None:
            signal_out = progress(
              signal_out,
              task_step_id=task_step_id,
              estimated_len=num_rich_data,
              step_description=step_description)
          signal_out_list = list(signal_out)
          if signal_column in temp_column_to_offset_column:
            offset_column_name, field = temp_column_to_offset_column[signal_column]
            nested_spans: Iterable[Item] = df[offset_column_name]
            flat_spans = deep_flatten(nested_spans)
            for span, item in zip(flat_spans, signal_out_list):
              _offset_any_span(cast(int, span[VALUE_KEY][TEXT_SPAN_START_FEATURE]), item, field)

          if len(signal_out_list) != num_rich_data:
            raise ValueError(
              f'The signal generated {len(signal_out_list)} values but the input data had '
              f"{num_rich_data} values. This means the signal either didn't generate a "
              '"None" for a sparse output, or generated too many items.')

          df[signal_column] = deep_unflatten(signal_out_list, input)

        signal.teardown()

    if not df.empty and (udf_filters or sort_sql_after_udf):
      # Re-upload the udf outputs to duckdb so we can filter/sort on them.
      rel = con.from_df(df)

      if udf_filters:
        udf_filter_queries = self._create_where(manifest, udf_filters)
        if udf_filter_queries:
          rel = rel.filter(' AND '.join(udf_filter_queries))
          total_num_rows = cast(tuple, rel.count('*').fetchone())[0]

      if sort_sql_after_udf:
        if not sort_order:
          raise ValueError('`sort_order` is required when `sort_by` is specified.')
        rel = rel.order(f'{", ".join(sort_sql_after_udf)} {sort_order.value}')

      if limit:
        rel = rel.limit(limit, offset)

      df = _replace_nan_with_none(rel.df())

    if temp_rowid_selected:
      del df[ROWID]
      del columns_to_merge[ROWID]

    if combine_columns:
      all_columns: dict[str, Column] = {}
      for col_dict in columns_to_merge.values():
        all_columns.update(col_dict)
      columns_to_merge = {'*': all_columns}

    for offset_column, _ in temp_column_to_offset_column.values():
      del df[offset_column]

    for final_col_name, temp_columns in columns_to_merge.items():
      for temp_col_name, column in temp_columns.items():
        if combine_columns:
          dest_path = _col_destination_path(column)
          spec = _split_path_into_subpaths_of_lists(dest_path)
          df[temp_col_name] = wrap_in_dicts(df[temp_col_name], spec)

        # If the temp col name is the same as the final name, we can skip merging. This happens when
        # we select a source leaf column.
        if temp_col_name == final_col_name:
          continue

        if final_col_name not in df:
          df[final_col_name] = df[temp_col_name]
        else:
          df[final_col_name] = merge_series(df[final_col_name], df[temp_col_name])
        del df[temp_col_name]

    con.close()

    if combine_columns:
      # Since we aliased every column to `*`, the object will have only '*' as the key. We need to
      # elevate all the columns under '*'.
      df = pd.DataFrame.from_records(df['*'])

    return SelectRowsResult(df, total_num_rows)
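
  # Usage sketch (illustrative; the 'text' and 'label' paths are assumptions).
  # A filtered, sorted page of rows; a 2-tuple filter maps to a unary op:
  #
  #   rows = dataset.select_rows(
  #     columns=['text'],
  #     filters=[('label', 'exists')],
  #     sort_by=['text'],
  #     sort_order=SortOrder.ASC,
  #     limit=10,
  #     offset=20)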

  @override
  def select_rows_schema(self,
                         columns: Optional[Sequence[ColumnId]] = None,
                         sort_by: Optional[Sequence[Path]] = None,
                         sort_order: Optional[SortOrder] = None,
                         searches: Optional[Sequence[Search]] = None,
                         combine_columns: bool = False) -> SelectRowsSchemaResult:
    """Returns the schema of the result of `select_rows` above with the same arguments."""
    if not combine_columns:
      raise NotImplementedError(
        'select_rows_schema with combine_columns=False is not yet supported.')
    manifest = self.manifest()
    cols = self._normalize_columns(columns, manifest.data_schema, combine_columns)

    self._normalize_searches(searches, manifest)
    search_udfs = self._search_udfs(searches, manifest)
    cols.extend([search_udf.udf for search_udf in search_udfs])

    udfs: list[SelectRowsSchemaUDF] = []
    col_schemas: list[Schema] = []
    for col in cols:
      dest_path = _col_destination_path(col)
      if col.signal_udf:
        udfs.append(SelectRowsSchemaUDF(path=dest_path, alias=col.alias))
        field = col.signal_udf.fields()
        field.signal = col.signal_udf.dict()
      elif manifest.data_schema.has_field(dest_path):
        field = manifest.data_schema.get_field(dest_path)
      else:
        # This column might refer to an output of a udf. We postpone validation to later.
        continue
      col_schemas.append(_make_schema_from_path(dest_path, field))

    sort_results = self._merge_sorts(search_udfs, sort_by, sort_order)

    search_results = [
      SearchResultInfo(search_path=search_udf.search_path, result_path=search_udf.output_path)
      for search_udf in search_udfs
    ]

    new_schema = merge_schemas(col_schemas)

    # Now that we have the new schema, we can validate all the column selections.
    self._validate_columns(cols, manifest.data_schema, new_schema)

    return SelectRowsSchemaResult(
      data_schema=new_schema, udfs=udfs, search_results=search_results, sorts=sort_results or None)

  @override
  def media(self, item_id: str, leaf_path: Path) -> MediaResult:
    raise NotImplementedError('Media is not yet supported for the DuckDB implementation.')

  def _get_span_from(self, path: PathTuple, manifest: DatasetManifest) -> Optional[PathTuple]:
    leafs = manifest.data_schema.leafs
    # Remove the value key so we can check the dtype from leafs.
    span_path = path[:-1] if path[-1] == VALUE_KEY else path
    is_span = (span_path in leafs and leafs[span_path].dtype == DataType.STRING_SPAN)
    return _derived_from_path(path, manifest.data_schema) if is_span else None

  def _leaf_path_to_duckdb_path(self, leaf_path: PathTuple, schema: Schema) -> PathTuple:
    ((_, duckdb_path),) = self._column_to_duckdb_paths(
      Column(leaf_path), schema, combine_columns=False, select_leaf=True)
    return duckdb_path

  def _column_to_duckdb_paths(self,
                              column: Column,
                              schema: Schema,
                              combine_columns: bool,
                              select_leaf: bool = False) -> list[tuple[str, PathTuple]]:
    path = column.path
    parquet_manifests: list[Union[SourceManifest, SignalManifest]] = [
      self._source_manifest, *self._signal_manifests
    ]
    duckdb_paths: list[tuple[str, PathTuple]] = []
    source_has_path = False

    select_leaf = select_leaf or column.signal_udf is not None

    if path == (ROWID,):
      return [('source', path)]

    for m in parquet_manifests:
      if not m.files:
        continue
      # Skip this parquet file if it doesn't contain the path.
      if not schema_contains_path(m.data_schema, path):
        continue

      if isinstance(m, SourceManifest):
        source_has_path = True

      if isinstance(m, SignalManifest) and source_has_path and not combine_columns:
        # Skip this signal if the source already has the path and we are not combining columns.
        continue

      # Skip this parquet file if the path doesn't have a dtype.
      if select_leaf and not m.data_schema.get_field(path).dtype:
        continue

      duckdb_path = path
      parquet_id = 'source'

      if isinstance(m, SignalManifest):
        duckdb_path = (m.parquet_id, *path[1:])
        parquet_id = m.parquet_id

      duckdb_paths.append((parquet_id, duckdb_path))

    if not duckdb_paths:
      # This path is probably a result of a udf. Make sure the result schema contains it.
      if not schema.has_field(path):
        raise ValueError(f'Invalid path "{path}": No manifest contains path. Valid paths: '
                         f'{list(schema.leafs.keys())}')

    return duckdb_paths

  def _normalize_filters(self, filter_likes: Optional[Sequence[FilterLike]],
                         col_aliases: dict[str, PathTuple], udf_aliases: dict[str, PathTuple],
                         manifest: DatasetManifest) -> tuple[list[Filter], list[Filter]]:
    """Normalize `FilterLike` to `Filter` and split into filters on source and filters on UDFs."""
    filter_likes = filter_likes or []
    filters: list[Filter] = []
    udf_filters: list[Filter] = []

    for filter in filter_likes:
      # Normalize `FilterLike` to `Filter`.
      if not isinstance(filter, Filter):
        if len(filter) == 3:
          path, op, value = filter  # type: ignore
        elif len(filter) == 2:
          path, op = filter  # type: ignore
          value = None
        else:
          raise ValueError(f'Invalid filter: {filter}. Must be a tuple with 2 or 3 elements.')
        filter = Filter(path=normalize_path(path), op=op, value=value)

      if str(filter.path[0]) in udf_aliases:
        udf_filters.append(filter)
      else:
        filters.append(filter)

    self._validate_filters(filters, col_aliases, manifest)
    return filters, udf_filters

  def _normalize_searches(self, searches: Optional[Sequence[Search]],
                          manifest: DatasetManifest) -> None:
    """Validate searches."""
    if not searches:
      return

    for search in searches:
      search.path = normalize_path(search.path)
      field = manifest.data_schema.get_field(search.path)
      if field.dtype != DataType.STRING:
        raise ValueError(f'Invalid search path: {search.path}. '
                         f'Must be a string field, got dtype {field.dtype}')

  def _search_udfs(self, searches: Optional[Sequence[Search]],
                   manifest: DatasetManifest) -> list[DuckDBSearchUDF]:
    """Create a UDF for each search for finding the location of the text with spans."""
    searches = searches or []
    search_udfs: list[DuckDBSearchUDF] = []
    for search in searches:
      search_path = normalize_path(search.path)
      if search.type == 'keyword':
        udf = Column(path=search_path, signal_udf=SubstringSignal(query=search.query))
        search_udfs.append(
          DuckDBSearchUDF(
            udf=udf,
            search_path=search_path,
            output_path=(*_col_destination_path(udf), PATH_WILDCARD)))
      elif search.type == 'semantic' or search.type == 'concept':
        embedding = search.embedding
        if not embedding:
          raise ValueError(f'Please provide an embedding for semantic search. Got search: {search}')

        try:
          manifest.data_schema.get_field((*search_path, embedding))
        except Exception as e:
          raise ValueError(
            f'Embedding {embedding} has not been computed. '
            f'Please compute the embedding index before issuing a {search.type} query.') from e

        search_signal: Optional[Signal] = None
        if search.type == 'semantic':
          search_signal = SemanticSimilaritySignal(query=search.query, embedding=search.embedding)
        elif search.type == 'concept':
          search_signal = ConceptSignal(
            namespace=search.concept_namespace,
            concept_name=search.concept_name,
            embedding=search.embedding)

          # Add the label UDF.
          concept_labels_signal = ConceptLabelsSignal(
            namespace=search.concept_namespace, concept_name=search.concept_name)
          concept_labels_udf = Column(path=search_path, signal_udf=concept_labels_signal)
          search_udfs.append(
            DuckDBSearchUDF(
              udf=concept_labels_udf,
              search_path=search_path,
              output_path=_col_destination_path(concept_labels_udf),
              sort=None))

        udf = Column(path=search_path, signal_udf=search_signal)

        output_path = _col_destination_path(udf)
        search_udfs.append(
          DuckDBSearchUDF(
            udf=udf,
            search_path=search_path,
            output_path=_col_destination_path(udf),
            sort=((*output_path, PATH_WILDCARD, 'score'), SortOrder.DESC)))
      else:
        raise ValueError(f'Unknown search operator {search.type}.')

    return search_udfs
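
  # Illustrative mapping from a search to a UDF (the 'text' path and query are
  # assumptions): a keyword search becomes a SubstringSignal column whose output
  # path ends with a wildcard over the matched spans:
  #
  #   Search(path='text', type='keyword', query='hello')
  #   # -> DuckDBSearchUDF(
  #   #      udf=Column(('text',), signal_udf=SubstringSignal(query='hello')), ...)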

  def _create_where(self,
                    manifest: DatasetManifest,
                    filters: list[Filter],
                    searches: Optional[Sequence[Search]] = []) -> list[str]:
    if not filters and not searches:
      return []
    searches = searches or []
    sql_filter_queries: list[str] = []

    # Add search where queries.
    for search in searches:
      duckdb_path = self._leaf_path_to_duckdb_path(
        normalize_path(search.path), manifest.data_schema)
      select_str = _select_sql(duckdb_path, flatten=False, unnest=False)
      if search.type == 'keyword':
        sql_op = 'ILIKE'
        query_val = _escape_like_value(search.query)
      elif search.type == 'semantic' or search.type == 'concept':
        # Semantic search and concepts don't yet filter.
        continue
      else:
        raise ValueError(f'Unknown search operator {search.type}.')

      filter_query = f'{select_str} {sql_op} {query_val}'

      sql_filter_queries.append(filter_query)

    # Add filter where queries.
    for f in filters:
      duckdb_path = self._leaf_path_to_duckdb_path(f.path, manifest.data_schema)
      select_str = _select_sql(
        duckdb_path, flatten=True, unnest=False, span_from=self._get_span_from(f.path, manifest))
      is_array = any(subpath == PATH_WILDCARD for subpath in f.path)

      nan_filter = ''
      field = manifest.data_schema.get_field(f.path)
      filter_nans = field.dtype and is_float(field.dtype)

      if f.op in BINARY_OPS:
        sql_op = BINARY_OP_TO_SQL[cast(BinaryOp, f.op)]
        filter_val = cast(FeatureValue, f.value)
        if isinstance(filter_val, str):
          filter_val = _escape_string_literal(filter_val)
        elif isinstance(filter_val, bytes):
          filter_val = _bytes_to_blob_literal(filter_val)
        else:
          filter_val = str(filter_val)
        if is_array:
          nan_filter = 'NOT isnan(x) AND' if filter_nans else ''
          filter_query = (f'len(list_filter({select_str}, '
                          f'x -> {nan_filter} x {sql_op} {filter_val})) > 0')
        else:
          nan_filter = f'NOT isnan({select_str}) AND' if filter_nans else ''
          filter_query = f'{nan_filter} {select_str} {sql_op} {filter_val}'
      elif f.op in UNARY_OPS:
        if f.op == 'exists':
          filter_query = f'len({select_str}) > 0' if is_array else f'{select_str} IS NOT NULL'
        else:
          raise ValueError(f'Unary op: {f.op} is not yet supported')
      elif f.op in LIST_OPS:
        if f.op == 'in':
          filter_list_val = cast(FeatureListValue, f.value)
          if not isinstance(filter_list_val, list):
            raise ValueError('filter with array value can only use the IN comparison')
          wrapped_filter_val = [f"'{part}'" for part in filter_list_val]
          filter_val = f'({", ".join(wrapped_filter_val)})'
          filter_query = f'{select_str} IN {filter_val}'
        else:
          raise ValueError(f'List op: {f.op} is not yet supported')
      else:
        raise ValueError(f'Invalid filter op: {f.op}')
      sql_filter_queries.append(filter_query)
    return sql_filter_queries
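
  # Illustrative WHERE fragments this method can generate (paths and values are
  # assumptions): a keyword search compiles to an escaped ILIKE, and a binary
  # filter on a float leaf gets a NaN guard:
  #
  #   "text" ILIKE '%hello%' ESCAPE '\'
  #   NOT isnan("score") AND "score" > 0.5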

  def _execute(self, query: str) -> duckdb.DuckDBPyConnection:
    """Execute a query in duckdb."""
    # FastAPI is multi-threaded so we have to create a thread-specific connection cursor to allow
    # these queries to be thread-safe.
    local_con = self.con.cursor()
    if not env('DEBUG', False):
      return local_con.execute(query)

    # Debug mode.
    log('Executing:')
    log(query)
    with DebugTimer('Query'):
      return local_con.execute(query)

  def _query(self, query: str) -> list[tuple]:
    result = self._execute(query)
    rows = result.fetchall()
    result.close()
    return rows

  def _query_df(self, query: str) -> pd.DataFrame:
    """Execute a query that returns a data frame."""
    result = self._execute(query)
    df = _replace_nan_with_none(result.df())
    result.close()
    return df

  def _path_to_col(self, path: Path, quote_each_part: bool = True) -> str:
    """Convert a path to a column name."""
    if isinstance(path, str):
      path = (path,)
    return '.'.join([
      f'{_escape_col_name(path_comp)}' if quote_each_part else str(path_comp) for path_comp in path
    ])

  def _get_selection(self, columns: Optional[Sequence[ColumnId]] = None) -> str:
    """Get the selection clause for downloading a dataset."""
    manifest = self.manifest()
    cols = self._normalize_columns(columns, manifest.data_schema, combine_columns=False)
    schema = manifest.data_schema
    self._validate_columns(cols, manifest.data_schema, schema)

    select_queries: list[str] = []
    for column in cols:
      col_name = column.alias or _unique_alias(column)
      duckdb_paths = self._column_to_duckdb_paths(column, schema, combine_columns=False)
      if not duckdb_paths:
        raise ValueError(f'Cannot download path {column.path} which does not exist in the dataset.')
      if len(duckdb_paths) > 1:
        raise ValueError(
          f'Cannot download path {column.path} which spans multiple parquet files: {duckdb_paths}')
      _, duckdb_path = duckdb_paths[0]
      sql = _select_sql(duckdb_path, flatten=False, unnest=False)
      select_queries.append(f'{sql} AS {_escape_string_literal(col_name)}')
    return ', '.join(select_queries)

  @override
  def to_json(self,
              filepath: Union[str, pathlib.Path],
              jsonl: bool = True,
              columns: Optional[Sequence[ColumnId]] = None) -> None:
    selection = self._get_selection(columns)
    self._execute(f"COPY (SELECT {selection} FROM t) TO '{filepath}' "
                  f"(FORMAT JSON, ARRAY {'FALSE' if jsonl else 'TRUE'})")
    log(f'Dataset exported to {filepath}')

  @override
  def to_pandas(self, columns: Optional[Sequence[ColumnId]] = None) -> pd.DataFrame:
    selection = self._get_selection(columns)
    return self._query_df(f'SELECT {selection} FROM t')

  @override
  def to_csv(self,
             filepath: Union[str, pathlib.Path],
             columns: Optional[Sequence[ColumnId]] = None) -> None:
    selection = self._get_selection(columns)
    self._execute(f"COPY (SELECT {selection} FROM t) TO '{filepath}' (FORMAT CSV, HEADER)")
    log(f'Dataset exported to {filepath}')

  @override
  def to_parquet(self,
                 filepath: Union[str, pathlib.Path],
                 columns: Optional[Sequence[ColumnId]] = None) -> None:
    selection = self._get_selection(columns)
    self._execute(f"COPY (SELECT {selection} FROM t) TO '{filepath}' (FORMAT PARQUET)")
    log(f'Dataset exported to {filepath}')
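

# Export usage sketch (illustrative; the output file names are assumptions).
# All exporters share the same column-selection logic via `_get_selection`:
#
#   dataset.to_json('out.jsonl', jsonl=True)
#   dataset.to_csv('out.csv')
#   dataset.to_parquet('out.parquet')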


def _escape_string_literal(string: str) -> str:
  string = string.replace("'", "''")
  return f"'{string}'"


def _escape_col_name(col_name: str) -> str:
  col_name = col_name.replace('"', '""')
  return f'"{col_name}"'


def _escape_like_value(value: str) -> str:
  value = value.replace('%', '\\%').replace('_', '\\_')
  return f"'%{value}%' ESCAPE '\\'"


def _inner_select(sub_paths: list[PathTuple],
                  inner_var: Optional[str] = None,
                  empty: bool = False,
                  span_from: Optional[PathTuple] = None) -> str:
  """Recursively generate the inner select statement for a list of sub paths."""
  current_sub_path = sub_paths[0]
  lambda_var = inner_var + 'x' if inner_var else 'x'
  if not inner_var:
    lambda_var = 'x'
    inner_var = _escape_col_name(current_sub_path[0])
    current_sub_path = current_sub_path[1:]
  # Select the path inside structs. E.g. x['a']['b']['c'] given current_sub_path = [a, b, c].
  path_key = inner_var + ''.join([f'[{_escape_string_literal(p)}]' for p in current_sub_path])
  if len(sub_paths) == 1:
    if span_from:
      derived_col = _select_sql(span_from, flatten=False, unnest=False)
      path_key = (f'{derived_col}[{path_key}.{VALUE_KEY}.{TEXT_SPAN_START_FEATURE}+1:'
                  f'{path_key}.{VALUE_KEY}.{TEXT_SPAN_END_FEATURE}]')
    return 'NULL' if empty else path_key
  return (f'list_transform({path_key}, {lambda_var} -> '
          f'{_inner_select(sub_paths[1:], lambda_var, empty, span_from)})')


def _split_path_into_subpaths_of_lists(leaf_path: PathTuple) -> list[PathTuple]:
  """Split a path into a subpath of lists.

  E.g. [a, b, c, *, d, *, *] gets split into [[a, b, c], [d], [], []].
  """
  sub_paths: list[PathTuple] = []
  offset = 0
  while offset <= len(leaf_path):
    new_offset = leaf_path.index(PATH_WILDCARD,
                                 offset) if PATH_WILDCARD in leaf_path[offset:] else len(leaf_path)
    sub_path = leaf_path[offset:new_offset]
    sub_paths.append(sub_path)
    offset = new_offset + 1
  return sub_paths


def _select_sql(path: PathTuple,
                flatten: bool,
                unnest: bool,
                empty: bool = False,
                span_from: Optional[PathTuple] = None) -> str:
  """Create a select column for a path.

  Args:
    path: A path to a feature. E.g. ['a', 'b', 'c'].
    flatten: Whether to flatten the result.
    unnest: Whether to unnest the result.
    empty: Whether to return an empty list (used for embedding signals that don't need the data).
    span_from: The path this span is derived from. If specified, the span will be resolved
      to a substring of the original string.
  """
  sub_paths = _split_path_into_subpaths_of_lists(path)
  selection = _inner_select(sub_paths, None, empty, span_from)
  # We only flatten when the result is a nested list, to avoid a segfault.
  is_result_nested_list = len(sub_paths) >= 3  # E.g. subPaths = [[a, b, c], *, *].
  if flatten and is_result_nested_list:
    selection = f'flatten({selection})'
  # We only unnest when the result is a list. E.g. subPaths = [[a, b, c], *].
  is_result_a_list = len(sub_paths) >= 2
  if unnest and is_result_a_list:
    selection = f'unnest({selection})'
  return selection
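

# Illustrative example (the path names are assumptions): for path
# ('docs', '*', 'text'), _split_path_into_subpaths_of_lists yields
# [('docs',), ('text',)], and _select_sql(..., flatten=False, unnest=False)
# produces:
#
#   list_transform("docs", x -> x['text'])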


def read_source_manifest(dataset_path: str) -> SourceManifest:
  """Read the manifest file."""
  with open_file(os.path.join(dataset_path, MANIFEST_FILENAME), 'r') as f:
    return SourceManifest.parse_raw(f.read())


def _signal_dir(enriched_path: PathTuple) -> str:
  """Get the filename prefix for a signal parquet file."""
  path_without_wildcards = (p for p in enriched_path if p != PATH_WILDCARD)
  return os.path.join(*path_without_wildcards)


def split_column_name(column: str, split_name: str) -> str:
  """Get the name of a split column."""
  return f'{column}.{split_name}'


def split_parquet_prefix(column_name: str, splitter_name: str) -> str:
  """Get the filename prefix for a split parquet file."""
  return f'{column_name}.{splitter_name}'


def _bytes_to_blob_literal(bytes: bytes) -> str:
  """Convert bytes to a blob literal."""
  escaped_hex = re.sub(r'(.{2})', r'\\x\1', bytes.hex())
  return f"'{escaped_hex}'::BLOB"


class SignalManifest(BaseModel):
  """The manifest that describes a signal computation including schema and parquet files."""
  # List of parquet filepaths storing the data. The paths are relative to the manifest.
  files: list[str]

  # An identifier for this parquet table. Will be used as the view name in SQL.
  parquet_id: str

  data_schema: Schema
  signal: Signal

  # The column path that this signal is derived from.
  enriched_path: PathTuple

  # The name of the vector store. Present when the signal is an embedding.
  vector_store: Optional[str] = None

  @validator('signal', pre=True)
  def parse_signal(cls, signal: dict) -> Signal:
    """Parse a signal to its specific subclass instance."""
    return resolve_signal(signal)


def _merge_cells(dest_cell: Item, source_cell: Item) -> Item:
  if source_cell is None or isinstance(source_cell, float) and math.isnan(source_cell):
    # Nothing to merge here (missing value).
    return dest_cell
  if isinstance(dest_cell, dict):
    if isinstance(source_cell, list):
      raise ValueError(f'Failed to merge cells. Destination is a dict ({dest_cell!r}), '
                       f'but source is a list ({source_cell!r}).')
    if isinstance(source_cell, dict):
      res = {**dest_cell}
      for key, value in source_cell.items():
        res[key] = (value if key not in dest_cell else _merge_cells(dest_cell[key], value))
      return res
    else:
      return {VALUE_KEY: source_cell, **dest_cell}
  elif isinstance(dest_cell, list):
    if not isinstance(source_cell, list):
      raise ValueError('Failed to merge cells. Destination is a list, but source is not.')
    return [
      _merge_cells(dest_subcell, source_subcell)
      for dest_subcell, source_subcell in zip(dest_cell, source_cell)
    ]
  else:
    # The destination is a primitive.
    if isinstance(source_cell, list):
      raise ValueError(f'Failed to merge cells. Destination is a primitive ({dest_cell!r}), '
                       f'but source is a list ({source_cell!r}).')
    if isinstance(source_cell, dict):
      return {VALUE_KEY: dest_cell, **source_cell}
    else:
      # Primitives can be merged together if they are equal. This can happen if a user selects a
      # column that is the child of another.
      # NOTE: This can be removed if we fix https://github.com/lilacai/lilac/issues/166.
      if source_cell != dest_cell:
        raise ValueError(f'Cannot merge source "{source_cell!r}" into destination "{dest_cell!r}"')
      return dest_cell


def merge_series(destination: pd.Series, source: pd.Series) -> list[Item]:
  """Merge two series of values recursively."""
  return _merge_cells(destination.tolist(), source.tolist())
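

# Illustrative merge (the cell values are assumptions): enriched cells are
# merged under VALUE_KEY so a primitive and its signal metadata coexist:
#
#   _merge_cells({'len': 5}, 'hello')  ->  {VALUE_KEY: 'hello', 'len': 5}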


def _unique_alias(column: Column) -> str:
  """Get a unique alias for a selection column."""
  if column.signal_udf:
    return make_parquet_id(column.signal_udf, column.path)
  return '.'.join(map(str, column.path))


def _path_contains(parent_path: PathTuple, child_path: PathTuple) -> bool:
  """Check if a path contains another path."""
  if len(parent_path) > len(child_path):
    return False
  return all(parent_path[i] == child_path[i] for i in range(len(parent_path)))


def _path_to_udf_duckdb_path(path: PathTuple,
                             path_to_udf_col_name: dict[PathTuple, str]) -> Optional[PathTuple]:
  first_subpath, *rest_of_path = path
  for parent_path, udf_col_name in path_to_udf_col_name.items():
    # If the user selected udf(document.*.text) as "udf" and wanted to sort by "udf.len", we need to
    # sort by "udf.*.len" where the "*" came from the fact that the udf was applied to a list of
    # "text" fields.
    wildcards = [x for x in parent_path if x == PATH_WILDCARD]
    if _path_contains(parent_path, path):
      return (udf_col_name, *wildcards, *path[len(parent_path):])
    elif first_subpath == udf_col_name:
      return (udf_col_name, *wildcards, *rest_of_path)

  return None


def _col_destination_path(column: Column, is_computed_signal: Optional[bool] = False) -> PathTuple:
  """Get the destination path where the output of this selection column will be stored."""
  source_path = column.path

  if not column.signal_udf:
    return source_path

  signal_key = column.signal_udf.key(is_computed_signal=is_computed_signal)
  # If we are enriching a value we should store the signal data in the value's parent.
  if source_path[-1] == VALUE_KEY:
    dest_path = (*source_path[:-1], signal_key)
  else:
    dest_path = (*source_path, signal_key)

  return dest_path


def _root_column(manifest: SignalManifest) -> str:
  """Returns the root column of a signal manifest."""
  field_keys = list(manifest.data_schema.fields.keys())
  if len(field_keys) > 2:
    raise ValueError('Expected at most two fields in signal manifest, '
                     f'the rowid and root this signal is enriching. Got {field_keys}.')
  return next(filter(lambda field: field != ROWID, manifest.data_schema.fields.keys()))


def _derived_from_path(path: PathTuple, schema: Schema) -> PathTuple:
  # Find the closest parent of `path` that is a signal root.
  for i in reversed(range(len(path))):
    sub_path = path[:i]
    if schema.get_field(sub_path).signal is not None:
      # Skip the signal name at the end to get the source path that was enriched.
      return sub_path[:-1]
  raise ValueError(f'Cannot find the source path for the enriched path: {path}')


def _make_schema_from_path(path: PathTuple, field: Field) -> Schema:
  """Returns a schema that contains only the given path."""
  for sub_path in reversed(path):
    if sub_path == PATH_WILDCARD:
      field = Field(repeated_field=field)
    else:
      field = Field(fields={sub_path: field})
  if not field.fields:
    raise ValueError(f'Invalid path: {path}. Must contain at least one field name.')
  return Schema(fields=field.fields)


def _replace_nan_with_none(df: pd.DataFrame) -> pd.DataFrame:
  """DuckDB returns np.nan for missing fields in string columns; replace them with None for correctness."""
  # TODO(https://github.com/duckdb/duckdb/issues/4066): Remove this once duckdb fixes upstream.
  for col in df.columns:
    if is_object_dtype(df[col]):
      df[col].replace(np.nan, None, inplace=True)
  return df


def _offset_any_span(offset: int, item: Item, schema: Field) -> None:
  """Offsets any spans inplace by the given parent offset."""
  if schema.dtype == DataType.STRING_SPAN:
    item = cast(dict, item)
    item[VALUE_KEY][TEXT_SPAN_START_FEATURE] += offset
    item[VALUE_KEY][TEXT_SPAN_END_FEATURE] += offset
  if schema.fields:
    item = cast(dict, item)
    for key, sub_schema in schema.fields.items():
      _offset_any_span(offset, item[key], sub_schema)
  if schema.repeated_field:
    item = cast(list, item)
    for sub_item in item:
      _offset_any_span(offset, sub_item, schema.repeated_field)


def _schema_has_spans(field: Field) -> bool:
  if field.dtype and field.dtype == DataType.STRING_SPAN:
    return True
  if field.fields:
    children_have_spans = any(_schema_has_spans(sub_field) for sub_field in field.fields.values())
    if children_have_spans:
      return True
  if field.repeated_field:
    return _schema_has_spans(field.repeated_field)
  return False


def _normalize_bins(bins: Optional[Union[Sequence[Bin], Sequence[float]]]) -> Optional[list[Bin]]:
  if bins is None:
    return None
  if not isinstance(bins[0], (float, int)):
    return cast(list[Bin], bins)
  named_bins: list[Bin] = []
  for i in range(len(bins) + 1):
    start = cast(float, bins[i - 1]) if i > 0 else None
    end = cast(float, bins[i]) if i < len(bins) else None
    named_bins.append((str(i), start, end))
  return named_bins


def _auto_bins(stats: StatsResult, num_bins: int) -> list[Bin]:
  min_val = cast(float, stats.min_val)
  max_val = cast(float, stats.max_val)
  bin_width = (max_val - min_val) / num_bins
  bins: list[Bin] = []
  for i in range(num_bins):
    start = None if i == 0 else min_val + i * bin_width
    end = None if i == num_bins - 1 else min_val + (i + 1) * bin_width
    bins.append((str(i), start, end))
  return bins
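

# Illustrative binning (the edge values are assumptions): explicit bin edges
# [0.0, 10.0] normalize to three labeled, half-open bins, with None meaning
# unbounded on that side:
#
#   _normalize_bins([0.0, 10.0])
#   # -> [('0', None, 0.0), ('1', 0.0, 10.0), ('2', 10.0, None)]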


def get_config_filepath(namespace: str, dataset_name: str) -> str:
  """Gets the config yaml filepath."""
  return os.path.join(get_dataset_output_dir(data_path(), namespace, dataset_name), CONFIG_FILENAME)
lilac/data/dataset_test_utils.py
ADDED
@@ -0,0 +1,153 @@
"""Test utils for dataset tests."""
import os
import pathlib
from copy import deepcopy
from datetime import datetime
from typing import Optional, Type, cast

import numpy as np
from typing_extensions import Protocol

from ..config import CONFIG_FILENAME, DatasetConfig
from ..embeddings.vector_store import VectorDBIndex
from ..schema import (
  MANIFEST_FILENAME,
  PARQUET_FILENAME_PREFIX,
  ROWID,
  VALUE_KEY,
  DataType,
  Field,
  Item,
  PathKey,
  Schema,
  SourceManifest,
)
from ..sources.source import Source
from ..utils import get_dataset_output_dir, open_file, to_yaml
from .dataset import Dataset, default_settings
from .dataset_utils import is_primitive, write_items_to_parquet

TEST_NAMESPACE = 'test_namespace'
TEST_DATASET_NAME = 'test_dataset'


def _infer_dtype(value: Item) -> DataType:
  if isinstance(value, str):
    return DataType.STRING
  elif isinstance(value, bool):
    return DataType.BOOLEAN
  elif isinstance(value, bytes):
    return DataType.BINARY
  elif isinstance(value, float):
    return DataType.FLOAT32
  elif isinstance(value, int):
    return DataType.INT32
  elif isinstance(value, datetime):
    return DataType.TIMESTAMP
  else:
    raise ValueError(f'Cannot infer dtype of primitive value: {value}')


def _infer_field(item: Item) -> Field:
  """Infer the field from a single item."""
  if isinstance(item, dict):
    fields: dict[str, Field] = {}
    for k, v in item.items():
      fields[k] = _infer_field(cast(Item, v))
    dtype = None
    if VALUE_KEY in fields:
      dtype = fields[VALUE_KEY].dtype
      del fields[VALUE_KEY]
    return Field(fields=fields, dtype=dtype)
  elif is_primitive(item):
    return Field(dtype=_infer_dtype(item))
  elif isinstance(item, list):
    return Field(repeated_field=_infer_field(item[0]))
  else:
    raise ValueError(f'Cannot infer schema of item: {item}')


def _infer_schema(items: list[Item]) -> Schema:
  """Infer the schema from the items."""
  schema = Schema(fields={})
  for item in items:
    field = _infer_field(item)
    if not field.fields:
      raise ValueError(f'Invalid schema of item. Expected an object, but got: {item}')
    schema.fields = {**schema.fields, **field.fields}
  return schema
|
79 |
+
|
80 |
+
|
81 |
+
class TestDataMaker(Protocol):
|
82 |
+
"""A function that creates a test dataset."""
|
83 |
+
|
84 |
+
def __call__(self, items: list[Item], schema: Optional[Schema] = None) -> Dataset:
|
85 |
+
"""Create a test dataset."""
|
86 |
+
...
|
87 |
+
|
88 |
+
|
89 |
+
class TestSource(Source):
|
90 |
+
"""Test source that does nothing."""
|
91 |
+
name = 'test_source'
|
92 |
+
|
93 |
+
|
94 |
+
def make_dataset(dataset_cls: Type[Dataset],
|
95 |
+
tmp_path: pathlib.Path,
|
96 |
+
items: list[Item],
|
97 |
+
schema: Optional[Schema] = None) -> Dataset:
|
98 |
+
"""Create a test dataset."""
|
99 |
+
schema = schema or _infer_schema(items)
|
100 |
+
_write_items(tmp_path, TEST_DATASET_NAME, items, schema)
|
101 |
+
dataset = dataset_cls(TEST_NAMESPACE, TEST_DATASET_NAME)
|
102 |
+
|
103 |
+
config = DatasetConfig(
|
104 |
+
namespace=TEST_NAMESPACE,
|
105 |
+
name=TEST_DATASET_NAME,
|
106 |
+
source=TestSource(),
|
107 |
+
settings=default_settings(dataset))
|
108 |
+
config_filepath = os.path.join(
|
109 |
+
get_dataset_output_dir(str(tmp_path), TEST_NAMESPACE, TEST_DATASET_NAME), CONFIG_FILENAME)
|
110 |
+
with open_file(config_filepath, 'w') as f:
|
111 |
+
f.write(to_yaml(config.dict(exclude_defaults=True, exclude_none=True, exclude_unset=True)))
|
112 |
+
|
113 |
+
return dataset
|
114 |
+
|
115 |
+
|
116 |
+
def _write_items(tmpdir: pathlib.Path, dataset_name: str, items: list[Item],
|
117 |
+
schema: Schema) -> None:
|
118 |
+
"""Write the items JSON to the dataset format: manifest.json and parquet files."""
|
119 |
+
source_dir = get_dataset_output_dir(str(tmpdir), TEST_NAMESPACE, dataset_name)
|
120 |
+
os.makedirs(source_dir)
|
121 |
+
|
122 |
+
# Add rowids to the items.
|
123 |
+
items = [deepcopy(item) for item in items]
|
124 |
+
for i, item in enumerate(items):
|
125 |
+
item[ROWID] = str(i + 1)
|
126 |
+
|
127 |
+
simple_parquet_files, _ = write_items_to_parquet(
|
128 |
+
items, source_dir, schema, filename_prefix=PARQUET_FILENAME_PREFIX, shard_index=0, num_shards=1)
|
129 |
+
manifest = SourceManifest(files=[simple_parquet_files], data_schema=schema)
|
130 |
+
with open_file(os.path.join(source_dir, MANIFEST_FILENAME), 'w') as f:
|
131 |
+
f.write(manifest.json(indent=2, exclude_none=True))
|
132 |
+
|
133 |
+
|
134 |
+
def enriched_item(value: Optional[Item] = None, metadata: dict[str, Item] = {}) -> Item:
|
135 |
+
"""Wrap a value in a dict with the value key."""
|
136 |
+
return {VALUE_KEY: value, **metadata}
|
137 |
+
|
138 |
+
|
139 |
+
def make_vector_index(vector_store: str, vector_dict: dict[PathKey,
|
140 |
+
list[list[float]]]) -> VectorDBIndex:
|
141 |
+
"""Make a vector index from a dictionary of vector keys to vectors."""
|
142 |
+
embeddings: list[np.ndarray] = []
|
143 |
+
spans: list[tuple[PathKey, list[tuple[int, int]]]] = []
|
144 |
+
for path_key, vectors in vector_dict.items():
|
145 |
+
vector_spans: list[tuple[int, int]] = []
|
146 |
+
for i, vector in enumerate(vectors):
|
147 |
+
embeddings.append(np.array(vector))
|
148 |
+
vector_spans.append((0, 0))
|
149 |
+
spans.append((path_key, vector_spans))
|
150 |
+
|
151 |
+
vector_index = VectorDBIndex(vector_store)
|
152 |
+
vector_index.add(spans, np.array(embeddings))
|
153 |
+
return vector_index
|
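A minimal sketch of how a test might use `make_vector_index`; the store name 'numpy' is an assumption based on `NumpyVectorStore` being registered by `register_default_vector_stores` (see `lilac/embeddings/default_vector_stores.py` below):

# Hypothetical test usage; assumes register_default_vector_stores() has run so
# that the 'numpy' store name resolves in the vector store registry.
index = make_vector_index(
  'numpy',
  {
    ('row1', 'text'): [[1.0, 0.0]],              # one span vector for this path key
    ('row2', 'text'): [[0.0, 1.0], [1.0, 1.0]],  # two span vectors
  })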
lilac/data/dataset_utils.py
ADDED
@@ -0,0 +1,313 @@
"""Utilities for working with datasets."""

import gc
import json
import math
import os
import pprint
import secrets
from collections.abc import Iterable
from typing import Any, Callable, Iterator, Optional, Sequence, TypeVar, Union, cast

import numpy as np
import pyarrow as pa

from ..batch_utils import deep_flatten
from ..embeddings.vector_store import VectorDBIndex
from ..env import env
from ..parquet_writer import ParquetWriter
from ..schema import (
  EMBEDDING_KEY,
  PATH_WILDCARD,
  ROWID,
  TEXT_SPAN_END_FEATURE,
  TEXT_SPAN_START_FEATURE,
  VALUE_KEY,
  DataType,
  Field,
  Item,
  PathKey,
  PathTuple,
  Schema,
  VectorKey,
  field,
  schema,
  schema_to_arrow_schema,
)
from ..signal import Signal
from ..utils import is_primitive, log, open_file


def _replace_embeddings_with_none(input: Union[Item, Item]) -> Union[Item, Item]:
  if isinstance(input, np.ndarray):
    return None
  if isinstance(input, dict):
    return {k: _replace_embeddings_with_none(v) for k, v in input.items()}
  if isinstance(input, list):
    return [_replace_embeddings_with_none(v) for v in input]

  return input


def replace_embeddings_with_none(input: Union[Item, Item]) -> Item:
  """Replaces all embeddings with None."""
  return cast(Item, _replace_embeddings_with_none(input))


def count_primitives(input: Union[Iterable, Iterator]) -> int:
  """Iterate through each element of the input, flattening each one, computing a count.

  Sum the final set of counts. This is the important iterable not to exhaust.
  """
  return sum((len(list(deep_flatten(i))) for i in input))


def _wrap_value_in_dict(input: Union[object, dict], props: PathTuple) -> Union[object, dict]:
  # If the signal produced no value, or nan, we should return None so the parquet value is sparse.
  if isinstance(input, float) and math.isnan(input):
    input = None
  for prop in reversed(props):
    input = {prop: input}
  return input


def _wrap_in_dicts(input: Union[object, Iterable[object]],
                   spec: list[PathTuple]) -> Union[object, Iterable[object]]:
  """Wraps an object or iterable in a dict according to the spec."""
  props = spec[0] if spec else tuple()
  if len(spec) == 1:
    return _wrap_value_in_dict(input, props)
  if input is None or isinstance(input, float) and math.isnan(input):
    # Return empty dict for missing inputs.
    return {}
  res = [_wrap_in_dicts(elem, spec[1:]) for elem in cast(Iterable, input)]
  return _wrap_value_in_dict(res, props)


def wrap_in_dicts(input: Iterable[object], spec: list[PathTuple]) -> Iterable[object]:
  """Wraps an object or iterable in a dict according to the spec."""
  return [_wrap_in_dicts(elem, spec) for elem in input]


def _merge_field_into(schema: Field, destination: Field) -> None:
  if isinstance(schema, Field):
    destination.signal = destination.signal or schema.signal
    destination.dtype = destination.dtype or schema.dtype
  if schema.fields:
    destination.fields = destination.fields or {}
    for field_name, subfield in schema.fields.items():
      if field_name not in destination.fields:
        destination.fields[field_name] = subfield.copy(deep=True)
      else:
        _merge_field_into(subfield, destination.fields[field_name])
  elif schema.repeated_field:
    if not destination.repeated_field:
      raise ValueError('Failed to merge schemas. Origin schema is repeated, but destination is not')
    _merge_field_into(schema.repeated_field, destination.repeated_field)
  else:
    if destination.dtype != schema.dtype:
      raise ValueError(f'Failed to merge schemas. Origin schema has dtype {schema.dtype}, '
                       f'but destination has dtype {destination.dtype}')


def merge_schemas(schemas: Sequence[Union[Schema, Field]]) -> Schema:
  """Merge a list of schemas."""
  merged_schema = Schema(fields={})
  for s in schemas:
    _merge_field_into(cast(Field, s), cast(Field, merged_schema))
  return merged_schema


def schema_contains_path(schema: Schema, path: PathTuple) -> bool:
  """Check if a schema contains a path."""
  current_field = cast(Field, schema)
  for path_part in path:
    if path_part == PATH_WILDCARD:
      if current_field.repeated_field is None:
        return False
      current_field = current_field.repeated_field
    else:
      if current_field.fields is None or path_part not in current_field.fields:
        return False
      current_field = current_field.fields[str(path_part)]
  return True


def create_signal_schema(signal: Signal, source_path: PathTuple, current_schema: Schema) -> Schema:
  """Create a schema describing the enriched fields added by an enrichment."""
  leafs = current_schema.leafs
  # Validate that the enrich fields are actually a valid leaf path.
  if source_path not in leafs:
    raise ValueError(f'"{source_path}" is not a valid leaf path. Leaf paths: {leafs.keys()}')

  signal_schema = signal.fields()
  signal_schema.signal = signal.dict()

  enriched_schema = field(fields={signal.key(is_computed_signal=True): signal_schema})

  for path_part in reversed(source_path):
    if path_part == PATH_WILDCARD:
      enriched_schema = Field(repeated_field=enriched_schema)
    else:
      enriched_schema = Field(fields={path_part: enriched_schema})

  if not enriched_schema.fields:
    raise ValueError('This should not happen since enriched_schema always has fields (see above)')

  return schema(enriched_schema.fields.copy())


def write_embeddings_to_disk(vector_store: str, rowids: Iterable[str], signal_items: Iterable[Item],
                             output_dir: str) -> None:
  """Write a set of embeddings to disk."""

  def embedding_predicate(input: Any) -> bool:
    return (isinstance(input, list) and len(input) > 0 and isinstance(input[0], dict) and
            EMBEDDING_KEY in input[0])

  path_keys = flatten_keys(rowids, signal_items, is_primitive_predicate=embedding_predicate)
  all_embeddings = cast(Iterable[Item],
                        deep_flatten(signal_items, is_primitive_predicate=embedding_predicate))

  embedding_vectors: list[np.ndarray] = []
  all_spans: list[tuple[PathKey, list[tuple[int, int]]]] = []
  for path_key, embeddings in zip(path_keys, all_embeddings):
    if not path_key or not embeddings:
      # Sparse embeddings may not have an embedding for every key.
      continue

    spans: list[tuple[int, int]] = []
    for e in embeddings:
      span = e[VALUE_KEY]
      vector = e[EMBEDDING_KEY]
      # We squeeze here because embedding functions can return outer dimensions of 1.
      embedding_vectors.append(vector.reshape(-1))
      spans.append((span[TEXT_SPAN_START_FEATURE], span[TEXT_SPAN_END_FEATURE]))
    all_spans.append((path_key, spans))
  embedding_matrix = np.array(embedding_vectors, dtype=np.float32)
  del path_keys, all_embeddings, embedding_vectors
  gc.collect()

  # Write to disk.
  vector_index = VectorDBIndex(vector_store)
  vector_index.add(all_spans, embedding_matrix)
  vector_index.save(output_dir)

  del vector_index
  gc.collect()


def write_items_to_parquet(items: Iterable[Item], output_dir: str, schema: Schema,
                           filename_prefix: str, shard_index: int,
                           num_shards: int) -> tuple[str, int]:
  """Write a set of items to a parquet file, in columnar format."""
  schema = schema.copy(deep=True)
  # Add a rowid column.
  schema.fields[ROWID] = Field(dtype=DataType.STRING)

  arrow_schema = schema_to_arrow_schema(schema)
  out_filename = parquet_filename(filename_prefix, shard_index, num_shards)
  filepath = os.path.join(output_dir, out_filename)
  f = open_file(filepath, mode='wb')
  writer = ParquetWriter(schema)
  writer.open(f)
  debug = env('DEBUG', False)
  num_items = 0
  for item in items:
    # Add a rowid column.
    if ROWID not in item:
      item[ROWID] = secrets.token_urlsafe(nbytes=12)  # 16 base64 characters.
    if debug:
      try:
        _validate(item, arrow_schema)
      except Exception as e:
        raise ValueError(f'Error validating item: {json.dumps(item)}') from e
    writer.write(item)
    num_items += 1
  writer.close()
  f.close()
  return out_filename, num_items


def _validate(item: Item, schema: pa.Schema) -> None:
  # Try to parse the item using the inferred schema.
  try:
    pa.RecordBatch.from_pylist([item], schema=schema)
  except pa.ArrowTypeError:
    log('Failed to parse arrow item using the arrow schema.')
    log('Item:')
    log(pprint.pformat(item, indent=2))
    log('Arrow schema:')
    log(schema)
    raise  # Re-raise the same exception, same stacktrace.


def parquet_filename(prefix: str, shard_index: int, num_shards: int) -> str:
  """Return the filename for a parquet file."""
  return f'{prefix}-{shard_index:05d}-of-{num_shards:05d}.parquet'


def _flatten_keys(rowid: str, nested_input: Iterable, location: list[int],
                  is_primitive_predicate: Callable[[object], bool]) -> Iterator[VectorKey]:
  if is_primitive_predicate(nested_input) or is_primitive(nested_input) or isinstance(
      nested_input, dict):
    yield (rowid, *location)
    return

  for i, input in enumerate(nested_input):
    yield from _flatten_keys(rowid, input, [*location, i], is_primitive_predicate)


def flatten_keys(
    rowids: Iterable[str],
    nested_input: Iterable,
    is_primitive_predicate: Callable[[object],
                                     bool] = is_primitive) -> Iterator[Optional[VectorKey]]:
  """Flatten the rowids of a nested input."""
  for rowid, input in zip(rowids, nested_input):
    if input is None:
      yield None
      continue
    yield from _flatten_keys(rowid, input, [], is_primitive_predicate)


Tin = TypeVar('Tin')
Tout = TypeVar('Tout')


def sparse_to_dense_compute(
    sparse_input: Iterator[Optional[Tin]],
    func: Callable[[Iterable[Tin]], Iterable[Tout]]) -> Iterator[Optional[Tout]]:
  """Densifies the input before calling the provided `func` and sparsifies the output."""
  locations: list[int] = []
  total_size: int = 0

  def densify(x: Iterator[Optional[Tin]]) -> Iterator[Tin]:
    nonlocal locations, total_size
    for i, value in enumerate(x):
      total_size += 1
      if value is not None:
        locations.append(i)
        yield value

  dense_input = densify(sparse_input)
  dense_output = iter(func(dense_input))
  index = 0

  location_index = 0

  while True:
    try:
      out = next(dense_output)
      out_index = locations[location_index]
      while index < out_index:
        yield None
        index += 1
      yield out
      location_index += 1
      index += 1
    except StopIteration:
      while index < total_size:
        yield None
        index += 1
      return
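`sparse_to_dense_compute` is what lets expensive functions skip missing values and still line up with the original rows. A minimal sketch of its behavior (module path assumed from this file's location in the diff):

from lilac.data.dataset_utils import sparse_to_dense_compute

def double_all(xs):
  # Runs only over the dense (non-None) values.
  return [x * 2 for x in xs]

dense = sparse_to_dense_compute(iter([1, None, 3, None]), double_all)
print(list(dense))  # [2, None, 6, None]: the Nones are restored in place.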
lilac/data_loader.py
ADDED
@@ -0,0 +1,110 @@
"""A data loader standalone binary. This should only be run as a script to load a dataset.

To run the source loader as a binary directly:

poetry run python -m lilac.data_loader \
  --dataset_name=movies_dataset \
  --output_dir=./data/ \
  --config_path=./datasets/the_movies_dataset.json
"""
import os
import pathlib
import uuid
from typing import Iterable, Optional, Union

import pandas as pd

from .config import CONFIG_FILENAME, DatasetConfig
from .data.dataset import Dataset, default_settings
from .data.dataset_utils import write_items_to_parquet
from .db_manager import get_dataset
from .env import data_path
from .schema import (
  MANIFEST_FILENAME,
  PARQUET_FILENAME_PREFIX,
  ROWID,
  Field,
  Item,
  Schema,
  SourceManifest,
  is_float,
)
from .tasks import TaskStepId, progress
from .utils import get_dataset_output_dir, log, open_file, to_yaml


def create_dataset(config: DatasetConfig) -> Dataset:
  """Load a dataset from a given source configuration."""
  process_source(data_path(), config)
  return get_dataset(config.namespace, config.name)


def process_source(base_dir: Union[str, pathlib.Path],
                   config: DatasetConfig,
                   task_step_id: Optional[TaskStepId] = None) -> tuple[str, int]:
  """Process a source."""
  output_dir = get_dataset_output_dir(base_dir, config.namespace, config.name)

  config.source.setup()
  source_schema = config.source.source_schema()
  items = config.source.process()

  # Add rowids and fix NaN in string columns.
  items = normalize_items(items, source_schema.fields)

  # Add progress.
  items = progress(
    items,
    task_step_id=task_step_id,
    estimated_len=source_schema.num_items,
    step_description=f'Reading from source {config.source.name}...')

  # Filter out the `None`s after progress.
  items = (item for item in items if item is not None)

  data_schema = Schema(fields=source_schema.fields.copy())
  filepath, num_items = write_items_to_parquet(
    items=items,
    output_dir=output_dir,
    schema=data_schema,
    filename_prefix=PARQUET_FILENAME_PREFIX,
    shard_index=0,
    num_shards=1)

  filenames = [os.path.basename(filepath)]
  manifest = SourceManifest(files=filenames, data_schema=data_schema, images=None)
  with open_file(os.path.join(output_dir, MANIFEST_FILENAME), 'w') as f:
    f.write(manifest.json(indent=2, exclude_none=True))

  if not config.settings:
    dataset = get_dataset(config.namespace, config.name)
    config.settings = default_settings(dataset)
  with open_file(os.path.join(output_dir, CONFIG_FILENAME), 'w') as f:
    f.write(to_yaml(config.dict(exclude_defaults=True, exclude_none=True)))

  log(f'Dataset "{config.name}" written to {output_dir}')

  return output_dir, num_items


def normalize_items(items: Iterable[Item], fields: dict[str, Field]) -> Iterable[Item]:
  """Sanitize items by removing NaNs and NaTs."""
  replace_nan_fields = [
    field_name for field_name, field in fields.items() if field.dtype and not is_float(field.dtype)
  ]
  for item in items:
    if item is None:
      yield item
      continue

    # Add rowid if it doesn't exist.
    if ROWID not in item:
      item[ROWID] = uuid.uuid4().hex

    # Fix NaN values.
    for field_name in replace_nan_fields:
      item_value = item.get(field_name)
      if item_value and pd.isna(item_value):
        item[field_name] = None

    yield item
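A minimal sketch of `normalize_items` on a single item; the `Field`/`DataType` usage mirrors how `dataset_test_utils.py` constructs fields, and the import paths are assumed from the file locations in this diff:

import numpy as np
from lilac.data_loader import normalize_items
from lilac.schema import ROWID, DataType, Field

fields = {'title': Field(dtype=DataType.STRING)}
out = list(normalize_items(iter([{'title': np.nan}]), fields))
assert out[0]['title'] is None  # NaN in a non-float column is nulled out.
assert ROWID in out[0]          # A rowid is added when missing.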
lilac/db_manager.py
ADDED
@@ -0,0 +1,96 @@
"""Manages mapping the dataset name to the database instance."""
import os
import pathlib
import threading
from typing import Optional, Type, Union

import yaml
from pydantic import BaseModel

from .config import DatasetConfig
from .data.dataset import Dataset
from .data.dataset_duckdb import get_config_filepath
from .utils import get_datasets_dir

_DEFAULT_DATASET_CLS: Type[Dataset]

_CACHED_DATASETS: dict[str, Dataset] = {}

_db_lock = threading.Lock()


def get_dataset(namespace: str, dataset_name: str) -> Dataset:
  """Get the dataset instance."""
  if not _DEFAULT_DATASET_CLS:
    raise ValueError('Default dataset class not set.')
  cache_key = f'{namespace}/{dataset_name}'
  # https://docs.pytest.org/en/latest/example/simple.html#pytest-current-test-environment-variable
  inside_test = 'PYTEST_CURRENT_TEST' in os.environ
  with _db_lock:
    if cache_key not in _CACHED_DATASETS or inside_test:
      _CACHED_DATASETS[cache_key] = _DEFAULT_DATASET_CLS(
        namespace=namespace, dataset_name=dataset_name)
    return _CACHED_DATASETS[cache_key]


def remove_dataset_from_cache(namespace: str, dataset_name: str) -> None:
  """Remove the dataset from the db manager cache."""
  cache_key = f'{namespace}/{dataset_name}'
  with _db_lock:
    if cache_key in _CACHED_DATASETS:
      del _CACHED_DATASETS[cache_key]


class DatasetInfo(BaseModel):
  """Information about a dataset."""
  namespace: str
  dataset_name: str
  description: Optional[str] = None
  tags: list[str] = []


def list_datasets(base_dir: Union[str, pathlib.Path]) -> list[DatasetInfo]:
  """List the datasets in a data directory."""
  datasets_path = get_datasets_dir(base_dir)

  # Skip if 'datasets' doesn't exist.
  if not os.path.isdir(datasets_path):
    return []

  dataset_infos: list[DatasetInfo] = []
  for namespace in os.listdir(datasets_path):
    dataset_dir = os.path.join(datasets_path, namespace)
    # Skip if namespace is not a directory.
    if not os.path.isdir(dataset_dir):
      continue
    if namespace.startswith('.'):
      continue

    for dataset_name in os.listdir(dataset_dir):
      # Skip if dataset_name is not a directory.
      dataset_path = os.path.join(dataset_dir, dataset_name)
      if not os.path.isdir(dataset_path):
        continue
      if dataset_name.startswith('.'):
        continue

      # Open the config file to read the tags. We avoid instantiating a dataset for now to reduce
      # the overhead of listing datasets.
      config_filepath = get_config_filepath(namespace, dataset_name)
      tags = []
      if os.path.exists(config_filepath):
        with open(config_filepath) as f:
          config = DatasetConfig(**yaml.safe_load(f))
          tags = config.tags

      dataset_infos.append(DatasetInfo(namespace=namespace, dataset_name=dataset_name, tags=tags))

  return dataset_infos


# TODO(nsthorat): Make this a registry once we have multiple dataset implementations. This breaks a
# circular dependency.
def set_default_dataset_cls(dataset_cls: Type[Dataset]) -> None:
  """Set the default dataset class."""
  global _DEFAULT_DATASET_CLS
  _DEFAULT_DATASET_CLS = dataset_cls
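A minimal sketch of the intended wiring: a concrete `Dataset` implementation is registered once at startup, after which lookups are served from the cache. `DatasetDuckDB` is assumed to be the implementation defined in `lilac/data/dataset_duckdb.py` above:

from lilac.data.dataset_duckdb import DatasetDuckDB  # assumed default implementation
from lilac.db_manager import get_dataset, set_default_dataset_cls

set_default_dataset_cls(DatasetDuckDB)
ds = get_dataset('local', 'movies')           # Instantiates and caches the dataset.
assert ds is get_dataset('local', 'movies')   # Subsequent lookups hit the cache.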
lilac/embeddings/__init__.py
ADDED
@@ -0,0 +1,7 @@
"""Embeddings compute a vector for a chunk of a document."""

from .embedding import compute_split_embeddings

__all__ = [
  'compute_split_embeddings',
]
lilac/embeddings/cohere.py
ADDED
@@ -0,0 +1,59 @@
"""Cohere embeddings."""
from typing import TYPE_CHECKING, Iterable, cast

import numpy as np
from typing_extensions import override

from ..env import env
from ..schema import Item, RichData
from ..signal import TextEmbeddingSignal
from ..splitters.chunk_splitter import split_text
from .embedding import compute_split_embeddings

if TYPE_CHECKING:
  from cohere import Client

NUM_PARALLEL_REQUESTS = 10
COHERE_BATCH_SIZE = 96


class Cohere(TextEmbeddingSignal):
  """Computes embeddings using Cohere's embedding API.

  <br>**Important**: This will send data to an external server!

  <br>To use this signal, you must get a Cohere API key from
  [cohere.com/embed](https://cohere.com/embed) and add it to your .env.local.

  <br>For details on pricing, see: https://cohere.com/pricing.
  """

  name = 'cohere'
  display_name = 'Cohere Embeddings'

  _model: 'Client'

  @override
  def setup(self) -> None:
    """Validate that the api key and python package exist in the environment."""
    api_key = env('COHERE_API_KEY')
    if not api_key:
      raise ValueError('`COHERE_API_KEY` environment variable not set.')
    try:
      import cohere
      self._model = cohere.Client(api_key, max_retries=10)
    except ImportError:
      raise ImportError('Could not import the "cohere" python package. '
                        'Please install it with `pip install cohere`.')

  @override
  def compute(self, docs: Iterable[RichData]) -> Iterable[Item]:
    """Compute embeddings for the given documents."""

    def embed_fn(texts: list[str]) -> list[np.ndarray]:
      return self._model.embed(texts, truncate='END').embeddings

    docs = cast(Iterable[str], docs)
    split_fn = split_text if self._split else None
    yield from compute_split_embeddings(
      docs, COHERE_BATCH_SIZE, embed_fn, split_fn, num_parallel_requests=NUM_PARALLEL_REQUESTS)
lilac/embeddings/default_vector_stores.py
ADDED
@@ -0,0 +1,10 @@
"""Registers all vector stores."""
from .vector_store import register_vector_store
from .vector_store_hnsw import HNSWVectorStore
from .vector_store_numpy import NumpyVectorStore


def register_default_vector_stores() -> None:
  """Register all the default vector stores."""
  register_vector_store(HNSWVectorStore)
  register_vector_store(NumpyVectorStore)
lilac/embeddings/embedding.py
ADDED
@@ -0,0 +1,110 @@
"""Embedding registry."""
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Generator, Iterable, Iterator, Optional, Union, cast

import numpy as np
from pydantic import StrictStr
from sklearn.preprocessing import normalize

from ..schema import (
  EMBEDDING_KEY,
  TEXT_SPAN_END_FEATURE,
  TEXT_SPAN_START_FEATURE,
  VALUE_KEY,
  Item,
  RichData,
  SpanVector,
  lilac_embedding,
)
from ..signal import TextEmbeddingSignal, get_signal_by_type
from ..splitters.chunk_splitter import TextChunk
from ..utils import chunks

EmbeddingId = Union[StrictStr, TextEmbeddingSignal]

EmbedFn = Callable[[Iterable[RichData]], Iterator[list[SpanVector]]]


def get_embed_fn(embedding_name: str, split: bool) -> EmbedFn:
  """Return a function that returns the embedding matrix for the given embedding signal."""
  embedding_cls = get_signal_by_type(embedding_name, TextEmbeddingSignal)
  embedding = embedding_cls(split=split)
  embedding.setup()

  def _embed_fn(data: Iterable[RichData]) -> Iterator[list[SpanVector]]:
    items = embedding.compute(data)

    for item in items:
      if not item:
        raise ValueError('Embedding signal returned None.')

      yield [{
        'vector': item_val[EMBEDDING_KEY].reshape(-1),
        'span':
          (item_val[VALUE_KEY][TEXT_SPAN_START_FEATURE], item_val[VALUE_KEY][TEXT_SPAN_END_FEATURE])
      } for item_val in item]

  return _embed_fn


def compute_split_embeddings(docs: Iterable[str],
                             batch_size: int,
                             embed_fn: Callable[[list[str]], list[np.ndarray]],
                             split_fn: Optional[Callable[[str], list[TextChunk]]] = None,
                             num_parallel_requests: int = 1) -> Generator[Item, None, None]:
  """Compute text embeddings in batches of chunks, using the provided splitter and embedding fn."""
  pool = ThreadPoolExecutor()

  def _splitter(doc: str) -> list[TextChunk]:
    if not doc:
      return []
    if split_fn:
      return split_fn(doc)
    else:
      # Return a single chunk that spans the entire document.
      return [(doc, (0, len(doc)))]

  num_docs = 0

  def _flat_split_batch_docs(docs: Iterable[str]) -> Generator[tuple[int, TextChunk], None, None]:
    """Split a batch of documents into chunks and yield them."""
    nonlocal num_docs
    for i, doc in enumerate(docs):
      num_docs += 1
      chunks = _splitter(doc)
      for chunk in chunks:
        yield (i, chunk)

  doc_chunks = _flat_split_batch_docs(docs)
  items_to_yield: Optional[list[Item]] = None
  current_index = 0

  mega_batch_size = batch_size * num_parallel_requests

  for batch in chunks(doc_chunks, mega_batch_size):
    texts = [text for _, (text, _) in batch]
    embeddings: list[np.ndarray] = []

    for x in list(pool.map(lambda x: embed_fn(x), chunks(texts, batch_size))):
      embeddings.extend(x)
    matrix = cast(np.ndarray, normalize(np.array(embeddings, dtype=np.float32)))
    # np.split returns a shallow copy of each embedding so we don't increase the mem footprint.
    embeddings_batch = cast(list[np.ndarray], np.split(matrix, matrix.shape[0]))
    for (index, (_, (start, end))), embedding in zip(batch, embeddings_batch):
      embedding = embedding.reshape(-1)
      if index == current_index:
        if items_to_yield is None:
          items_to_yield = []
        items_to_yield.append(lilac_embedding(start, end, embedding))
      else:
        yield items_to_yield
        current_index += 1
        while current_index < index:
          yield None
          current_index += 1
        items_to_yield = [lilac_embedding(start, end, embedding)]

  while current_index < num_docs:
    yield items_to_yield
    items_to_yield = None
    current_index += 1
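A minimal sketch of `compute_split_embeddings` with a toy embedding function; with no `split_fn`, each document becomes a single chunk spanning its full length (`toy_embed` is a hypothetical stand-in for a real embedding API):

import numpy as np
from lilac.embeddings.embedding import compute_split_embeddings

def toy_embed(texts: list[str]) -> list[np.ndarray]:
  # Deterministic placeholder vectors, one per input text (illustrative only).
  return [np.full(4, float(len(t))) for t in texts]

for item in compute_split_embeddings(['hello world', 'foo'], batch_size=2, embed_fn=toy_embed):
  print(item)  # A list with one lilac_embedding item per chunk (here, one per doc).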
lilac/embeddings/gte.py
ADDED
@@ -0,0 +1,63 @@
"""General Text Embeddings (GTE) model. Open-source model, designed to run on device."""
from typing import Iterable, cast

from typing_extensions import override

from ..schema import Item, RichData
from ..signal import TextEmbeddingSignal
from ..splitters.chunk_splitter import split_text
from .embedding import compute_split_embeddings
from .transformer_utils import get_model

# See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
GTE_SMALL = 'thenlper/gte-small'
GTE_BASE = 'thenlper/gte-base'

# Maps a tuple of model name and device to the optimal batch size, found empirically.
_OPTIMAL_BATCH_SIZES: dict[str, dict[str, int]] = {
  GTE_SMALL: {
    '': 64,  # Default batch size.
    'mps': 256,
  },
  GTE_BASE: {
    '': 64,  # Default batch size.
    'mps': 128,
  }
}


class GTESmall(TextEmbeddingSignal):
  """Computes General Text Embeddings (GTE).

  <br>This embedding runs on-device. See the [model card](https://huggingface.co/thenlper/gte-small)
  for details.
  """

  name = 'gte-small'
  display_name = 'General Text Embeddings (small)'

  _model_name = GTE_SMALL

  @override
  def compute(self, docs: Iterable[RichData]) -> Iterable[Item]:
    """Call the embedding function."""
    batch_size, model = get_model(self._model_name, _OPTIMAL_BATCH_SIZES[self._model_name])
    embed_fn = model.encode
    split_fn = split_text if self._split else None
    docs = cast(Iterable[str], docs)
    yield from compute_split_embeddings(docs, batch_size, embed_fn=embed_fn, split_fn=split_fn)


class GTEBase(GTESmall):
  """Computes General Text Embeddings (GTE).

  <br>This embedding runs on-device. See the [model card](https://huggingface.co/thenlper/gte-base)
  for details.
  """
  name = 'gte-base'
  display_name = 'General Text Embeddings (base)'

  _model_name = GTE_BASE
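A minimal sketch of invoking the signal directly; the first call is assumed to download the model through sentence-transformers into the data path's `.cache` folder (see `transformer_utils.py` below):

from lilac.embeddings.gte import GTESmall

signal = GTESmall()  # Split behavior is inherited from TextEmbeddingSignal defaults.
items = list(signal.compute(['hello world']))  # One list of span embeddings per doc.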
lilac/embeddings/openai.py
ADDED
@@ -0,0 +1,68 @@
"""OpenAI embeddings."""
from typing import TYPE_CHECKING, Any, Iterable, cast

import numpy as np
from tenacity import retry, stop_after_attempt, wait_random_exponential
from typing_extensions import override

from ..env import env
from ..schema import Item, RichData
from ..signal import TextEmbeddingSignal
from ..splitters.chunk_splitter import split_text
from .embedding import compute_split_embeddings

if TYPE_CHECKING:
  import openai

NUM_PARALLEL_REQUESTS = 10
OPENAI_BATCH_SIZE = 128
EMBEDDING_MODEL = 'text-embedding-ada-002'


class OpenAI(TextEmbeddingSignal):
  """Computes embeddings using OpenAI's embedding API.

  <br>**Important**: This will send data to an external server!

  <br>To use this signal, you must get an OpenAI API key from
  [platform.openai.com](https://platform.openai.com/) and add it to your .env.local.

  <br>For details on pricing, see: https://openai.com/pricing.
  """

  name = 'openai'
  display_name = 'OpenAI Embeddings'

  _model: type['openai.Embedding']

  @override
  def setup(self) -> None:
    api_key = env('OPENAI_API_KEY')
    if not api_key:
      raise ValueError('`OPENAI_API_KEY` environment variable not set.')
    try:
      import openai
      openai.api_key = api_key
      self._model = openai.Embedding
    except ImportError:
      raise ImportError('Could not import the "openai" python package. '
                        'Please install it with `pip install openai`.')

  @override
  def compute(self, docs: Iterable[RichData]) -> Iterable[Item]:
    """Compute embeddings for the given documents."""

    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
    def embed_fn(texts: list[str]) -> list[np.ndarray]:
      # Replace newlines, which can negatively affect performance.
      # See https://github.com/search?q=repo%3Aopenai%2Fopenai-python+replace+newlines&type=code
      texts = [text.replace('\n', ' ') for text in texts]

      response: Any = self._model.create(input=texts, model=EMBEDDING_MODEL)
      return [np.array(embedding['embedding'], dtype=np.float32) for embedding in response['data']]

    docs = cast(Iterable[str], docs)
    split_fn = split_text if self._split else None
    yield from compute_split_embeddings(
      docs, OPENAI_BATCH_SIZE, embed_fn, split_fn, num_parallel_requests=NUM_PARALLEL_REQUESTS)
lilac/embeddings/palm.py
ADDED
@@ -0,0 +1,62 @@
"""PaLM embeddings."""
from typing import TYPE_CHECKING, Iterable, cast

import numpy as np
from tenacity import retry, stop_after_attempt, wait_random_exponential
from typing_extensions import override

from ..env import env
from ..schema import Item, RichData
from ..signal import TextEmbeddingSignal
from ..splitters.chunk_splitter import split_text
from .embedding import compute_split_embeddings

if TYPE_CHECKING:
  import google.generativeai as palm

PALM_BATCH_SIZE = 1  # PaLM API only supports batch size 1.
NUM_PARALLEL_REQUESTS = 256  # Because batch size is 1, we can send many requests in parallel.
EMBEDDING_MODEL = 'models/embedding-gecko-001'


class PaLM(TextEmbeddingSignal):
  """Computes embeddings using PaLM's embedding API.

  <br>**Important**: This will send data to an external server!

  <br>To use this signal, you must get a PaLM API key from
  [makersuite.google.com](https://makersuite.google.com/app/apikey) and add it to your .env.local.
  """

  name = 'palm'
  display_name = 'PaLM Embeddings'

  _model: 'palm.generate_embeddings'

  @override
  def setup(self) -> None:
    api_key = env('PALM_API_KEY')
    if not api_key:
      raise ValueError('`PALM_API_KEY` environment variable not set.')
    try:
      import google.generativeai as palm
      palm.configure(api_key=api_key)
      self._model = palm.generate_embeddings
    except ImportError:
      raise ImportError('Could not import the "google.generativeai" python package. '
                        'Please install it with `pip install google-generativeai`.')

  @override
  def compute(self, docs: Iterable[RichData]) -> Iterable[Item]:
    """Compute embeddings for the given documents."""

    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
    def embed_fn(texts: list[str]) -> list[np.ndarray]:
      assert len(texts) == 1, 'PaLM API only supports batch size 1.'
      response = self._model(model=EMBEDDING_MODEL, text=texts[0])
      return [np.array(response['embedding'], dtype=np.float32)]

    docs = cast(Iterable[str], docs)
    split_fn = split_text if self._split else None
    yield from compute_split_embeddings(
      docs, PALM_BATCH_SIZE, embed_fn, split_fn, num_parallel_requests=NUM_PARALLEL_REQUESTS)
lilac/embeddings/sbert.py
ADDED
@@ -0,0 +1,38 @@
"""Sentence-BERT embeddings. Open-source models, designed to run on device."""
from typing import Iterable, cast

from typing_extensions import override

from ..schema import Item, RichData
from ..signal import TextEmbeddingSignal
from ..splitters.chunk_splitter import split_text
from .embedding import compute_split_embeddings
from .transformer_utils import get_model

# The `all-mpnet-base-v2` model provides the best quality, while `all-MiniLM-L6-v2` is 5 times
# faster and still offers good quality. See https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models/
MINI_LM_MODEL = 'all-MiniLM-L6-v2'

# Maps a tuple of model name and device to the optimal batch size, found empirically.
_OPTIMAL_BATCH_SIZES: dict[str, dict[str, int]] = {
  MINI_LM_MODEL: {
    '': 64,  # Default batch size.
    'mps': 256,
  }
}


class SBERT(TextEmbeddingSignal):
  """Computes embeddings using the Sentence-BERT library."""

  name = 'sbert'
  display_name = 'SBERT Embeddings'

  @override
  def compute(self, docs: Iterable[RichData]) -> Iterable[Item]:
    """Call the embedding function."""
    batch_size, model = get_model(MINI_LM_MODEL, _OPTIMAL_BATCH_SIZES[MINI_LM_MODEL])
    embed_fn = model.encode
    split_fn = split_text if self._split else None
    docs = cast(Iterable[str], docs)
    yield from compute_split_embeddings(docs, batch_size, embed_fn=embed_fn, split_fn=split_fn)
lilac/embeddings/transformer_utils.py
ADDED
@@ -0,0 +1,35 @@
"""Utils for transformer embeddings."""

import functools
import os
from typing import TYPE_CHECKING, Optional

from ..env import data_path
from ..utils import log

if TYPE_CHECKING:
  from sentence_transformers import SentenceTransformer


def get_model(model_name: str,
              optimal_batch_sizes: dict[str, int] = {}) -> tuple[int, 'SentenceTransformer']:
  """Get a transformer model and the optimal batch size for it."""
  try:
    import torch.backends.mps
    from sentence_transformers import SentenceTransformer
  except ImportError:
    raise ImportError('Could not import the "sentence_transformers" python package. '
                      'Please install it with `pip install sentence-transformers`.')
  preferred_device: Optional[str] = None
  if torch.backends.mps.is_available():
    preferred_device = 'mps'
  elif not torch.backends.mps.is_built():
    log('MPS not available because the current PyTorch install was not built with MPS enabled.')

  @functools.cache
  def _get_model(model_name: str) -> 'SentenceTransformer':
    return SentenceTransformer(
      model_name, device=preferred_device, cache_folder=os.path.join(data_path(), '.cache'))

  batch_size = optimal_batch_sizes[preferred_device or '']
  return batch_size, _get_model(model_name)
lilac/embeddings/vector_store.py
ADDED
@@ -0,0 +1,201 @@
"""Interface for storing vectors."""

import abc
import os
import pickle
from typing import Iterable, Optional, Type

import numpy as np

from ..schema import SpanVector, VectorKey
from ..utils import open_file


class VectorStore(abc.ABC):
  """Interface for storing and retrieving vectors."""

  # The global name of the vector store.
  name: str

  @abc.abstractmethod
  def save(self, base_path: str) -> None:
    """Save the store to disk."""
    pass

  @abc.abstractmethod
  def load(self, base_path: str) -> None:
    """Load the store from disk."""
    pass

  @abc.abstractmethod
  def size(self) -> int:
    """Return the number of vectors in the store."""
    pass

  @abc.abstractmethod
  def add(self, keys: list[VectorKey], embeddings: np.ndarray) -> None:
    """Add or edit the given keyed embeddings to the store.

    If the keys already exist they will be overwritten, acting as an "upsert".

    Args:
      keys: The keys to add the embeddings for.
      embeddings: The embeddings to add. This should be a 2D matrix with the same length as keys.
    """
    pass

  @abc.abstractmethod
  def get(self, keys: Optional[Iterable[VectorKey]] = None) -> np.ndarray:
    """Return the embeddings for given keys.

    Args:
      keys: The keys to return the embeddings for. If None, return all embeddings.

    Returns:
      The embeddings for the given keys.
    """
    pass

  def topk(self,
           query: np.ndarray,
           k: int,
           keys: Optional[Iterable[VectorKey]] = None) -> list[tuple[VectorKey, float]]:
    """Return the top k most similar vectors.

    Args:
      query: The query vector.
      k: The number of results to return.
      keys: Optional keys to restrict the search to.

    Returns:
      A list of (key, score) tuples.
    """
    raise NotImplementedError


PathKey = VectorKey

_SPANS_PICKLE_NAME = 'spans.pkl'


class VectorDBIndex:
  """Stores and retrieves span vectors.

  This wraps a regular vector store by adding a mapping from path keys, such as (rowid1, 0),
  to span keys, such as (rowid1, 0, 0), which denotes the first span in the (rowid1, 0) document.
  """

  def __init__(self, vector_store: str) -> None:
    self._vector_store: VectorStore = get_vector_store_cls(vector_store)()
    # Map a path key to spans for that path.
    self._id_to_spans: dict[PathKey, list[tuple[int, int]]] = {}

  def load(self, base_path: str) -> None:
    """Load the vector index from disk."""
    assert not self._id_to_spans, 'Cannot load into a non-empty index.'
    with open_file(os.path.join(base_path, _SPANS_PICKLE_NAME), 'rb') as f:
      self._id_to_spans.update(pickle.load(f))
    self._vector_store.load(os.path.join(base_path, self._vector_store.name))

  def save(self, base_path: str) -> None:
    """Save the vector index to disk."""
    assert self._id_to_spans, 'Cannot save an empty index.'
    with open_file(os.path.join(base_path, _SPANS_PICKLE_NAME), 'wb') as f:
      pickle.dump(list(self._id_to_spans.items()), f)
    self._vector_store.save(os.path.join(base_path, self._vector_store.name))

  def add(self, all_spans: list[tuple[PathKey, list[tuple[int, int]]]],
          embeddings: np.ndarray) -> None:
    """Add the given spans and embeddings.

    Args:
      all_spans: The spans to initialize the index with.
      embeddings: The embeddings to initialize the index with.
    """
    assert not self._id_to_spans, 'Cannot add to a non-empty index.'
    self._id_to_spans.update(all_spans)
    vector_keys = [(*path_key, i) for path_key, spans in all_spans for i in range(len(spans))]
    assert len(vector_keys) == len(embeddings), (
      f'Number of spans ({len(vector_keys)}) and embeddings ({len(embeddings)}) must match.')
    self._vector_store.add(vector_keys, embeddings)

  def get_vector_store(self) -> VectorStore:
    """Return the underlying vector store."""
    return self._vector_store

  def get(self, keys: Iterable[PathKey]) -> Iterable[list[SpanVector]]:
    """Return the spans with vectors for each key in `keys`.

    Args:
      keys: The keys to return the vectors for.

    Returns:
      The span vectors for the given keys.
    """
    all_spans: list[list[tuple[int, int]]] = []
    vector_keys: list[VectorKey] = []
    for path_key in keys:
      spans = self._id_to_spans[path_key]
      all_spans.append(spans)
      vector_keys.extend([(*path_key, i) for i in range(len(spans))])

    all_vectors = self._vector_store.get(vector_keys)
    offset = 0
    for spans in all_spans:
      vectors = all_vectors[offset:offset + len(spans)]
      yield [{'span': span, 'vector': vector} for span, vector in zip(spans, vectors)]
      offset += len(spans)

  def topk(self,
           query: np.ndarray,
           k: int,
           path_keys: Optional[Iterable[PathKey]] = None) -> list[tuple[PathKey, float]]:
    """Return the top k most similar vectors.

    Args:
      query: The query vector.
      k: The number of results to return.
      path_keys: Optional key prefixes to restrict the search to.

    Returns:
      A list of (key, score) tuples.
    """
    span_keys: Optional[list[VectorKey]] = None
    if path_keys is not None:
      span_keys = [
        (*path_key, i) for path_key in path_keys for i in range(len(self._id_to_spans[path_key]))
      ]
    span_k = k
    path_key_scores: dict[PathKey, float] = {}
    total_num_span_keys = self._vector_store.size()
    while (len(path_key_scores) < k and span_k < total_num_span_keys and
           (not span_keys or span_k < len(span_keys))):
      span_k += k
      vector_key_scores = self._vector_store.topk(query, span_k, span_keys)
      for (*path_key_list, _), score in vector_key_scores:
        path_key = tuple(path_key_list)
        if path_key not in path_key_scores:
          path_key_scores[path_key] = score

    return list(path_key_scores.items())[:k]


VECTOR_STORE_REGISTRY: dict[str, Type[VectorStore]] = {}


def register_vector_store(vector_store_cls: Type[VectorStore]) -> None:
  """Register a vector store in the global registry."""
  if vector_store_cls.name in VECTOR_STORE_REGISTRY:
    raise ValueError(f'Vector store "{vector_store_cls.name}" has already been registered!')

  VECTOR_STORE_REGISTRY[vector_store_cls.name] = vector_store_cls


def get_vector_store_cls(vector_store_name: str) -> Type[VectorStore]:
  """Return a registered vector store given the name in the registry."""
  return VECTOR_STORE_REGISTRY[vector_store_name]


def clear_vector_store_registry() -> None:
  """Clear the vector store registry."""
  VECTOR_STORE_REGISTRY.clear()
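A minimal sketch of the path-key to span-key mapping `VectorDBIndex` maintains; the 'numpy' store name is an assumption based on `NumpyVectorStore` (see `vector_store_numpy.py` in this commit):

import numpy as np
from lilac.embeddings.default_vector_stores import register_default_vector_stores
from lilac.embeddings.vector_store import VectorDBIndex

register_default_vector_stores()  # Assumed to register the 'hnsw' and 'numpy' stores.
index = VectorDBIndex('numpy')
# Two spans for one path key; vector i is stored under span key (*path_key, i).
index.add([(('row1', 'text'), [(0, 5), (6, 11)])],
          np.random.rand(2, 4).astype(np.float32))
[span_vectors] = list(index.get([('row1', 'text')]))
print(span_vectors[0]['span'])  # (0, 5)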
lilac/embeddings/vector_store_hnsw.py
ADDED
@@ -0,0 +1,112 @@
"""HNSW vector store."""

import multiprocessing
from typing import Iterable, Optional, Set, cast

import hnswlib
import numpy as np
import pandas as pd
from typing_extensions import override

from ..schema import VectorKey
from ..utils import DebugTimer
from .vector_store import VectorStore

_HNSW_SUFFIX = '.hnswlib.bin'
_LOOKUP_SUFFIX = '.lookup.pkl'

# Parameters for HNSW index: https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
QUERY_EF = 50
CONSTRUCTION_EF = 100
M = 16
SPACE = 'ip'


class HNSWVectorStore(VectorStore):
  """HNSW-backed vector store."""

  name = 'hnsw'

  def __init__(self) -> None:
    # Maps a `VectorKey` to a label (row id) in the hnswlib index.
    self._key_to_label: Optional[pd.Series] = None
    self._index: Optional[hnswlib.Index] = None

  @override
  def save(self, base_path: str) -> None:
    assert self._key_to_label is not None and self._index is not None, (
      'The vector store has no embeddings. Call load() or add() first.')
    self._index.save_index(base_path + _HNSW_SUFFIX)
    self._key_to_label.to_pickle(base_path + _LOOKUP_SUFFIX)

  @override
  def load(self, base_path: str) -> None:
    self._key_to_label = pd.read_pickle(base_path + _LOOKUP_SUFFIX)
    dim = int(self._key_to_label.name)
    index = hnswlib.Index(space=SPACE, dim=dim)
    index.set_ef(QUERY_EF)
    index.set_num_threads(multiprocessing.cpu_count())
    index.load_index(base_path + _HNSW_SUFFIX)
    self._index = index

  @override
  def size(self) -> int:
    assert self._index is not None, (
      'The vector store has no embeddings. Call load() or add() first.')
    return self._index.get_current_count()

  @override
  def add(self, keys: list[VectorKey], embeddings: np.ndarray) -> None:
    assert self._index is None, (
      'Embeddings already exist in this store. Upsert is not yet supported.')

    if len(keys) != embeddings.shape[0]:
      raise ValueError(
        f'Length of keys ({len(keys)}) does not match number of embeddings {embeddings.shape[0]}.')

    dim = embeddings.shape[1]
    with DebugTimer('hnswlib index creation'):
      index = hnswlib.Index(space=SPACE, dim=dim)
      index.set_ef(QUERY_EF)
      index.set_num_threads(multiprocessing.cpu_count())
      index.init_index(max_elements=len(keys), ef_construction=CONSTRUCTION_EF, M=M)

      # Cast to float32 since dot product with float32 is 40-50x faster than float16 and 2.5x
      # faster than float64.
      embeddings = embeddings.astype(np.float32)
      row_indices = np.arange(len(keys), dtype=np.int32)
      self._key_to_label = pd.Series(row_indices, index=keys, dtype=np.int32)
      self._key_to_label.name = str(dim)
      index.add_items(embeddings, row_indices)
    self._index = index

  @override
  def get(self, keys: Optional[Iterable[VectorKey]] = None) -> np.ndarray:
    assert self._index is not None and self._key_to_label is not None, (
      'No embeddings exist in this store.')
    if not keys:
      return np.array(self._index.get_items(self._key_to_label.values), dtype=np.float32)
    locs = self._key_to_label.loc[cast(list[str], keys)].values
    return np.array(self._index.get_items(locs), dtype=np.float32)

  @override
  def topk(self,
           query: np.ndarray,
           k: int,
           keys: Optional[Iterable[VectorKey]] = None) -> list[tuple[VectorKey, float]]:
    assert self._index is not None and self._key_to_label is not None, (
      'No embeddings exist in this store.')
    labels: Set[int] = set()
    if keys is not None:
      labels = set(self._key_to_label.loc[cast(list[str], keys)].tolist())
      k = min(k, len(labels))

    def filter_func(label: int) -> bool:
      return label in labels

    query = np.expand_dims(query.astype(np.float32), axis=0)
    locs, dists = self._index.knn_query(query, k=k, filter=filter_func if labels else None)
    locs = locs[0]
    dists = dists[0]
    topk_keys = self._key_to_label.index.values[locs]
    return [(key, 1 - dist) for key, dist in zip(topk_keys, dists)]
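A quick usage sketch of the HNSW store (the toy keys and vectors are hypothetical; since SPACE = 'ip', topk() returns scores as 1 minus the inner-product distance):

import numpy as np

from lilac.embeddings.vector_store_hnsw import HNSWVectorStore

store = HNSWVectorStore()
# Toy data: two 2-d vectors keyed by (rowid, span_index)-style tuples.
store.add([('doc1', 0), ('doc1', 1)],
          np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32))

# Approximate nearest neighbors; higher score means more similar.
print(store.topk(np.array([1.0, 0.0], dtype=np.float32), k=1))

# Persistence writes base_path + '.hnswlib.bin' and base_path + '.lookup.pkl'.
store.save('/tmp/store')
restored = HNSWVectorStore()
restored.load('/tmp/store')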
lilac/embeddings/vector_store_numpy.py
ADDED
@@ -0,0 +1,92 @@
"""NumpyVectorStore class for storing vectors in numpy arrays."""

from typing import Iterable, Optional, cast

import numpy as np
import pandas as pd
from typing_extensions import override

from ..schema import VectorKey
from .vector_store import VectorStore

_EMBEDDINGS_SUFFIX = '.matrix.npy'
_LOOKUP_SUFFIX = '.lookup.pkl'


class NumpyVectorStore(VectorStore):
  """Stores vectors as in-memory np arrays."""
  name = 'numpy'

  def __init__(self) -> None:
    self._embeddings: Optional[np.ndarray] = None
    # Maps a `VectorKey` to a row index in `_embeddings`.
    self._key_to_index: Optional[pd.Series] = None

  @override
  def size(self) -> int:
    assert self._embeddings is not None, (
      'The vector store has no embeddings. Call load() or add() first.')
    return len(self._embeddings)

  @override
  def save(self, base_path: str) -> None:
    assert self._embeddings is not None and self._key_to_index is not None, (
      'The vector store has no embeddings. Call load() or add() first.')
    np.save(base_path + _EMBEDDINGS_SUFFIX, self._embeddings, allow_pickle=False)
    self._key_to_index.to_pickle(base_path + _LOOKUP_SUFFIX)

  @override
  def load(self, base_path: str) -> None:
    self._embeddings = np.load(base_path + _EMBEDDINGS_SUFFIX, allow_pickle=False)
    self._key_to_index = pd.read_pickle(base_path + _LOOKUP_SUFFIX)

  @override
  def add(self, keys: list[VectorKey], embeddings: np.ndarray) -> None:
    if self._embeddings is not None or self._key_to_index is not None:
      raise ValueError('Embeddings already exist in this store. Upsert is not yet supported.')

    if len(keys) != embeddings.shape[0]:
      raise ValueError(
        f'Length of keys ({len(keys)}) does not match number of embeddings {embeddings.shape[0]}.')

    # Cast to float32 since dot product with float32 is 40-50x faster than float16 and 2.5x faster
    # than float64.
    self._embeddings = embeddings.astype(np.float32)
    row_indices = np.arange(len(embeddings), dtype=np.uint32)
    self._key_to_index = pd.Series(row_indices, index=keys, dtype=np.uint32)

  @override
  def get(self, keys: Optional[Iterable[VectorKey]] = None) -> np.ndarray:
    assert self._embeddings is not None and self._key_to_index is not None, (
      'The vector store has no embeddings. Call load() or add() first.')
    if not keys:
      return self._embeddings
    locs = self._key_to_index.loc[cast(list[str], keys)]
    return self._embeddings.take(locs, axis=0)

  @override
  def topk(self,
           query: np.ndarray,
           k: int,
           keys: Optional[Iterable[VectorKey]] = None) -> list[tuple[VectorKey, float]]:
    assert self._embeddings is not None and self._key_to_index is not None, (
      'The vector store has no embeddings. Call load() or add() first.')
    if keys is not None:
      row_indices = self._key_to_index.loc[cast(list[str], keys)]
      embeddings = self._embeddings.take(row_indices, axis=0)
      keys = list(keys)
    else:
      keys, embeddings = cast(list[VectorKey], self._key_to_index.index.tolist()), self._embeddings

    query = query.astype(embeddings.dtype)
    similarities: np.ndarray = np.dot(embeddings, query).reshape(-1)
    k = min(k, len(similarities))

    # We do a partition + sort of only the top k to save time: O(n + k log k) instead of
    # O(n log n).
    indices = np.argpartition(similarities, -k)[-k:]
    # Indices sorted by value from largest to smallest.
    indices = indices[np.argsort(similarities[indices])][::-1]

    topk_similarities = similarities[indices]
    topk_keys = [keys[idx] for idx in indices]
    return list(zip(topk_keys, topk_similarities))
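And a matching sketch for the numpy store (toy data hypothetical). Unlike the approximate HNSW index, this is an exact brute-force search returning raw dot-product scores, with np.argpartition ensuring only the k winners pay the sorting cost:

import numpy as np

from lilac.embeddings.vector_store_numpy import NumpyVectorStore

store = NumpyVectorStore()
store.add([('doc1', 0), ('doc2', 0)],
          np.array([[0.9, 0.1], [0.1, 0.9]], dtype=np.float32))

# Exact search over all rows; results ordered by dot product, highest first.
print(store.topk(np.array([1.0, 0.0], dtype=np.float32), k=2))

# Restrict the search to a subset of keys.
print(store.topk(np.array([1.0, 0.0], dtype=np.float32), k=1, keys=[('doc2', 0)]))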