Commit: Push

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full changeset.

Files changed:
- .env +0 -6
- .gitattributes +0 -35
- .gitignore +1 -1
- Dockerfile +3 -0
- data/concept/lilac/profanity/concept.json +0 -0
- data/concept/lilac/profanity/sbert.pkl +0 -3
- data/concept/lilac/toxicity/cohere.pkl +0 -3
- data/concept/lilac/toxicity/concept.json +0 -0
- data/concept/lilac/toxicity/openai.pkl +0 -3
- data/concept/lilac/toxicity/sbert.pkl +0 -3
- data/concept/local/outerspace/cohere.pkl +0 -3
- data/concept/local/outerspace/concept.json +0 -188
- data/concept/local/outerspace/openai.pkl +0 -3
- data/concept/local/outerspace/sbert.pkl +0 -3
- data/datasets/local/spotify/data-00000-of-00001.parquet +0 -3
- data/datasets/local/spotify/manifest.json +0 -27
- data/datasets/local/spotify/text/.concepts/local/aliens/sbert-neg-100.pkl +0 -3
- data/datasets/local/spotify/text/.concepts/local/outer_space/sbert-neg-100.pkl +0 -3
- data/datasets/local/spotify/text/.concepts/local/outerspace/sbert-neg-100.pkl +0 -3
- data/datasets/local/spotify/text/.concepts/local/phone_addiction/sbert-neg-100.pkl +0 -3
- data/datasets/local/spotify/text/sbert/data-00000-of-00001.parquet +0 -3
- data/datasets/local/spotify/text/sbert/embedding/local/outerspace/v34/data-00000-of-00001.parquet +0 -3
- data/datasets/local/spotify/text/sbert/embedding/local/outerspace/v34/signal_manifest.json +0 -64
- data/datasets/local/spotify/text/sbert/embeddings-00000-of-00001.keys.pkl +0 -3
- data/datasets/local/spotify/text/sbert/embeddings-00000-of-00001.npy +0 -3
- data/datasets/local/spotify/text/sbert/signal_manifest.json +0 -37
- requirements.txt +1 -0
- src/concepts/concept.py +17 -8
- src/concepts/concept_test.py +0 -84
- src/concepts/db_concept_test.py +0 -606
- src/data/dataset_compute_signal_chain_test.py +0 -255
- src/data/dataset_compute_signal_test.py +0 -669
- src/data/dataset_duckdb.py +15 -13
- src/data/dataset_select_groups_test.py +0 -317
- src/data/dataset_select_rows_filter_test.py +0 -200
- src/data/dataset_select_rows_schema_test.py +0 -551
- src/data/dataset_select_rows_search_test.py +0 -393
- src/data/dataset_select_rows_sort_test.py +0 -904
- src/data/dataset_select_rows_udf_test.py +0 -404
- src/data/dataset_stats_test.py +0 -125
- src/data/dataset_test.py +0 -860
- src/data/dataset_utils.py +68 -34
- src/data/dataset_utils_test.py +0 -114
- src/data/sources/csv_source_test.py +0 -42
- src/data/sources/huggingface_source_test.py +0 -170
- src/data/sources/json_source_test.py +0 -74
- src/data/sources/pandas_source_test.py +0 -91
- src/data/sources/source_registry_test.py +0 -55
- src/data_loader_test.py +0 -74
- src/embeddings/embedding.py +18 -6
.env
CHANGED
@@ -26,9 +26,3 @@ DUCKDB_USE_VIEWS=0
 # HF_USERNAME=
 # The default repo to deploy to for a staging demo. Can be overridden by a command line flag.
 # HF_STAGING_DEMO_REPO='HF_ORG/HF_REPO_NAME'
-
-# HuggingFace demos: HuggingFace machine that runs the demo.
-
-# To read private uploaded data from the server (running on HF spaces) for the demo.
-# Get a token from https://huggingface.co/settings/tokens
-# HF_ACCESS_TOKEN=
.gitattributes
DELETED
@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
CHANGED
@@ -1,4 +1,4 @@
-
+__pycache__/
 **/*.pyc
 **/*.pyo
 **/*.pyd
Dockerfile
CHANGED
@@ -22,6 +22,9 @@ COPY /web/blueprint/build ./web/blueprint/build
 # Copy python files.
 COPY /src ./src/
 
+# Copy the data files. We use glob so docker copy won't fail if the directory doesn't exist.
+COPY /dat[a] ./data/
+
 CMD [ \
   "gunicorn", "src.server:app", \
   "--bind", "0.0.0.0:5432", \
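
The `COPY /dat[a] ./data/` line works because Docker treats a bracketed source as a glob pattern rather than a literal path: a pattern that matches nothing copies nothing, instead of failing the build the way a missing literal path would. A minimal Python sketch of the same glob semantics, using only the stdlib and the directory name from the Dockerfile:

import glob

# 'dat[a]' is a character-class glob that can only match the literal name 'data'.
# Unlike a plain path, a pattern with zero matches simply yields an empty result,
# which is why the Dockerfile COPY succeeds even when ./data is absent.
print(glob.glob('dat[a]'))  # ['data'] if ./data exists, [] otherwise
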
data/concept/lilac/profanity/concept.json
DELETED
The diff for this file is too large to render. See raw diff.
data/concept/lilac/profanity/sbert.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:647280d255e1a1fabff691683926fbb49dfaffe2f8151cf9913ec98816eef473
-size 844427
data/concept/lilac/toxicity/cohere.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:670e81b8448ab0ee5161a42b523410b3af80c6ccce8003cae78edebb9d0981c4
-size 9720631
data/concept/lilac/toxicity/concept.json
DELETED
The diff for this file is too large to render. See raw diff.
data/concept/lilac/toxicity/openai.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e15e8235c2152b1412a8e2dee3dcb94b23e95f1fde6fb60f01b876a832e46404
-size 3678199
data/concept/lilac/toxicity/sbert.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8ac8b304760c88242eb6c567e1af87fd87731a192308df8cf43b253e24d2b0ec
-size 959111
data/concept/local/outerspace/cohere.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:30afc472c4beb1aabb24d5b8e633df6039ec532fd704d8318755e083592221f3
-size 331736
data/concept/local/outerspace/concept.json
DELETED
@@ -1,188 +0,0 @@
-{
-  "namespace": "local",
-  "concept_name": "outerspace",
-  "type": "text",
-  "data": {
-    "da77c67f82524ce1a276593471fd530f": {
-      "label": true,
-      "text": "Fly me to the moon and let me play among the stars.",
-      "id": "da77c67f82524ce1a276593471fd530f"
-    },
-    "f73feff4be50410ab1ac468752d0301a": {
-      "label": true,
-      "text": "Space may be the final frontier but it's made in a Hollywood basement.",
-      "id": "f73feff4be50410ab1ac468752d0301a"
-    },
-    "0f0815ed04604209842d9e7b1e3538f8": {
-      "label": true,
-      "text": "We're just a speck of dust within the galaxy.",
-      "id": "0f0815ed04604209842d9e7b1e3538f8"
-    },
-    "2e41f637061e4ecb8b0d4e35abab9b63": {
-      "label": true,
-      "text": "In the darkest night, the stars shine bright and guide me to the moonlight.",
-      "id": "2e41f637061e4ecb8b0d4e35abab9b63"
-    },
-    "fb65845f4dc84da1b276de30967592e3": {
-      "label": true,
-      "text": "We'll be shooting star through time and space\r\n\r\n",
-      "id": "fb65845f4dc84da1b276de30967592e3"
-    },
-    "075534e3095b421687039291439b5524": {
-      "label": true,
-      "text": "Dreaming of love while cruising at high altitude \r\nDreaming of making love with you the way we should \r\nCloser to heaven. We're thirty thousand feet, up in the sky \r\nHere among the stars, our spirits will fly \r\n \r\nLeave all your worries as we soar over the clouds \r\nJet lag that's making you appear far from the crowd \r\nWhile we're suspended, locked in each others, sweet embrace \r",
-      "id": "075534e3095b421687039291439b5524"
-    },
-    "4bb656032d0d4f449bac8aa5f23c3e48": {
-      "label": true,
-      "text": " \r\nI don't know where I don't know why \r\nBut somehow back in time again \r\nI'm on the edge that you can see \r\n \r\nI'm not particular at night \r\nA single party calling me \r\nYou won't be tracking me by sight \r\n \r\nShadows and the stars \r\nWe will not return \r\nHumanity won't save us \r\nAt the speed of light \r\n \r\nShadows and the stars \r\nWe will not return \r\nHumanity won't save us \r",
-      "id": "4bb656032d0d4f449bac8aa5f23c3e48"
-    },
-    "4a6dda9001ea487991a1264e6a6c021b": {
-      "label": true,
-      "text": "Load redeem me, am I pure? \r\nAs pure as pure as heaven \r\nSent you money sent you flowers \r\nCould worship you for hours \r\nIn whose hands are we anyway? \r\n \r\nGo waiting for the stars \r\nTo come showering down \r\nFrom Moscow to Mars \r\nUniverse falling down \r\n \r\nYou got to look real hard \r\nIs it in your heart? \r\nYeah it's in there somewhere \r\nThe power wrapped in your palm \r",
-      "id": "4a6dda9001ea487991a1264e6a6c021b"
-    },
-    "9aacce9311d24cb182aee783ca313c58": {
-      "label": true,
-      "text": "Growth is our future resource. \r\n \r\nJoin the state of the universe, \r\nUnited state of peace. \r\nJoin the state of the universe, \r\nUnited state of peace. \r\n \r\nStarpeace, I see you growing, \r\nStarpeace, I see you growing, \r\nStarpeace, I see you growing, \r\nStarpeace, I see you growing, \r\nStarpeace, I see you growing, \r\nStarpeace, I see you growing, \r\nStarpeace, I see you growing, \r",
-      "id": "9aacce9311d24cb182aee783ca313c58"
-    },
-    "313b8f9ce9164791b04ead82e6adb40f": {
-      "label": false,
-      "text": " \r\nEven I could see a light if it wasn't for the nights \r\n(Even I could see a light I think that I could make it) \r\nGuess my future would look bright if it wasn't for the nights\r\n\r\n",
-      "id": "313b8f9ce9164791b04ead82e6adb40f"
-    },
-    "b9c587b74f084ef4917e7a52cd5c5cbe": {
-      "label": true,
-      "text": "Yea I think I know \r\nI really hate it when it gets too slow \r\nI gotta try and keep myself amused \r\nI love the way my rocket purrs \r\nI like it best when I see blurs \r\nYou gotta love to watch me light my fuse \r\n \r\nNo more lookin' back to yesterday \r\nI got the thing to blow us both away \r\nAll I need is you to navigate \r\nSo come and ride my Rocket 88 \r\n \r",
-      "id": "b9c587b74f084ef4917e7a52cd5c5cbe"
-    },
-    "6f844600cc024117a22287557130a17b": {
-      "label": false,
-      "text": "They came flying from far away, now I'm under their spell \r\nI love hearing the stories that they tell \r\nThey've seen places beyond my land and they've found new horizons \r\nThey speak strangely but I understand \r\n \r\nAnd I dream I'm an eagle \r\nAnd I dream I can spread my wings \r\nFlying high, high, I'm a bird in the sky \r\nI'm an eagle that rides on the breeze \r",
-      "id": "6f844600cc024117a22287557130a17b"
-    },
-    "8cddcff34f894743872ecc02262c2375": {
-      "label": true,
-      "text": "Fire! I can see it burning so brightly \r\nFire! I can feel it calling out to me \r\nAnd as the sun goes down \r\nIt starts to paint a picture \r\n \r\nOf an ancient town \r\nSo far away, across the endless sea \r\nLead me to the light \r\nAnd take me to the edge of heaven \r\n \r\nI'm standing in the night \r\nLooking for the edge of heaven \r\nWe'll be touching the edge of heaven \r\nTime \r\n \r",
-      "id": "8cddcff34f894743872ecc02262c2375"
-    },
-    "3d044718f379452ab3c1e4d00c99f8f3": {
-      "label": false,
-      "text": "Fire! I can see it burning so brightly \r\nFire! I can feel it calling out to me \r\nAnd as the sun goes down \r\nIt starts to paint a picture \r\n \r\nOf an ancient town \r\nSo far away, across the endless sea \r\nLead me to the light \r\nAnd take me to the edge of heaven \r\n \r\nI'm standing in the night \r\nLooking for the edge of heaven \r\nWe'll be touching the edge of heaven \r\nTime \r\n \r",
-      "id": "3d044718f379452ab3c1e4d00c99f8f3"
-    },
-    "d233250a91d44f13aac58eb5fa43afe6": {
-      "label": true,
-      "text": "Star \r\nWe go waiting for the stars \r\nTo come showering down \r\nFrom Moscow to Mars \r\nUniverse falling down \r\n \r\nYou got to look real hard \r\nThere's a fiery star \r\nHidden out there somewhere \r\nNot the satellite of love \r\nBut a laser \r\nShooting out it's shiny tongue there \r\n \r\nGod is love, God is war \r\nTV-preacher tell me more \r\nLoad redeem me, am I pure? \r",
-      "id": "d233250a91d44f13aac58eb5fa43afe6"
-    },
-    "a30c9a5c63a2456f8f53a9177a522d7a": {
-      "label": false,
-      "text": "Tell me do you want to be free \r\n \r\nWell your love falls down you know \r\nAnd your heart might fall to pieces \r\nAnd I saw your soul get lost along the way \r\n \r\nAll these songs now they used to make you shine \r\nThey are just lullabies for your nightmares \r\nAnd Ill sing them softly now \r\n \r\nLately I've felt the warmth \r\nOf the one who tore down my walls \r\nBut then I look at you \r",
-      "id": "a30c9a5c63a2456f8f53a9177a522d7a"
-    },
-    "89ce6961ff064f719212e68058bb2013": {
-      "label": false,
-      "text": "I Left Them Niggas Needin'Path \r\nAnd Ya'll Probly Won't Live To See This Weekend, \r\nGotta Go, Gotta Go, FUckin Mash Out \r\nI Hit The Dro' A Lil More And Then I Pass Out \r\nCrashin' The H2, Bitches I Hate You \r\nNow you Keep Talkin Shit, I Kidnap And Ducktape You \r\nLet Them Faggots Rape You \r\nThen It's Back To Mississippi, If Ya Boys Want Revenge \r\nTell Them Bitches Come And Get Me \r",
-      "id": "89ce6961ff064f719212e68058bb2013"
-    },
-    "6de1b38adc9b4f48ac15609dad02faa0": {
-      "label": true,
-      "text": "In heaven's eye \r\n \r\nYes, this is our star. \r\nYes, this is our star. \r\nOur star our star.\r\n\r\n",
-      "id": "6de1b38adc9b4f48ac15609dad02faa0"
-    },
-    "52ccd98280b849f498d838b6230285a7": {
-      "label": false,
-      "text": "Tell Them Bitches Come And Get Me \r\n'cause I Was Born In This Bitch To Die \r\nI'm In Queens, In Your 'Lac, With Your Bitch, Gettin' High \r\n \r\nYoung Buck: \r\nGold Grills, Coupe' Devilles Sittin On 22's \r\nThe Dirty, Dirty Baby \r\nShow 'Em How The South Do \r\nWe Pop Pills, Shoot To Kill, You Know What We 'Bout \r\nAnd On Behalf Of G-Unit, Welcome To The South \r\n \r\nLil Flip: \r",
-      "id": "52ccd98280b849f498d838b6230285a7"
-    },
-    "866a61ec0ab04a54ade2532b7825c858": {
-      "label": false,
-      "text": "I Swear On The Soul's Of Our Dead Cousin's \r\nI Ain't Fuckin, Man I'm Commin Ak 40's Bustin', \r\n7's And Mack 11's \r\nI Told 'Em All I Ain't No Hoe \r\nBut Niggas Don't Listen Till You Kick A Nigga, \r\nSmack Him With That Callico \r\nI'm Tryin To Stay In Gods Plan \r\nBut I Hadta Show These Faggots That Your Fuckin With A Man, Ya Bitch! \r\nI Left Them Niggas Needin'Path \r",
-      "id": "866a61ec0ab04a54ade2532b7825c858"
-    },
-    "0a2dbf3ee6cd46ae9f71ecb65e02674e": {
-      "label": true,
-      "text": "And filling up the space \r\nMen and women boys and girls \r\nThere are so many people in the world \r\nThinkin' about the world \r\nAnd all the people in it \r\nAnd I'm staring at the stars \r\nAnd into the infinite \r\nIn a world without a world \r\nOn a planet that's \r\nDriftin' in a space \r\n \r\nSeconds into minutes and minutes \r\nInto hour and hours into days \r",
-      "id": "0a2dbf3ee6cd46ae9f71ecb65e02674e"
-    },
-    "fff7748b4c384cb49ae18f96df719aa8": {
-      "label": false,
-      "text": "And the way things ought to be \r\n \r\nWhat kind of difference \r\nCan on person make? \r\nCut to the chase\r\n\r\n",
-      "id": "fff7748b4c384cb49ae18f96df719aa8"
-    },
-    "54971cdd9be0444096cacd2637a50ce4": {
-      "label": false,
-      "text": "With bar lights and pretty girls \r\nBut most nights I stay straight and think about my mom \r\nOh god, I miss her so much \r\n \r\nAnd there are people on the street \r\nThey're coming up to me \r\nThey're telling me that they like what I do now \r\nAnd so I tried my best when I took the fall \r\nTo get right back up, back in your arms \r\nIf you're out here why do I miss you so much \r\n \r",
-      "id": "54971cdd9be0444096cacd2637a50ce4"
-    },
-    "048e4f04661d4f71a48d48f216b30975": {
-      "label": true,
-      "text": " \r\nShadows and the stars \r\nWe will not return \r\nHumanity won't save us \r\nAt the speed of light \r\n \r\nShadows and the stars \r\nWe will not return \r\nHumanity won't save us \r\nWe slip into the night \r\n \r\nI'll say a mass for you and wave \r\nShooting plasma from my grave \r\n \r\nEvent horizon lost in space \r\nRunning in a human race \r\n \r\nI don't know where I don't know why \r",
-      "id": "048e4f04661d4f71a48d48f216b30975"
-    },
-    "f4ee9e97357c4f2fa0ed627a6983e4de": {
-      "label": false,
-      "text": "I am here to tell you we can never meet again \r\nSimple really, isn't it, a word or two and then \r\nA lifetime of not knowing where or how or why or when \r\nYou think of me or speak of me or wonder what befell \r\nThe someone you once loved so long ago so well \r\n \r\nNever wonder what I'll feel as living shuffles by \r\nYou don't have to ask me and I need not reply \r",
-      "id": "f4ee9e97357c4f2fa0ed627a6983e4de"
-    },
-    "797514b7375f4ef8bfbd3320936b266a": {
-      "label": false,
-      "text": " \r\nThe last time that I saw him he was trying hard to get \r\nA woman's education but he's not a woman yet \r\nAnd the last time that I saw her she was living with some boy \r\nWho gives her soul an empty room and gives her body joy. \r\n \r\nSo the great affair is over but whoever would have guessed \r\nIt would leave us all so vacant and so deeply unimpressed \r",
-      "id": "797514b7375f4ef8bfbd3320936b266a"
-    },
-    "56663fdf792a4820b7ae2e4344542cfa": {
-      "label": true,
-      "text": "Yeah we'll find our star \r\nBut maybe that's another world \r\n \r\nFar away from where we are \r\nYeah we'll find our star \r\nBut maybe that's another world\r\n\r\n",
-      "id": "56663fdf792a4820b7ae2e4344542cfa"
-    },
-    "d522d97e7d44430e945e40720d54e98d": {
-      "label": false,
-      "text": "The silly people just like you and better too. \r\nHow can you keep turning when the overture is burning in the faces \r\nOf the people in the churches of the land. \r\n \r\nThat's all it seems, there is only one dream. \r\nThe day has come at last.\r\n\r\n",
-      "id": "d522d97e7d44430e945e40720d54e98d"
-    },
-    "761a17d5909d4c7c9cd0cd1ac8c2db76": {
-      "label": false,
-      "text": "Ah the man she wanted all her life was hanging by a thread \r\n\"I never even knew how much I wanted you,\" she said. \r\nHis muscles they were numbered and his style was obsolete. \r\n\"O baby, I have come too late.\" She knelt beside his feet. \r\n\"I'll never see a face like yours in years of men to come \r\nI'll never see such arms again in wrestling or in love.\" \r",
-      "id": "761a17d5909d4c7c9cd0cd1ac8c2db76"
-    },
-    "ffc68f626c7d41be8661babedf589778": {
-      "label": true,
-      "text": "let us make computations of the stars. \r\n \r\nOlder, wiser, sadder, blinder, watch us run: \r\nfaster, longer, harder, stronger, now it comes: \r\ncolour blisters, image splinters gravitate \r\ntowards the centre, in final splendour disintegrate, \r\nThe universe now beckons \r\nand Man, too, must take His place... \r\njust a few last fleeting seconds \r\nto wander in the waste, \r",
-      "id": "ffc68f626c7d41be8661babedf589778"
-    },
-    "8e8ffd440c2f48ebb5ae04810be5d090": {
-      "label": false,
-      "text": "And boy you'll see \r\nIt's an illusion shining down in front of me, \r\n \r\nAnd then you'll say \r\nEven in time we shall control the day, \r\nWhen what you'll see \r\nDeep inside base controlling you and me. \r\n \r\nAnd one peculiar point I see, \r\nAs one of many ones of me. \r\nAs truth is gathered, I rearrange, \r\nInside out, outside in, inside out, outside in, \r\nPerpetual change. \r\n \r",
-      "id": "8e8ffd440c2f48ebb5ae04810be5d090"
-    },
-    "c61414a653bb4a9482f341dbfbea4a47": {
-      "label": false,
-      "text": "While there's still time to choose \r\n \r\nEvery day of my life I discover \r\nSomeone murdering my sisters and brothers \r\nIn the name of some god or another \r\nWhat do you know \r\n \r\nFor the first precious few it's time to go \r\nWhat might have been we'll never know \r\nAll those bad ideas became the law \r\nOh yes, we've forgotten what we're looking for \r",
-      "id": "c61414a653bb4a9482f341dbfbea4a47"
-    },
-    "3a325e1d3789416584ad836e2d32df05": {
-      "label": true,
-      "text": "Earth is the third planet from the Sun and the only place known in the universe where life has originated and found habitability. This is enabled by Earth being a water world, the only one in the Solar System sustaining liquid surface water. Almost all of Earth's water is contained in its global ocean, spanning 70.8% of Earth's surface. The other 29.2% are spanned by land, consisting of continents",
-      "id": "3a325e1d3789416584ad836e2d32df05"
-    },
-    "44e9840483164b6b97e06f909e25b8dc": {
-      "label": false,
-      "text": "Human geography\nToggle Human geography subsection\nCultural and historical viewpoint\nSee also\nNotes\nReferences\nExternal links\nEarth",
-      "id": "44e9840483164b6b97e06f909e25b8dc"
-    },
-    "bcf625326bc64c6ca6d37fb59bffa5ba": {
-      "label": true,
-      "text": "When the ebbing tide retreats along the rocky shoreline\nIt leaves a trail of tide pools in a short-lived galaxy\nEach microcosmic planet, a complete society\nA simple kind of mirror to reflect upon our own\nAll the busy little creatures chasing out their destinies\nLiving in their pools, they soon forget about the sea\nWheel within wheels in a spiral array\nA pattern so grand and complex",
-      "id": "bcf625326bc64c6ca6d37fb59bffa5ba"
-    },
-    "7c2be4b17d8f49069f6179c5256acc5e": {
-      "label": true,
-      "text": "Beneath my dreams and wishes \nI long for thy caresses. \n \nA bridal bed awaits us both, \nAfter the landscape of death I cross. \nBefore my sorrows I must die, \nNightwish I send through the starlit sky. \n \n\"Passed away in silence \nThe flute from the realm unseen \nEmpties it's heart \nMaking love to me \nWith it's enchanting melody. \nLight of Orion, \nShadow of Andromeda, ",
-      "id": "7c2be4b17d8f49069f6179c5256acc5e"
-    }
-  },
-  "version": 34
-}
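
The deleted concept file above is a labeled-example store: `data` maps an example id to an object with `label`, `text`, and `id`, and `version` increments with each edit. A minimal stdlib-only sketch of reading such a file and summarizing its labels (the path is the one deleted above, so this assumes a checkout from before this commit):

import json
from collections import Counter

# Load a concept file and tally its positive/negative examples.
with open('data/concept/local/outerspace/concept.json') as f:
  concept = json.load(f)

labels = Counter(ex['label'] for ex in concept['data'].values())
print(concept['namespace'], concept['concept_name'], 'version', concept['version'])
print(f"{labels[True]} positive / {labels[False]} negative examples")
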
data/concept/local/outerspace/openai.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7ea2acd96a43d1c678273e7ec297b1758a3d09d1137f0325ac3058ca9a110112
-size 126895
data/concept/local/outerspace/sbert.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9916794dbe5526af5103019735188b637f9975a5326a21713380058034e13525
-size 34935
data/datasets/local/spotify/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:32224657332b09187a737c73ab634f9d14c9ba9a240bd105f1b9819cde2afcef
-size 37128682
data/datasets/local/spotify/manifest.json
DELETED
@@ -1,27 +0,0 @@
-{
-  "files": [
-    "data-00000-of-00001.parquet"
-  ],
-  "data_schema": {
-    "fields": {
-      "artist": {
-        "dtype": "string"
-      },
-      "song": {
-        "dtype": "string"
-      },
-      "link": {
-        "dtype": "string"
-      },
-      "text": {
-        "dtype": "string"
-      },
-      "__line_number__": {
-        "dtype": "int64"
-      },
-      "__rowid__": {
-        "dtype": "string"
-      }
-    }
-  }
-}
data/datasets/local/spotify/text/.concepts/local/aliens/sbert-neg-100.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:93f390fafd0d0db4ae6ae80d30bfbf8eb0a80fa9332f77f30449d40a11df0936
-size 183363
data/datasets/local/spotify/text/.concepts/local/outer_space/sbert-neg-100.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3fc9ac4c9b8b8588e48ebabbe34598edb4431985d20e018225b84546b96ce2ea
-size 166637
data/datasets/local/spotify/text/.concepts/local/outerspace/sbert-neg-100.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f3432ea5dcfbe7f7a17c94a4cc0c09e3317b8a690456fdf3af3efa0dcaa6f4fc
-size 188685
data/datasets/local/spotify/text/.concepts/local/phone_addiction/sbert-neg-100.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f795fb8b5d52650bd9aa5c871ff5d480e95413cd0afb65822a634c02f6674825
-size 163242
data/datasets/local/spotify/text/sbert/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9796beb630cc3503f3c2ac9db8f71e4c1604570836d78bbf364e801cd427c39e
-size 2709987
data/datasets/local/spotify/text/sbert/embedding/local/outerspace/v34/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d1ba0fe68cc02849b0a20b7f72047c8e9cb8e5ef5b57b0cd642fa0b0be8a6e06
-size 3340135
data/datasets/local/spotify/text/sbert/embedding/local/outerspace/v34/signal_manifest.json
DELETED
@@ -1,64 +0,0 @@
-{
-  "files": [
-    "data-00000-of-00001.parquet"
-  ],
-  "parquet_id": "local/outerspace/v34(text.sbert.*.embedding)",
-  "data_schema": {
-    "fields": {
-      "__rowid__": {
-        "dtype": "string"
-      },
-      "text": {
-        "fields": {
-          "sbert": {
-            "repeated_field": {
-              "fields": {
-                "embedding": {
-                  "fields": {
-                    "local/outerspace/v34": {
-                      "dtype": "float32",
-                      "signal": {
-                        "signal_name": "concept_score",
-                        "embedding": "sbert",
-                        "namespace": "local",
-                        "concept_name": "outerspace",
-                        "draft": "main",
-                        "num_negative_examples": 100
-                      },
-                      "bins": [
-                        [
-                          "Not in concept",
-                          null,
-                          0.5
-                        ],
-                        [
-                          "In concept",
-                          0.5,
-                          null
-                        ]
-                      ]
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  },
-  "signal": {
-    "signal_name": "concept_score",
-    "embedding": "sbert",
-    "namespace": "local",
-    "concept_name": "outerspace",
-    "draft": "main",
-    "num_negative_examples": 100
-  },
-  "enriched_path": [
-    "text",
-    "sbert",
-    "*",
-    "embedding"
-  ]
-}
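
Each entry in the manifest's `bins` field pairs a display label with a score interval, where `null` marks an unbounded end. A sketch of applying that layout to a concept score, assuming the intervals are half-open (an interpretation, not confirmed by the diff; the `bin_name` helper is hypothetical):

from typing import Optional

# Bin layout copied from the manifest above: (name, start, end), None = unbounded.
BINS = [('Not in concept', None, 0.5), ('In concept', 0.5, None)]

def bin_name(score: float) -> Optional[str]:
  # Return the first bin whose assumed [start, end) interval contains the score.
  for name, start, end in BINS:
    if (start is None or score >= start) and (end is None or score < end):
      return name
  return None

print(bin_name(0.73))  # 'In concept'
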
data/datasets/local/spotify/text/sbert/embeddings-00000-of-00001.keys.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d5df43291782b8c731d4ce56537946654c642a01dc9a4e37de394836362f6b45
-size 3727400
data/datasets/local/spotify/text/sbert/embeddings-00000-of-00001.npy
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:94e10c23d7229541e1f60b791a659d13673b10a03649abf0ae092e0e18c5aee3
-size 170446976
data/datasets/local/spotify/text/sbert/signal_manifest.json
DELETED
@@ -1,37 +0,0 @@
-{
-  "files": [
-    "data-00000-of-00001.parquet"
-  ],
-  "parquet_id": "sbert(text)",
-  "data_schema": {
-    "fields": {
-      "__rowid__": {
-        "dtype": "string"
-      },
-      "text": {
-        "fields": {
-          "sbert": {
-            "repeated_field": {
-              "fields": {
-                "embedding": {
-                  "dtype": "embedding"
-                }
-              },
-              "dtype": "string_span"
-            },
-            "signal": {
-              "signal_name": "sbert"
-            }
-          }
-        }
-      }
-    }
-  },
-  "signal": {
-    "signal_name": "sbert"
-  },
-  "enriched_path": [
-    "text"
-  ],
-  "embedding_filename_prefix": "embeddings-00000-of-00001"
-}
requirements.txt
CHANGED
@@ -18,6 +18,7 @@ cytoolz==0.12.1 ; python_version >= "3.9" and python_version < "3.10"
 dask==2023.6.1 ; python_version >= "3.9" and python_version < "3.10"
 datasets==2.13.1 ; python_version >= "3.9" and python_version < "3.10"
 decorator==5.1.1 ; python_version >= "3.9" and python_version < "3.10"
+detect-secrets==1.4.0 ; python_version >= "3.9" and python_version < "3.10"
 dill==0.3.6 ; python_version >= "3.9" and python_version < "3.10"
 distributed==2023.6.1 ; python_version >= "3.9" and python_version < "3.10"
 duckdb==0.8.1 ; python_version >= "3.9" and python_version < "3.10"
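
The only requirements.txt change is the new detect-secrets pin. For readers unfamiliar with the library, a minimal scan using the API documented in the detect-secrets README (the scanned path is a placeholder, not a file this diff introduces):

from detect_secrets import SecretsCollection
from detect_secrets.settings import default_settings

# Scan a single file for high-entropy strings, keywords, and other secret
# patterns using the library's default plugin settings.
secrets = SecretsCollection()
with default_settings():
  secrets.scan_file('src/config.py')  # placeholder path

print(secrets.json())  # findings keyed by filename
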
src/concepts/concept.py
CHANGED
@@ -162,7 +162,7 @@ class LogisticEmbeddingModel:
   def __post_init__(self) -> None:
     # See `notebooks/Toxicity.ipynb` for an example of training a concept model.
     self._model = LogisticRegression(
-      class_weight=None, C=30, tol=1e-5, warm_start=True, max_iter=
+      class_weight=None, C=30, tol=1e-5, warm_start=True, max_iter=5_000, n_jobs=-1)
 
   def score_embeddings(self, embeddings: np.ndarray) -> np.ndarray:
     """Get the scores for the provided embeddings."""
@@ -175,11 +175,12 @@ class LogisticEmbeddingModel:
     return np.random.rand(len(embeddings))
 
   def _setup_training(
-      self, X_train: np.ndarray,
+      self, X_train: np.ndarray, labels: list[bool],
       implicit_negatives: Optional[np.ndarray]) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
-    num_pos_labels = len([y for y in
-    num_neg_labels = len([y for y in
-    sample_weights = [(1.0 / num_pos_labels if y else 1.0 / num_neg_labels) for y in
+    num_pos_labels = len([y for y in labels if y])
+    num_neg_labels = len([y for y in labels if not y])
+    sample_weights = [(1.0 / num_pos_labels if y else 1.0 / num_neg_labels) for y in labels]
+    y_train = np.array(labels)
 
     if implicit_negatives is not None:
       num_implicit_labels = len(implicit_negatives)
@@ -191,7 +192,14 @@ class LogisticEmbeddingModel:
     # Normalize sample weights to sum to the number of training examples.
     weights = np.array(sample_weights)
     weights *= (X_train.shape[0] / np.sum(weights))
-
+
+    # Shuffle the data in unison.
+    p = np.random.permutation(len(X_train))
+    X_train = X_train[p]
+    y_train = y_train[p]
+    weights = weights[p]
+
+    return X_train, y_train, weights
 
   def fit(self, embeddings: np.ndarray, labels: list[bool],
           implicit_negatives: Optional[np.ndarray]) -> None:
@@ -337,11 +345,12 @@ class ConceptModel:
 
     embedding_items = list(embedding.compute(examples))
     result_items: list[Item] = []
+    logistic_model = self._get_logistic_model(draft)
     for item in embedding_items:
       if not isinstance(item, list):
         raise ValueError('Item from embedding is not a list.')
-      embeddings = np.array([np.
-      scores =
+      embeddings = np.array([np.reshape(res[EMBEDDING_KEY], -1) for res in item])
+      scores = logistic_model.score_embeddings(embeddings).tolist()
 
       item_result: list[Item] = []
       for embedding_item, score in zip(item, scores):
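
Two things are worth calling out in the `_setup_training` change: each class's examples are weighted by the inverse of its count so positives and negatives contribute equally regardless of imbalance, and features, labels, and weights are shuffled with a single shared permutation so every row's triple stays aligned. A self-contained sketch of the same idea on synthetic data (numpy only; the array sizes are made up for illustration):

import numpy as np

# Synthetic stand-ins for the embeddings and labels used in _setup_training.
X_train = np.random.rand(6, 3)
labels = [True, True, False, False, False, False]

# Weight each class inversely to its frequency so both classes contribute equally.
num_pos = sum(labels)
num_neg = len(labels) - num_pos
weights = np.array([1.0 / num_pos if y else 1.0 / num_neg for y in labels])
# Normalize sample weights to sum to the number of training examples.
weights *= len(labels) / weights.sum()

# Shuffle the data in unison: one permutation applied to every array keeps
# row i's feature vector, label, and weight together after the shuffle.
p = np.random.permutation(len(X_train))
X_train, y_train, weights = X_train[p], np.array(labels)[p], weights[p]
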
src/concepts/concept_test.py
DELETED
@@ -1,84 +0,0 @@
-"""Tests for concept."""
-
-from ..schema import SignalInputType
-from .concept import DRAFT_MAIN, Concept, Example, draft_examples
-
-
-def test_draft_examples_main() -> None:
-  concept = Concept(
-    namespace='test_namespace',
-    concept_name='test_name',
-    type=SignalInputType.TEXT,
-    data={
-      '0': Example(id='0', label=True, text='hello'),
-      '1': Example(id='1', label=False, text='world'),
-    },
-    version=0)
-
-  assert draft_examples(concept, DRAFT_MAIN) == {
-    '0': Example(id='0', label=True, text='hello'),
-    '1': Example(id='1', label=False, text='world'),
-  }
-
-
-def test_draft_examples_simple_draft() -> None:
-  concept = Concept(
-    namespace='test_namespace',
-    concept_name='test_name',
-    type=SignalInputType.TEXT,
-    data={
-      '0': Example(id='0', label=True, text='hello'),
-      '1': Example(id='1', label=False, text='world'),
-      '2': Example(id='2', label=True, text='hello draft 1', draft='draft1'),
-      '3': Example(id='3', label=False, text='world draft 1', draft='draft1'),
-      '4': Example(id='4', label=True, text='hello draft 2', draft='draft2'),
-      '5': Example(id='5', label=False, text='world draft 2', draft='draft2'),
-    },
-    version=0)
-
-  assert draft_examples(concept, DRAFT_MAIN) == {
-    '0': Example(id='0', label=True, text='hello'),
-    '1': Example(id='1', label=False, text='world'),
-  }
-
-  assert draft_examples(concept, 'draft1') == {
-    '0': Example(id='0', label=True, text='hello'),
-    '1': Example(id='1', label=False, text='world'),
-    '2': Example(id='2', label=True, text='hello draft 1', draft='draft1'),
-    '3': Example(id='3', label=False, text='world draft 1', draft='draft1'),
-  }
-
-  assert draft_examples(concept, 'draft2') == {
-    '0': Example(id='0', label=True, text='hello'),
-    '1': Example(id='1', label=False, text='world'),
-    '4': Example(id='4', label=True, text='hello draft 2', draft='draft2'),
-    '5': Example(id='5', label=False, text='world draft 2', draft='draft2'),
-  }
-
-
-def test_draft_examples_draft_dedupe() -> None:
-  concept = Concept(
-    namespace='test_namespace',
-    concept_name='test_name',
-    type=SignalInputType.TEXT,
-    data={
-      '0': Example(id='0', label=True, text='hello'),
-      '1': Example(id='1', label=False, text='world'),
-      # Duplicate text.
-      '2': Example(id='2', label=False, text='hello', draft='draft'),
-      '3': Example(id='3', label=False, text='world draft', draft='draft'),
-    },
-    version=0)
-
-  assert draft_examples(concept, DRAFT_MAIN) == {
-    '0': Example(id='0', label=True, text='hello'),
-    '1': Example(id='1', label=False, text='world'),
-  }
-
-  assert draft_examples(concept, 'draft') == {
-    # 0 is deduplicated with 2.
-    '1': Example(id='1', label=False, text='world'),
-    # 2 overrides 0's label.
-    '2': Example(id='2', label=False, text='hello', draft='draft'),
-    '3': Example(id='3', label=False, text='world draft', draft='draft'),
-  }
src/concepts/db_concept_test.py
DELETED
@@ -1,606 +0,0 @@
-"""Tests for the the database concept."""
-
-from pathlib import Path
-from typing import Generator, Iterable, Optional, Type, cast
-
-import numpy as np
-import pytest
-from pytest_mock import MockerFixture
-from typing_extensions import override
-
-from ..config import CONFIG
-from ..data.dataset_duckdb import DatasetDuckDB
-from ..data.dataset_utils import lilac_embedding
-from ..db_manager import set_default_dataset_cls
-from ..schema import Item, RichData, SignalInputType
-from ..signals.signal import TextEmbeddingSignal, clear_signal_registry, register_signal
-from .concept import (
-  DRAFT_MAIN,
-  Concept,
-  ConceptModel,
-  DraftId,
-  Example,
-  ExampleIn,
-  LogisticEmbeddingModel,
-)
-from .db_concept import (
-  ConceptDB,
-  ConceptInfo,
-  ConceptModelDB,
-  ConceptUpdate,
-  DiskConceptDB,
-  DiskConceptModelDB,
-)
-
-ALL_CONCEPT_DBS = [DiskConceptDB]
-ALL_CONCEPT_MODEL_DBS = [DiskConceptModelDB]
-
-
-@pytest.fixture(autouse=True)
-def set_data_path(tmp_path: Path, mocker: MockerFixture) -> None:
-  mocker.patch.dict(CONFIG, {'LILAC_DATA_PATH': str(tmp_path)})
-
-
-EMBEDDING_MAP: dict[str, list[float]] = {
-  'not in concept': [1.0, 0.0, 0.0],
-  'in concept': [0.9, 0.1, 0.0],
-  'a new data point': [0.1, 0.2, 0.3],
-  'a true draft point': [0.4, 0.5, 0.6],
-  'a false draft point': [0.7, 0.8, 0.9],
-}
-
-
-class TestEmbedding(TextEmbeddingSignal):
-  """A test embed function."""
-  name = 'test_embedding'
-
-  @override
-  def compute(self, data: Iterable[RichData]) -> Iterable[Item]:
-    """Embed the examples, use a hashmap to the vector for simplicity."""
-    for example in data:
-      if example not in EMBEDDING_MAP:
-        raise ValueError(f'Example "{str(example)}" not in embedding map')
-      yield [lilac_embedding(0, len(example), np.array(EMBEDDING_MAP[cast(str, example)]))]
-
-
-@pytest.fixture(scope='module', autouse=True)
-def setup_teardown() -> Generator:
-  set_default_dataset_cls(DatasetDuckDB)
-  register_signal(TestEmbedding)
-
-  # Unit test runs.
-  yield
-
-  # Teardown.
-  clear_signal_registry()
-
-
-@pytest.mark.parametrize('db_cls', ALL_CONCEPT_DBS)
-class ConceptDBSuite:
-
-  def test_create_concept(self, db_cls: Type[ConceptDB]) -> None:
-    db = db_cls()
-    db.create(namespace='test', name='test_concept', type=SignalInputType.TEXT)
-
-    assert db.list() == [
-      ConceptInfo(
-        namespace='test', name='test_concept', type=SignalInputType.TEXT, drafts=[DRAFT_MAIN])
-    ]
-
-    # Make sure list with drafts relects the drafts.
-    train_data = [
-      ExampleIn(label=False, text='not in concept', draft='test_draft'),
-      ExampleIn(label=True, text='in concept', draft='test_draft')
-    ]
-    db.edit('test', 'test_concept', ConceptUpdate(insert=train_data))
-
-    assert db.list() == [
-      ConceptInfo(
-        namespace='test',
-        name='test_concept',
-        type=SignalInputType.TEXT,
-        drafts=[DRAFT_MAIN, 'test_draft'])
-    ]
-
-  def test_add_example(self, db_cls: Type[ConceptDB]) -> None:
-    db = db_cls()
-    namespace = 'test'
-    concept_name = 'test_concept'
-    train_data = [
-      ExampleIn(label=False, text='not in concept'),
-      ExampleIn(label=True, text='in concept')
-    ]
-    db.create(namespace=namespace, name=concept_name, type=SignalInputType.TEXT)
-    db.edit(namespace, concept_name, ConceptUpdate(insert=train_data))
-
-    concept = db.get(namespace, concept_name)
-
-    assert concept is not None
-
-    keys = list(concept.data.keys())
-    assert concept == Concept(
-      namespace=namespace,
-      concept_name=concept_name,
-      type=SignalInputType.TEXT,
-      data={
-        keys[0]: Example(id=keys[0], label=False, text='not in concept'),
-        keys[1]: Example(id=keys[1], label=True, text='in concept')
-      },
-      version=1)
-
-    # Add a draft labels.
-    db.edit(
-      namespace, concept_name,
-      ConceptUpdate(insert=[
-        ExampleIn(label=False, text='really not in concept', draft='test_draft'),
-        ExampleIn(label=True, text='really in concept', draft='test_draft')
-      ]))
-
-    concept = db.get(namespace, concept_name)
-    assert concept is not None
-
-    keys = list(concept.data.keys())
-    assert concept == Concept(
-      namespace=namespace,
-      concept_name=concept_name,
-      type=SignalInputType.TEXT,
-      data={
-        keys[0]: Example(id=keys[0], label=False, text='not in concept'),
-        keys[1]: Example(id=keys[1], label=True, text='in concept'),
-        keys[2]: Example(id=keys[2], label=False, text='really not in concept', draft='test_draft'),
-        keys[3]: Example(id=keys[3], label=True, text='really in concept', draft='test_draft'),
-      },
-      version=2)
-
-  def test_update_concept(self, db_cls: Type[ConceptDB]) -> None:
-    db = db_cls()
-    namespace = 'test'
-    concept_name = 'test_concept'
-    train_data = [
-      ExampleIn(label=False, text='not in concept'),
-      ExampleIn(label=True, text='in concept'),
-      ExampleIn(label=False, text='really not in concept', draft='test_draft'),
-      ExampleIn(label=True, text='really in concept', draft='test_draft')
-    ]
-    db.create(namespace=namespace, name=concept_name, type=SignalInputType.TEXT)
-    db.edit(namespace, concept_name, ConceptUpdate(insert=train_data))
-
-    concept = db.get(namespace, concept_name)
-    assert concept is not None
-
-    keys = list(concept.data.keys())
-    # Edit the first example.
-    db.edit(
-      namespace, concept_name,
-      ConceptUpdate(update=[Example(id=keys[0], label=False, text='not in concept, updated')]))
-    concept = db.get(namespace, concept_name)
-
-    assert concept == Concept(
-      namespace=namespace,
-      concept_name=concept_name,
-      type=SignalInputType.TEXT,
-      data={
-        # The first example should be updated alone.
-        keys[0]: Example(id=keys[0], label=False, text='not in concept, updated'),
-        keys[1]: Example(id=keys[1], label=True, text='in concept'),
-        # Drafts are untouched.
-        keys[2]: Example(id=keys[2], label=False, text='really not in concept', draft='test_draft'),
-        keys[3]: Example(id=keys[3], label=True, text='really in concept', draft='test_draft'),
-      },
-      version=2)
-
-    # Edit the second example on the draft.
-    db.edit(
-      namespace, concept_name,
-      ConceptUpdate(update=[
-        Example(id=keys[3], label=True, text='really in concept, updated', draft='test_draft')
-      ]))
-    concept = db.get(namespace, concept_name)
-
-    assert concept == Concept(
-      namespace=namespace,
-      concept_name=concept_name,
-      type=SignalInputType.TEXT,
-      data={
-        # Main remains the same.
-        keys[0]: Example(id=keys[0], label=False, text='not in concept, updated'),
-        keys[1]: Example(id=keys[1], label=True, text='in concept'),
-        keys[2]: Example(id=keys[2], label=False, text='really not in concept', draft='test_draft'),
-        keys[3]: Example(
-          id=keys[3], label=True, text='really in concept, updated', draft='test_draft'),
-      },
-      version=3)
-
-  def test_remove_concept(self, db_cls: Type[ConceptDB]) -> None:
-    db = db_cls()
-    namespace = 'test'
-    concept_name = 'test_concept'
-    db.create(namespace=namespace, name=concept_name, type=SignalInputType.TEXT)
-
-    train_data = [
-      ExampleIn(label=False, text='not in concept'),
-      ExampleIn(label=True, text='in concept')
-    ]
-    db.edit(namespace, concept_name, ConceptUpdate(insert=train_data))
-    concept = db.get(namespace, concept_name)
-
-    db.remove(namespace, concept_name)
-
-    concept = db.get(namespace, concept_name)
-
-    assert concept is None
-
-  def test_remove_concept_examples(self, db_cls: Type[ConceptDB]) -> None:
-    db = db_cls()
-    namespace = 'test'
-    concept_name = 'test_concept'
-    db.create(namespace=namespace, name=concept_name, type=SignalInputType.TEXT)
-
-    train_data = [
-      ExampleIn(label=False, text='not in concept'),
-      ExampleIn(label=True, text='in concept')
-    ]
-    db.edit(namespace, concept_name, ConceptUpdate(insert=train_data))
-    concept = db.get(namespace, concept_name)
-    assert concept is not None
-
-    keys = list(concept.data.keys())
-
-    db.edit(namespace, concept_name, ConceptUpdate(remove=[keys[0]]))
-    concept = db.get(namespace, concept_name)
-
-    assert concept == Concept(
-      namespace=namespace,
-      concept_name=concept_name,
-      type=SignalInputType.TEXT,
-      data={
-        # key_0 was removed.
-        keys[1]: Example(id=keys[1], label=True, text='in concept')
-      },
-      version=2)
-
-  def test_remove_concept_examples_draft(self, db_cls: Type[ConceptDB]) -> None:
-    db = db_cls()
-    namespace = 'test'
-    concept_name = 'test_concept'
-    train_data = [
-      ExampleIn(label=False, text='not in concept'),
-      ExampleIn(label=True, text='in concept'),
-      ExampleIn(label=False, text='really not in concept', draft='test_draft'),
-      ExampleIn(label=True, text='really in concept', draft='test_draft')
-    ]
-    db.create(namespace=namespace, name=concept_name, type=SignalInputType.TEXT)
-    db.edit(namespace, concept_name, ConceptUpdate(insert=train_data))
-    concept = db.get(namespace, concept_name)
-    assert concept is not None
-
-    keys = list(concept.data.keys())
-
-    db.edit(namespace, concept_name, ConceptUpdate(remove=[keys[2]]))
-    concept = db.get(namespace, concept_name)
-
-    assert concept == Concept(
-      namespace=namespace,
-      concept_name=concept_name,
-      type=SignalInputType.TEXT,
-      data={
-        keys[0]: Example(id=keys[0], label=False, text='not in concept'),
-        keys[1]: Example(id=keys[1], label=True, text='in concept'),
-        # The first draft example is removed.
-        keys[3]: Example(id=keys[3], label=True, text='really in concept', draft='test_draft'),
-      },
-      version=2)
-
-  def test_remove_invalid_id(self, db_cls: Type[ConceptDB]) -> None:
-    db = db_cls()
-    namespace = 'test'
-    concept_name = 'test_concept'
-    db.create(namespace=namespace, name=concept_name, type=SignalInputType.TEXT)
-
-    train_data = [
-      ExampleIn(label=False, text='not in concept'),
-      ExampleIn(label=True, text='in concept'),
-      ExampleIn(label=False, text='really not in concept', draft='test_draft'),
-      ExampleIn(label=True, text='really in concept', draft='test_draft')
-    ]
-    db.edit(namespace, concept_name, ConceptUpdate(insert=train_data))
-
-    with pytest.raises(ValueError, match='Example with id "invalid_id" does not exist'):
-      db.edit(namespace, concept_name, ConceptUpdate(remove=['invalid_id']))
-
-  def test_edit_before_creation(self, db_cls: Type[ConceptDB]) -> None:
-    db = db_cls()
-    namespace = 'test'
-    concept_name = 'test_concept'
-
-    with pytest.raises(
-        ValueError, match='Concept with namespace "test" and name "test_concept" does not exist'):
-      db.edit(namespace, concept_name,
-              ConceptUpdate(insert=[
-                ExampleIn(label=False, text='not in concept'),
-              ]))
-
-  def test_edit_invalid_id(self, db_cls: Type[ConceptDB]) -> None:
-    db = db_cls()
-    namespace = 'test'
-    concept_name = 'test_concept'
-    db.create(namespace=namespace, name=concept_name, type=SignalInputType.TEXT)
-
-    train_data = [
-      ExampleIn(label=False, text='not in concept'),
-      ExampleIn(label=True, text='in concept')
-    ]
-    db.edit(namespace, concept_name, ConceptUpdate(insert=train_data))
-
-    with pytest.raises(ValueError, match='Example with id "invalid_id" does not exist'):
-      db.edit(namespace, concept_name,
-              ConceptUpdate(update=[Example(id='invalid_id', label=False, text='not in concept')]))
-
-  def test_merge_draft(self, db_cls: Type[ConceptDB]) -> None:
-    db = db_cls()
-    namespace = 'test'
-    concept_name = 'test_concept'
-    db.create(namespace=namespace, name=concept_name, type=SignalInputType.TEXT)
-
-    train_data = [
-      ExampleIn(label=True, text='hello'),
-      ExampleIn(label=False, text='world'),
-      ExampleIn(label=True, text='hello draft 1', draft='draft1'),
-      ExampleIn(label=False, text='world draft 1', draft='draft1'),
-      # Duplicate of main.
-      ExampleIn(label=False, text='hello', draft='draft2'),
-      ExampleIn(label=True, text='world draft 2', draft='draft2'),
-    ]
-    db.edit(namespace, concept_name, ConceptUpdate(insert=train_data))
-
-    db.merge_draft(namespace, concept_name, 'draft1')
-
-    concept = db.get(namespace, concept_name)
-    assert concept is not None
-    keys = list(concept.data.keys())
-
-    assert concept.dict() == Concept(
-      namespace='test',
-      concept_name='test_concept',
-      type=SignalInputType.TEXT,
-      data={
-        keys[0]: Example(id=keys[0], label=True, text='hello'),
-        keys[1]: Example(id=keys[1], label=False, text='world'),
-        # Draft examples are merged.
-        keys[2]: Example(id=keys[2], label=True, text='hello draft 1'),
-        keys[3]: Example(id=keys[3], label=False, text='world draft 1'),
-        # Draft 2 is untouched.
-        keys[4]: Example(id=keys[4], label=False, text='hello', draft='draft2'),
-        keys[5]: Example(id=keys[5], label=True, text='world draft 2', draft='draft2'),
-      },
-      version=2).dict()
-
-    db.merge_draft(namespace, concept_name, 'draft2')
-
-    concept = db.get(namespace, concept_name)
-    assert concept is not None
-
-    assert concept == Concept(
-      namespace='test',
-      concept_name='test_concept',
-      type=SignalInputType.TEXT,
-      data={
-        # The first example is a duplicate of the label from the draft, so it is removed.
-        keys[1]: Example(id=keys[1], label=False, text='world'),
-        # Draft examples are merged.
-        keys[2]: Example(id=keys[2], label=True, text='hello draft 1'),
-        keys[3]: Example(id=keys[3], label=False, text='world draft 1'),
-        # Draft examples are merged.
-        keys[4]: Example(id=keys[4], label=False, text='hello'),
-        keys[5]: Example(id=keys[5], label=True, text='world draft 2'),
-      },
-      version=3)
-
-
-def _make_test_concept_model(
-    concept_db: ConceptDB,
-    logistic_models: dict[DraftId, LogisticEmbeddingModel] = {}) -> ConceptModel:
-  namespace = 'test'
-  concept_name = 'test_concept'
-  concept_db.create(namespace=namespace, name=concept_name, type=SignalInputType.TEXT)
-
-  train_data = [
-    ExampleIn(label=False, text='not in concept'),
-    ExampleIn(label=True, text='in concept')
-  ]
-  concept_db.edit(namespace, concept_name, ConceptUpdate(insert=train_data))
-  model = ConceptModel(
-    namespace='test', concept_name='test_concept', embedding_name='test_embedding')
-  model._logistic_models = logistic_models
-  return model
-
-
-class TestLogisticModel(LogisticEmbeddingModel):
-
-  @override
-  def score_embeddings(self, embeddings: np.ndarray) -> np.ndarray:
-    """Get the scores for the provided embeddings."""
-    return np.array([.1])
-
-  @override
-  def fit(self, embeddings: np.ndarray, labels: list[bool],
-          implicit_negatives: Optional[np.ndarray]) -> None:
-    pass
-
-
-@pytest.mark.parametrize('concept_db_cls', ALL_CONCEPT_DBS)
-@pytest.mark.parametrize('model_db_cls', ALL_CONCEPT_MODEL_DBS)
-class ConceptModelDBSuite:
-
-  def test_save_and_get_model(self, concept_db_cls: Type[ConceptDB],
-                              model_db_cls: Type[ConceptModelDB]) -> None:
-    concept_db = concept_db_cls()
-    model_db = model_db_cls(concept_db)
-    model = _make_test_concept_model(concept_db)
-    model_db.sync(model)
-    retrieved_model = model_db.get(
-      namespace='test', concept_name='test_concept', embedding_name='test_embedding')
-    if not retrieved_model:
-      retrieved_model = model_db.create(
-        namespace='test', concept_name='test_concept', embedding_name='test_embedding')
-    assert retrieved_model.namespace == model.namespace
-    assert retrieved_model.concept_name == model.concept_name
-    assert retrieved_model.embedding_name == model.embedding_name
-    assert retrieved_model.version == model.version
-    assert retrieved_model.column_info == model.column_info
-
-  def test_sync_model(self, concept_db_cls: Type[ConceptDB], model_db_cls: Type[ConceptModelDB],
-                      mocker: MockerFixture) -> None:
-
-    concept_db = concept_db_cls()
-    model_db = model_db_cls(concept_db)
-    logistic_model = TestLogisticModel()
-    score_embeddings_mock = mocker.spy(TestLogisticModel, 'score_embeddings')
-    fit_mock = mocker.spy(TestLogisticModel, 'fit')
-
-    model = _make_test_concept_model(concept_db, logistic_models={DRAFT_MAIN: logistic_model})
-
-    assert model_db.in_sync(model) is False
-    assert score_embeddings_mock.call_count == 0
-    assert fit_mock.call_count == 0
-
-    model_db.sync(model)
-
-    assert model_db.in_sync(model) is True
-    assert score_embeddings_mock.call_count == 0
-    assert fit_mock.call_count == 1
-
-  def test_out_of_sync_model(self, concept_db_cls: Type[ConceptDB],
-                             model_db_cls: Type[ConceptModelDB], mocker: MockerFixture) -> None:
-    concept_db = concept_db_cls()
-    model_db = model_db_cls(concept_db)
-    score_embeddings_mock = mocker.spy(TestLogisticModel, 'score_embeddings')
-    fit_mock = mocker.spy(TestLogisticModel, 'fit')
-    logistic_model = TestLogisticModel()
-    model = _make_test_concept_model(concept_db, logistic_models={DRAFT_MAIN: logistic_model})
(diff truncated in this view at line 480 of 606)
|
481 |
-
model_db.sync(model)
|
482 |
-
assert model_db.in_sync(model) is True
|
483 |
-
assert score_embeddings_mock.call_count == 0
|
484 |
-
assert fit_mock.call_count == 1
|
485 |
-
|
486 |
-
(called_model, called_embeddings, called_labels,
|
487 |
-
called_implicit_negatives) = fit_mock.call_args_list[-1].args
|
488 |
-
assert called_model == logistic_model
|
489 |
-
np.testing.assert_array_equal(
|
490 |
-
called_embeddings, np.array([EMBEDDING_MAP['not in concept'], EMBEDDING_MAP['in concept']]))
|
491 |
-
assert called_labels == [False, True]
|
492 |
-
assert called_implicit_negatives is None
|
493 |
-
|
494 |
-
# Edit the concept.
|
495 |
-
concept_db.edit('test', 'test_concept',
|
496 |
-
ConceptUpdate(insert=[ExampleIn(label=False, text='a new data point')]))
|
497 |
-
|
498 |
-
# Make sure the model is out of sync.
|
499 |
-
assert model_db.in_sync(model) is False
|
500 |
-
assert score_embeddings_mock.call_count == 0
|
501 |
-
assert fit_mock.call_count == 1
|
502 |
-
|
503 |
-
model_db.sync(model)
|
504 |
-
assert model_db.in_sync(model) is True
|
505 |
-
assert score_embeddings_mock.call_count == 0
|
506 |
-
assert fit_mock.call_count == 2
|
507 |
-
# Fit is called again with new points on main only.
|
508 |
-
(called_model, called_embeddings, called_labels,
|
509 |
-
called_implicit_negatives) = fit_mock.call_args_list[-1].args
|
510 |
-
assert called_model == logistic_model
|
511 |
-
np.testing.assert_array_equal(
|
512 |
-
called_embeddings,
|
513 |
-
np.array([
|
514 |
-
EMBEDDING_MAP['not in concept'], EMBEDDING_MAP['in concept'],
|
515 |
-
EMBEDDING_MAP['a new data point']
|
516 |
-
]))
|
517 |
-
assert called_labels == [False, True, False]
|
518 |
-
assert called_implicit_negatives is None
|
519 |
-
|
520 |
-
def test_out_of_sync_draft_model(self, concept_db_cls: Type[ConceptDB],
|
521 |
-
model_db_cls: Type[ConceptModelDB],
|
522 |
-
mocker: MockerFixture) -> None:
|
523 |
-
concept_db = concept_db_cls()
|
524 |
-
model_db = model_db_cls(concept_db)
|
525 |
-
score_embeddings_mock = mocker.spy(TestLogisticModel, 'score_embeddings')
|
526 |
-
fit_mock = mocker.spy(TestLogisticModel, 'fit')
|
527 |
-
main_model = TestLogisticModel()
|
528 |
-
draft_model = TestLogisticModel()
|
529 |
-
model = _make_test_concept_model(
|
530 |
-
concept_db, logistic_models={
|
531 |
-
DRAFT_MAIN: main_model,
|
532 |
-
'test_draft': draft_model
|
533 |
-
})
|
534 |
-
model_db.sync(model)
|
535 |
-
assert model_db.in_sync(model) is True
|
536 |
-
assert score_embeddings_mock.call_count == 0
|
537 |
-
assert fit_mock.call_count == 1
|
538 |
-
|
539 |
-
# Make sure drafts cause the model to be out of sync.
|
540 |
-
concept_db.edit(
|
541 |
-
'test',
|
542 |
-
'test_concept',
|
543 |
-
ConceptUpdate(insert=[
|
544 |
-
ExampleIn(label=True, text='a true draft point', draft='test_draft'),
|
545 |
-
ExampleIn(label=False, text='a false draft point', draft='test_draft'),
|
546 |
-
# This point exists in main, but we switched the label.
|
547 |
-
ExampleIn(label=False, text='in concept', draft='test_draft'),
|
548 |
-
]))
|
549 |
-
|
550 |
-
# Make sure the model is out of sync.
|
551 |
-
assert model_db.in_sync(model) is False
|
552 |
-
assert score_embeddings_mock.call_count == 0
|
553 |
-
assert fit_mock.call_count == 1
|
554 |
-
|
555 |
-
model_db.sync(model)
|
556 |
-
assert model_db.in_sync(model) is True
|
557 |
-
assert score_embeddings_mock.call_count == 0
|
558 |
-
assert fit_mock.call_count == 3 # Fit is called on both the draft, and main.
|
559 |
-
|
560 |
-
# Fit is called again with the same points.
|
561 |
-
((called_model, called_embeddings, called_labels, called_implicit_negatives),
|
562 |
-
(called_draft_model, called_draft_embeddings, called_draft_labels,
|
563 |
-
called_draft_implicit_negatives)) = (
|
564 |
-
c.args for c in fit_mock.call_args_list[-2:])
|
565 |
-
|
566 |
-
# The draft model is called with the data from main, and the data from draft.
|
567 |
-
assert called_draft_model == draft_model
|
568 |
-
np.testing.assert_array_equal(
|
569 |
-
called_draft_embeddings,
|
570 |
-
np.array([
|
571 |
-
EMBEDDING_MAP['a true draft point'], EMBEDDING_MAP['a false draft point'],
|
572 |
-
EMBEDDING_MAP['in concept'], EMBEDDING_MAP['not in concept']
|
573 |
-
]))
|
574 |
-
assert called_draft_labels == [
|
575 |
-
True,
|
576 |
-
False,
|
577 |
-
# This was overriden by the draft.
|
578 |
-
False,
|
579 |
-
False
|
580 |
-
]
|
581 |
-
assert called_draft_implicit_negatives is None
|
582 |
-
|
583 |
-
# The main model was fit without the data from the draft.
|
584 |
-
assert called_model == main_model
|
585 |
-
np.testing.assert_array_equal(
|
586 |
-
called_embeddings, np.array([EMBEDDING_MAP['not in concept'], EMBEDDING_MAP['in concept']]))
|
587 |
-
assert called_labels == [False, True]
|
588 |
-
assert called_implicit_negatives is None
|
589 |
-
|
590 |
-
def test_embedding_not_found_in_map(self, concept_db_cls: Type[ConceptDB],
|
591 |
-
model_db_cls: Type[ConceptModelDB]) -> None:
|
592 |
-
concept_db = concept_db_cls()
|
593 |
-
model_db = model_db_cls(concept_db)
|
594 |
-
model = _make_test_concept_model(concept_db)
|
595 |
-
model_db.sync(model)
|
596 |
-
|
597 |
-
# Edit the concept.
|
598 |
-
concept_db.edit('test', 'test_concept',
|
599 |
-
ConceptUpdate(insert=[ExampleIn(label=False, text='unknown text')]))
|
600 |
-
|
601 |
-
# Make sure the model is out of sync.
|
602 |
-
assert model_db.in_sync(model) is False
|
603 |
-
|
604 |
-
with pytest.raises(ValueError, match='Example "unknown text" not in embedding map'):
|
605 |
-
model_db.sync(model)
|
606 |
-
model_db.sync(model)
|
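An aside on the draft semantics that the deleted `test_merge_draft` above asserts: merging a draft promotes its examples to main, a main example whose text also appears in the merged draft is dropped in favor of the draft's (possibly relabeled) copy, and other drafts are left untouched. A minimal sketch of that rule, assuming a stand-in `Example` that carries only the fields the test exercises (this is an illustration, not the project's implementation):

```python
from dataclasses import dataclass, replace
from typing import Optional


@dataclass
class Example:
  """Hypothetical stand-in for the concept DB's Example model."""
  id: str
  label: bool
  text: str
  draft: Optional[str] = None  # None means the example lives on main.


def merge_draft(data: dict[str, Example], draft: str) -> dict[str, Example]:
  """Promote `draft` examples to main, dropping main duplicates by text."""
  draft_texts = {ex.text for ex in data.values() if ex.draft == draft}
  merged: dict[str, Example] = {}
  for key, ex in data.items():
    if ex.draft is None and ex.text in draft_texts:
      continue  # The draft's copy of this text wins over main's.
    if ex.draft == draft:
      ex = replace(ex, draft=None)  # Promote the draft example to main.
    merged[key] = ex
  return merged
```

Running this against the fixture above reproduces the assertions: merging 'draft2' removes main's 'hello' and keeps the draft's relabeled copy, while 'draft2' examples survive a merge of 'draft1' unchanged.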
src/data/dataset_compute_signal_chain_test.py
DELETED
@@ -1,255 +0,0 @@
-"""Tests for dataset.compute_signal() when signals are chained."""
-
-import re
-from typing import Iterable, List, Optional, cast
-
-import numpy as np
-import pytest
-from pytest_mock import MockerFixture
-from typing_extensions import override
-
-from ..embeddings.vector_store import VectorStore
-from ..schema import UUID_COLUMN, Field, Item, RichData, VectorKey, field, schema
-from ..signals.signal import (
-  TextEmbeddingModelSignal,
-  TextEmbeddingSignal,
-  TextSignal,
-  TextSplitterSignal,
-  clear_signal_registry,
-  register_signal,
-)
-from .dataset import DatasetManifest
-from .dataset_test_utils import (
-  TEST_DATASET_NAME,
-  TEST_NAMESPACE,
-  TestDataMaker,
-  enriched_embedding_span,
-  enriched_embedding_span_field,
-  enriched_item,
-)
-from .dataset_utils import lilac_embedding, lilac_span
-
-SIMPLE_ITEMS: list[Item] = [{
-  UUID_COLUMN: '1',
-  'str': 'a',
-  'int': 1,
-  'bool': False,
-  'float': 3.0
-}, {
-  UUID_COLUMN: '2',
-  'str': 'b',
-  'int': 2,
-  'bool': True,
-  'float': 2.0
-}, {
-  UUID_COLUMN: '3',
-  'str': 'b',
-  'int': 2,
-  'bool': True,
-  'float': 1.0
-}]
-
-EMBEDDINGS: list[tuple[str, list[float]]] = [('hello.', [1.0, 0.0, 0.0]),
-                                             ('hello2.', [1.0, 1.0, 0.0]),
-                                             ('hello world.', [1.0, 1.0, 1.0]),
-                                             ('hello world2.', [2.0, 1.0, 1.0])]
-
-STR_EMBEDDINGS: dict[str, list[float]] = {text: embedding for text, embedding in EMBEDDINGS}
-
-
-class TestSplitter(TextSplitterSignal):
-  """Split documents into sentence by splitting on period."""
-  name = 'test_splitter'
-
-  @override
-  def compute(self, data: Iterable[RichData]) -> Iterable[Item]:
-    for text in data:
-      if not isinstance(text, str):
-        raise ValueError(f'Expected text to be a string, got {type(text)} instead.')
-      sentences = [f'{sentence.strip()}.' for sentence in text.split('.') if sentence]
-      yield [
-        lilac_span(text.index(sentence),
-                   text.index(sentence) + len(sentence)) for sentence in sentences
-      ]
-
-
-class TestEmbedding(TextEmbeddingSignal):
-  """A test embed function."""
-  name = 'test_embedding'
-
-  @override
-  def compute(self, data: Iterable[RichData]) -> Iterable[Item]:
-    """Call the embedding function."""
-    for example in data:
-      yield [lilac_embedding(0, len(example), np.array(STR_EMBEDDINGS[cast(str, example)]))]
-
-
-class TestEmbeddingSumSignal(TextEmbeddingModelSignal):
-  """Sums the embeddings to return a single floating point value."""
-  name = 'test_embedding_sum'
-
-  @override
-  def fields(self) -> Field:
-    return field('float32')
-
-  @override
-  def vector_compute(self, keys: Iterable[VectorKey], vector_store: VectorStore) -> Iterable[Item]:
-    # The signal just sums the values of the embedding.
-    embedding_sums = vector_store.get(keys).sum(axis=1)
-    for embedding_sum in embedding_sums.tolist():
-      yield embedding_sum
-
-
-@pytest.fixture(scope='module', autouse=True)
-def setup_teardown() -> Iterable[None]:
-  # Setup.
-  register_signal(TestSplitter)
-  register_signal(TestEmbedding)
-  register_signal(TestEmbeddingSumSignal)
-  register_signal(NamedEntity)
-  # Unit test runs.
-  yield
-  # Teardown.
-  clear_signal_registry()
-
-
-def test_manual_embedding_signal(make_test_data: TestDataMaker, mocker: MockerFixture) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello.',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'hello2.',
-  }])
-
-  embed_mock = mocker.spy(TestEmbedding, 'compute')
-
-  embedding_signal = TestEmbedding()
-  dataset.compute_signal(embedding_signal, 'text')
-  embedding_sum_signal = TestEmbeddingSumSignal(embedding=TestEmbedding.name)
-  dataset.compute_signal(embedding_sum_signal, 'text')
-
-  # Make sure the embedding signal is not called twice.
-  assert embed_mock.call_count == 1
-
-  assert dataset.manifest() == DatasetManifest(
-    namespace=TEST_NAMESPACE,
-    dataset_name=TEST_DATASET_NAME,
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'text': field(
-        'string',
-        fields={
-          'test_embedding': field(
-            signal=embedding_signal.dict(),
-            fields=[
-              enriched_embedding_span_field(
-                {'test_embedding_sum': field('float32', embedding_sum_signal.dict())})
-            ])
-        }),
-    }),
-    num_items=2)
-
-  result = dataset.select_rows()
-  expected_result = [{
-    UUID_COLUMN: '1',
-    'text': enriched_item(
-      'hello.', {'test_embedding': [enriched_embedding_span(0, 6, {'test_embedding_sum': 1.0})]})
-  }, {
-    UUID_COLUMN: '2',
-    'text': enriched_item(
-      'hello2.', {'test_embedding': [enriched_embedding_span(0, 7, {'test_embedding_sum': 2.0})]})
-  }]
-  assert list(result) == expected_result
-
-
-def test_auto_embedding_signal(make_test_data: TestDataMaker, mocker: MockerFixture) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello.',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'hello2.',
-  }])
-
-  embed_mock = mocker.spy(TestEmbedding, 'compute')
-
-  # The embedding is automatically computed from the TestEmbeddingSumSignal.
-  embedding_sum_signal = TestEmbeddingSumSignal(embedding=TestEmbedding.name)
-  dataset.compute_signal(embedding_sum_signal, 'text')
-
-  # Make sure the embedding signal is not called twice.
-  assert embed_mock.call_count == 1
-
-  assert dataset.manifest() == DatasetManifest(
-    namespace=TEST_NAMESPACE,
-    dataset_name=TEST_DATASET_NAME,
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'text': field(
-        'string',
-        fields={
-          'test_embedding': field(
-            signal=embedding_sum_signal._embedding_signal.dict(),
-            fields=[
-              enriched_embedding_span_field(
-                {'test_embedding_sum': field('float32', embedding_sum_signal.dict())})
-            ])
-        }),
-    }),
-    num_items=2)
-
-  result = dataset.select_rows()
-  expected_result = [{
-    UUID_COLUMN: '1',
-    'text': enriched_item(
-      'hello.', {'test_embedding': [enriched_embedding_span(0, 6, {'test_embedding_sum': 1.0})]})
-  }, {
-    UUID_COLUMN: '2',
-    'text': enriched_item(
-      'hello2.', {'test_embedding': [enriched_embedding_span(0, 7, {'test_embedding_sum': 2.0})]})
-  }]
-  assert list(result) == expected_result
-
-
-ENTITY_REGEX = r'[A-Za-z]+@[A-Za-z]+'
-
-
-class NamedEntity(TextSignal):
-  """Find special entities."""
-  name = 'entity'
-
-  @override
-  def fields(self) -> Field:
-    return field(fields=['string_span'])
-
-  @override
-  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[List[Item]]]:
-    for text in data:
-      if not isinstance(text, str):
-        yield None
-        continue
-      yield [lilac_span(m.start(0), m.end(0)) for m in re.finditer(ENTITY_REGEX, text)]
-
-
-def test_entity_on_split_signal(make_test_data: TestDataMaker) -> None:
-  text = 'Hello nik@test. Here are some other entities like pii@gmail and all@lilac.'
-  dataset = make_test_data([{UUID_COLUMN: '1', 'text': text}])
-  entity = NamedEntity()
-  dataset.compute_signal(TestSplitter(), 'text')
-  dataset.compute_signal(entity, ('text', 'test_splitter', '*'))
-
-  result = dataset.select_rows(['text'])
-  assert list(result) == [{
-    UUID_COLUMN: '1',
-    'text': enriched_item(
-      text, {
-        'test_splitter': [
-          lilac_span(0, 15, {'entity': [lilac_span(6, 14)]}),
-          lilac_span(16, 74, {'entity': [
-            lilac_span(50, 59),
-            lilac_span(64, 73),
-          ]}),
-        ]
-      })
-  }]
src/data/dataset_compute_signal_test.py
DELETED
@@ -1,669 +0,0 @@
-"""Tests for dataset.compute_signal()."""
-
-from typing import Iterable, Optional, Union, cast
-
-import numpy as np
-import pytest
-from typing_extensions import override
-
-from ..concepts.concept import ExampleIn
-from ..concepts.db_concept import ConceptUpdate, DiskConceptDB
-from ..schema import UUID_COLUMN, VALUE_KEY, Field, Item, RichData, SignalInputType, field, schema
-from ..signals.concept_scorer import ConceptScoreSignal
-from ..signals.signal import (
-  TextEmbeddingSignal,
-  TextSignal,
-  TextSplitterSignal,
-  clear_signal_registry,
-  register_signal,
-)
-from .dataset import Column, DatasetManifest, GroupsSortBy, SortOrder, val
-from .dataset_test_utils import (
-  TEST_DATASET_NAME,
-  TEST_NAMESPACE,
-  TestDataMaker,
-  enriched_embedding_span_field,
-  enriched_item,
-)
-from .dataset_utils import lilac_embedding, lilac_span
-
-SIMPLE_ITEMS: list[Item] = [{
-  UUID_COLUMN: '1',
-  'str': 'a',
-  'int': 1,
-  'bool': False,
-  'float': 3.0
-}, {
-  UUID_COLUMN: '2',
-  'str': 'b',
-  'int': 2,
-  'bool': True,
-  'float': 2.0
-}, {
-  UUID_COLUMN: '3',
-  'str': 'b',
-  'int': 2,
-  'bool': True,
-  'float': 1.0
-}]
-
-
-class TestInvalidSignal(TextSignal):
-  name = 'test_invalid_signal'
-
-  @override
-  def fields(self) -> Field:
-    return field('int32')
-
-  @override
-  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
-    # Return an invalid output that doesn't match the input length.
-    return []
-
-
-class TestSparseSignal(TextSignal):
-  name = 'test_sparse_signal'
-
-  @override
-  def fields(self) -> Field:
-    return field('int32')
-
-  @override
-  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
-    for text in data:
-      if text == 'hello':
-        # Skip this input.
-        yield None
-      else:
-        yield len(text)
-
-
-class TestSparseRichSignal(TextSignal):
-  """Find personally identifiable information (emails, phone numbers, etc)."""
-  name = 'test_sparse_rich_signal'
-
-  @override
-  def fields(self) -> Field:
-    return field(fields={'emails': ['string']})
-
-  @override
-  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
-    for text in data:
-      if text == 'hello':
-        # Skip this input.
-        yield None
-      else:
-        yield {'emails': ['test1@hello.com', 'test2@hello.com']}
-
-
-class TestParamSignal(TextSignal):
-  name = 'param_signal'
-  param: str
-
-  def fields(self) -> Field:
-    return field('string')
-
-  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
-    for text_content in data:
-      yield f'{str(text_content)}_{self.param}'
-
-
-class TestSignal(TextSignal):
-  name = 'test_signal'
-
-  @override
-  def fields(self) -> Field:
-    return field(fields={'len': 'int32', 'flen': 'float32'})
-
-  @override
-  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
-    return [{'len': len(text_content), 'flen': float(len(text_content))} for text_content in data]
-
-
-class TestSplitSignal(TextSplitterSignal):
-  """Split documents into sentence by splitting on period, generating entities."""
-  name = 'test_split'
-
-  @override
-  def compute(self, data: Iterable[RichData]) -> Iterable[Item]:
-    for text in data:
-      if not isinstance(text, str):
-        raise ValueError(f'Expected text to be a string, got {type(text)} instead.')
-      sentences = [f'{sentence.strip()}.' for sentence in text.split('.') if sentence]
-      yield [
-        lilac_span(text.index(sentence),
-                   text.index(sentence) + len(sentence)) for sentence in sentences
-      ]
-
-
-EMBEDDINGS: list[tuple[str, Union[list[float], list[list[float]]]]] = [
-  ('hello.', [1.0, 0.0, 0.0]),
-  # This embedding has an outer dimension of 1.
-  ('hello2.', [[1.0, 1.0, 0.0]]),
-  ('hello3.', [[0, 0, 1.]])
-]
-
-STR_EMBEDDINGS: dict[str, Union[list[float], list[list[float]]]] = {
-  text: embedding for text, embedding in EMBEDDINGS
-}
-
-
-class TestEmbedding(TextEmbeddingSignal):
-  """A test embed function."""
-  name = 'test_embedding'
-
-  @override
-  def compute(self, data: Iterable[RichData]) -> Iterable[Item]:
-    """Call the embedding function."""
-    for example in data:
-      example = cast(str, example)
-      yield [lilac_embedding(0, len(example), np.array(STR_EMBEDDINGS[example]))]
-
-
-class ComputedKeySignal(TextSignal):
-  name = 'computed_key'
-
-  @override
-  def fields(self) -> Field:
-    return field('int64')
-
-  @override
-  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
-    for text in data:
-      yield 1
-
-  def key(self, is_computed_signal: Optional[bool] = False) -> str:
-    return f'key_{is_computed_signal}'
-
-
-@pytest.fixture(scope='module', autouse=True)
-def setup_teardown() -> Iterable[None]:
-  # Setup.
-  register_signal(TestSparseSignal)
-  register_signal(TestSparseRichSignal)
-  register_signal(TestParamSignal)
-  register_signal(TestSignal)
-  register_signal(TestSplitSignal)
-  register_signal(TestEmbedding)
-  register_signal(ComputedKeySignal)
-  register_signal(ConceptScoreSignal)
-
-  # Unit test runs.
-  yield
-  # Teardown.
-  clear_signal_registry()
-
-
-def test_signal_output_validation(make_test_data: TestDataMaker) -> None:
-  signal = TestInvalidSignal()
-
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'hello world',
-  }])
-
-  with pytest.raises(
-      ValueError, match='The signal generated 0 values but the input data had 2 values.'):
-    dataset.compute_signal(signal, 'text')
-
-
-def test_sparse_signal(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'hello world',
-  }])
-
-  dataset.compute_signal(TestSparseSignal(), 'text')
-
-  result = dataset.select_rows(['text'])
-  assert list(result) == [{
-    UUID_COLUMN: '1',
-    'text': enriched_item('hello', {'test_sparse_signal': None})
-  }, {
-    UUID_COLUMN: '2',
-    'text': enriched_item('hello world', {'test_sparse_signal': 11})
-  }]
-
-
-def test_sparse_rich_signal(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'hello world',
-  }])
-
-  dataset.compute_signal(TestSparseRichSignal(), 'text')
-
-  result = dataset.select_rows(['text'])
-  assert list(result) == [{
-    UUID_COLUMN: '1',
-    'text': enriched_item('hello', {'test_sparse_rich_signal': None})
-  }, {
-    UUID_COLUMN: '2',
-    'text': enriched_item(
-      'hello world',
-      {'test_sparse_rich_signal': {
-        'emails': ['test1@hello.com', 'test2@hello.com']
-      }})
-  }]
-
-
-def test_source_joined_with_signal(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data(SIMPLE_ITEMS)
-  assert dataset.manifest() == DatasetManifest(
-    namespace=TEST_NAMESPACE,
-    dataset_name=TEST_DATASET_NAME,
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'str': 'string',
-      'int': 'int32',
-      'bool': 'boolean',
-      'float': 'float32',
-    }),
-    num_items=3)
-
-  test_signal = TestSignal()
-  dataset.compute_signal(test_signal, 'str')
-
-  # Check the enriched dataset manifest has 'text' enriched.
-  assert dataset.manifest() == DatasetManifest(
-    namespace=TEST_NAMESPACE,
-    dataset_name=TEST_DATASET_NAME,
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'str': field(
-        'string',
-        fields={
-          'test_signal': field(
-            signal=test_signal.dict(), fields={
-              'len': 'int32',
-              'flen': 'float32'
-            }),
-        }),
-      'int': 'int32',
-      'bool': 'boolean',
-      'float': 'float32',
-    }),
-    num_items=3)
-
-  result = dataset.select_rows(['str'])
-  assert list(result) == [{
-    UUID_COLUMN: '1',
-    'str': enriched_item('a', {'test_signal': {
-      'len': 1,
-      'flen': 1.0
-    }}),
-  }, {
-    UUID_COLUMN: '2',
-    'str': enriched_item('b', {'test_signal': {
-      'len': 1,
-      'flen': 1.0
-    }}),
-  }, {
-    UUID_COLUMN: '3',
-    'str': enriched_item('b', {'test_signal': {
-      'len': 1,
-      'flen': 1.0
-    }}),
-  }]
-
-  # Select a specific signal leaf test_signal.flen with val('str').
-  result = dataset.select_rows([val('str'), ('str', 'test_signal', 'flen')])
-
-  assert list(result) == [{
-    UUID_COLUMN: '1',
-    f'str.{VALUE_KEY}': 'a',
-    'str.test_signal.flen': 1.0
-  }, {
-    UUID_COLUMN: '2',
-    f'str.{VALUE_KEY}': 'b',
-    'str.test_signal.flen': 1.0
-  }, {
-    UUID_COLUMN: '3',
-    f'str.{VALUE_KEY}': 'b',
-    'str.test_signal.flen': 1.0
-  }]
-
-  # Select a specific signal leaf test_signal.flen and the whole 'str' subtree.
-  result = dataset.select_rows(['str', ('str', 'test_signal', 'flen')])
-
-  assert list(result) == [{
-    UUID_COLUMN: '1',
-    'str': enriched_item('a', {'test_signal': {
-      'len': 1,
-      'flen': 1.0
-    }}),
-    'str.test_signal.flen': 1.0
-  }, {
-    UUID_COLUMN: '2',
-    'str': enriched_item('b', {'test_signal': {
-      'len': 1,
-      'flen': 1.0
-    }}),
-    'str.test_signal.flen': 1.0
-  }, {
-    UUID_COLUMN: '3',
-    'str': enriched_item('b', {'test_signal': {
-      'len': 1,
-      'flen': 1.0
-    }}),
-    'str.test_signal.flen': 1.0
-  }]
-
-  # Select multiple signal leafs with aliasing.
-  result = dataset.select_rows([
-    val('str'),
-    Column(('str', 'test_signal', 'flen'), alias='flen'),
-    Column(('str', 'test_signal', 'len'), alias='len')
-  ])
-
-  assert list(result) == [{
-    UUID_COLUMN: '1',
-    f'str.{VALUE_KEY}': 'a',
-    'flen': 1.0,
-    'len': 1
-  }, {
-    UUID_COLUMN: '2',
-    f'str.{VALUE_KEY}': 'b',
-    'flen': 1.0,
-    'len': 1
-  }, {
-    UUID_COLUMN: '3',
-    f'str.{VALUE_KEY}': 'b',
-    'flen': 1.0,
-    'len': 1
-  }]
-
-
-def test_parameterized_signal(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello'
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'everybody'
-  }])
-  test_signal_a = TestParamSignal(param='a')
-  test_signal_b = TestParamSignal(param='b')
-  dataset.compute_signal(test_signal_a, 'text')
-  dataset.compute_signal(test_signal_b, 'text')
-
-  assert dataset.manifest() == DatasetManifest(
-    namespace=TEST_NAMESPACE,
-    dataset_name=TEST_DATASET_NAME,
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'text': field(
-        'string',
-        fields={
-          'param_signal(param=a)': field('string', test_signal_a.dict()),
-          'param_signal(param=b)': field('string', test_signal_b.dict()),
-        }),
-    }),
-    num_items=2)
-
-  result = dataset.select_rows(['text'])
-  assert list(result) == [{
-    UUID_COLUMN: '1',
-    'text': enriched_item('hello', {
-      'param_signal(param=a)': 'hello_a',
-      'param_signal(param=b)': 'hello_b',
-    })
-  }, {
-    UUID_COLUMN: '2',
-    'text': enriched_item('everybody', {
-      'param_signal(param=a)': 'everybody_a',
-      'param_signal(param=b)': 'everybody_b',
-    })
-  }]
-
-
-def test_split_signal(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': '[1, 1] first sentence. [1, 1] second sentence.',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'b2 [2, 1] first sentence. [2, 1] second sentence.',
-  }])
-
-  signal = TestSplitSignal()
-  dataset.compute_signal(signal, 'text')
-
-  assert dataset.manifest() == DatasetManifest(
-    namespace=TEST_NAMESPACE,
-    dataset_name=TEST_DATASET_NAME,
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'text': field(
-        'string', fields={'test_split': field(signal=signal.dict(), fields=[field('string_span')])})
-    }),
-    num_items=2)
-
-  result = dataset.select_rows(['text'])
-  expected_result = [{
-    UUID_COLUMN: '1',
-    'text': enriched_item('[1, 1] first sentence. [1, 1] second sentence.',
-                          {'test_split': [lilac_span(0, 22), lilac_span(23, 46)]})
-  }, {
-    UUID_COLUMN: '2',
-    'text': enriched_item('b2 [2, 1] first sentence. [2, 1] second sentence.',
-                          {'test_split': [
-                            lilac_span(0, 25),
-                            lilac_span(26, 49),
-                          ]})
-  }]
-  assert list(result) == expected_result
-
-
-def test_signal_on_repeated_field(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': ['hello', 'everybody'],
-  }, {
-    UUID_COLUMN: '2',
-    'text': ['hello2', 'everybody2'],
-  }])
-  test_signal = TestSignal()
-  # Run the signal on the repeated field.
-  dataset.compute_signal(test_signal, ('text', '*'))
-
-  # Check the enriched dataset manifest has 'text' enriched.
-  assert dataset.manifest() == DatasetManifest(
-    namespace=TEST_NAMESPACE,
-    dataset_name=TEST_DATASET_NAME,
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'text': field(fields=[
-        field(
-          'string',
-          fields={
-            'test_signal': field(
-              signal=test_signal.dict(), fields={
-                'len': 'int32',
-                'flen': 'float32'
-              })
-          })
-      ])
-    }),
-    num_items=2)
-
-  result = dataset.select_rows([('text', '*')])
-
-  assert list(result) == [{
-    UUID_COLUMN: '1',
-    'text.*': [
-      enriched_item('hello', {'test_signal': {
-        'len': 5,
-        'flen': 5.0
-      }}),
-      enriched_item('everybody', {'test_signal': {
-        'len': 9,
-        'flen': 9.0
-      }})
-    ]
-  }, {
-    UUID_COLUMN: '2',
-    'text.*': [
-      enriched_item('hello2', {'test_signal': {
-        'len': 6,
-        'flen': 6.0
-      }}),
-      enriched_item('everybody2', {'test_signal': {
-        'len': 10,
-        'flen': 10.0
-      }})
-    ]
-  }]
-
-
-def test_text_splitter(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': '[1, 1] first sentence. [1, 1] second sentence.',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'b2 [2, 1] first sentence. [2, 1] second sentence.',
-  }])
-
-  dataset.compute_signal(TestSplitSignal(), 'text')
-
-  result = dataset.select_rows(['text'])
-  expected_result = [{
-    UUID_COLUMN: '1',
-    'text': enriched_item('[1, 1] first sentence. [1, 1] second sentence.',
-                          {'test_split': [
-                            lilac_span(0, 22),
-                            lilac_span(23, 46),
-                          ]}),
-  }, {
-    UUID_COLUMN: '2',
-    'text': enriched_item('b2 [2, 1] first sentence. [2, 1] second sentence.',
-                          {'test_split': [
-                            lilac_span(0, 25),
-                            lilac_span(26, 49),
-                          ]}),
-  }]
-  assert list(result) == expected_result
-
-
-def test_embedding_signal(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello.',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'hello2.',
-  }])
-
-  embedding_signal = TestEmbedding()
-  dataset.compute_signal(embedding_signal, 'text')
-
-  assert dataset.manifest() == DatasetManifest(
-    namespace=TEST_NAMESPACE,
-    dataset_name=TEST_DATASET_NAME,
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'text': field(
-        'string',
-        fields={
-          'test_embedding': field(
-            signal=embedding_signal.dict(), fields=[enriched_embedding_span_field()])
-        }),
-    }),
-    num_items=2)
-
-  result = dataset.select_rows()
-
-  # Embeddings are replaced with "None".
-  expected_result = [{
-    UUID_COLUMN: '1',
-    'text': enriched_item('hello.', {'test_embedding': [lilac_embedding(0, 6, None)]})
-  }, {
-    UUID_COLUMN: '2',
-    'text': enriched_item('hello2.', {'test_embedding': [lilac_embedding(0, 7, None)]})
-  }]
-  assert list(result) == expected_result
-
-
-def test_is_computed_signal_key(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello.',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'hello2.',
-  }])
-
-  signal = ComputedKeySignal()
-  dataset.compute_signal(signal, 'text')
-
-  assert dataset.manifest() == DatasetManifest(
-    namespace=TEST_NAMESPACE,
-    dataset_name=TEST_DATASET_NAME,
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'text': field('string', fields={'key_True': field('int64', signal=signal.dict())}),
-    }),
-    num_items=2)
-
-  result = dataset.select_rows()
-
-  expected_result = [{
-    UUID_COLUMN: '1',
-    'text': enriched_item('hello.', {'key_True': 1})
-  }, {
-    UUID_COLUMN: '2',
-    'text': enriched_item('hello2.', {'key_True': 1})
-  }]
-  assert list(result) == expected_result
-
-
-def test_concept_signal_with_select_groups(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello.',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'hello2.',
-  }, {
-    UUID_COLUMN: '3',
-    'text': 'hello3.',
-  }])
-
-  embedding_signal = TestEmbedding()
-  dataset.compute_signal(embedding_signal, 'text')
-
-  concept_db = DiskConceptDB()
-  concept_db.create(namespace='test_namespace', name='test_concept', type=SignalInputType.TEXT)
-  concept_db.edit(
-    'test_namespace', 'test_concept',
-    ConceptUpdate(insert=[
-      ExampleIn(label=False, text='hello.'),
-      ExampleIn(label=True, text='hello2.'),
-      ExampleIn(label=False, text='hello3.')
-    ]))
-
-  concept_signal = ConceptScoreSignal(
-    namespace='test_namespace', concept_name='test_concept', embedding='test_embedding')
-
-  dataset.compute_signal(concept_signal, 'text')
-
-  concept_key = concept_signal.key(is_computed_signal=True)
-  result = dataset.select_groups(f'text.test_embedding.*.embedding.{concept_key}')
-  assert result.counts == [('Not in concept', 2), ('In concept', 1)]
-
-  result = dataset.select_groups(
-    f'text.test_embedding.*.embedding.{concept_key}',
-    sort_by=GroupsSortBy.COUNT,
-    sort_order=SortOrder.ASC)
-  assert result.counts == [('In concept', 1), ('Not in concept', 2)]
src/data/dataset_duckdb.py
CHANGED
@@ -6,7 +6,7 @@ import os
 import re
 import shutil
 import threading
-from typing import Any, Iterable, Optional, Sequence, Type, Union, cast
+from typing import Any, Iterable, Iterator, Optional, Sequence, Type, Union, cast
 
 import duckdb
 import numpy as np
@@ -93,6 +93,7 @@ from .dataset_utils import (
   read_embedding_index,
   replace_embeddings_with_none,
   schema_contains_path,
+  sparse_to_dense_compute,
   unflatten,
   wrap_in_dicts,
   write_item_embeddings_to_disk,
@@ -686,7 +687,7 @@ class DatasetDuckDB(Dataset):
     star_in_cols = any(col.path == ('*',) for col in cols)
     if not cols or star_in_cols:
       # Select all columns.
-      cols.extend([Column(name) for name in schema.fields.keys()])
+      cols.extend([Column((name,)) for name in schema.fields.keys()])
     if star_in_cols:
       cols = [col for col in cols if col.path != ('*',)]
     return cols
@@ -941,8 +942,9 @@ class DatasetDuckDB(Dataset):
       # The input is an embedding.
       embedding_signal = cast(TextEmbeddingModelSignal, signal)
       vector_store = self.get_vector_store(embedding_signal.embedding, udf_col.path)
-      flat_keys = flatten_keys(df[UUID_COLUMN], input)
-      signal_out = signal.vector_compute(flat_keys, vector_store)
+      flat_keys = list(flatten_keys(df[UUID_COLUMN], input))
+      signal_out = sparse_to_dense_compute(
+        iter(flat_keys), lambda keys: signal.vector_compute(keys, vector_store))
       # Add progress.
       if task_step_id is not None:
         signal_out = progress(
@@ -953,8 +955,9 @@ class DatasetDuckDB(Dataset):
      df[signal_column] = unflatten(signal_out, input)
    else:
      num_rich_data = count_primitives(input)
-      flat_input = cast(Iterable[RichData], flatten(input))
-      signal_out = signal.compute(flat_input)
+      flat_input = cast(Iterator[Optional[RichData]], flatten(input))
+      signal_out = sparse_to_dense_compute(
+        flat_input, lambda x: signal.compute(cast(Iterable[RichData], x)))
       # Add progress.
       if task_step_id is not None:
         signal_out = progress(
@@ -962,22 +965,21 @@ class DatasetDuckDB(Dataset):
           task_step_id=task_step_id,
           estimated_len=num_rich_data,
           step_description=f'Computing {signal.key()}...')
-
-
+    signal_out_list = list(signal_out)
     if signal_column in temp_column_to_offset_column:
       offset_column_name, field = temp_column_to_offset_column[signal_column]
-      nested_spans: Iterable[Item] = df[offset_column_name]
+      nested_spans: Iterator[Item] = df[offset_column_name]
       flat_spans = list(flatten(nested_spans))
-      for span, item in zip(flat_spans, signal_out):
+      for span, item in zip(flat_spans, signal_out_list):
         _offset_any_span(cast(int, span[VALUE_KEY][TEXT_SPAN_START_FEATURE]), item, field)
 
-    if len(signal_out) != num_rich_data:
+    if len(signal_out_list) != num_rich_data:
       raise ValueError(
-        f'The signal generated {len(signal_out)} values but the input data had '
+        f'The signal generated {len(signal_out_list)} values but the input data had '
         f"{num_rich_data} values. This means the signal either didn't generate a "
         '"None" for a sparse output, or generated too many items.')
 
-    df[signal_column] = unflatten(signal_out, input)
+    df[signal_column] = unflatten(signal_out_list, input)
 
     signal.teardown()
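The refactor above funnels both branches through `sparse_to_dense_compute`, which is imported from `dataset_utils` but not defined in this diff. From its call sites, it appears to take an iterator whose entries may be `None` plus a function that computes over only the present values, and to yield outputs realigned with the original positions. A minimal sketch of that contract, inferred from the call sites rather than taken from the actual `dataset_utils` code:

```python
from typing import Callable, Iterator, Optional, TypeVar

Tin = TypeVar('Tin')
Tout = TypeVar('Tout')


def sparse_to_dense_compute(
    sparse_input: Iterator[Optional[Tin]],
    func: Callable[[Iterator[Tin]], Iterator[Tout]]) -> Iterator[Optional[Tout]]:
  """Run `func` over the non-None values, re-inserting None at the gaps."""
  items = list(sparse_input)
  # Feed only the present values to the dense computation.
  dense_out = iter(func(x for x in items if x is not None))
  for x in items:
    # Each present input consumes exactly one dense output; gaps stay None.
    yield None if x is None else next(dense_out)
```

This preserves the one-output-per-input invariant that the length check at the end of the hunk enforces, while letting signals skip `None` inputs instead of guarding against them internally.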
src/data/dataset_select_groups_test.py
DELETED
@@ -1,317 +0,0 @@
-"""Tests for dataset.select_groups()."""
-
-import re
-
-import pytest
-from pytest_mock import MockerFixture
-
-from ..schema import UUID_COLUMN, Item, field, schema
-from . import dataset as dataset_module
-from .dataset import BinaryOp
-from .dataset_test_utils import TestDataMaker
-
-
-def test_flat_data(make_test_data: TestDataMaker) -> None:
-  items: list[Item] = [
-    {
-      'name': 'Name1',
-      'age': 34,
-      'active': False
-    },
-    {
-      'name': 'Name2',
-      'age': 45,
-      'active': True
-    },
-    {
-      'age': 17,
-      'active': True
-    },  # Missing "name".
-    {
-      'name': 'Name3',
-      'active': True
-    },  # Missing "age".
-    {
-      'name': 'Name4',
-      'age': 55
-    }  # Missing "active".
-  ]
-  dataset = make_test_data(items)
-
-  result = dataset.select_groups(leaf_path='name')
-  assert result.counts == [('Name1', 1), ('Name2', 1), (None, 1), ('Name3', 1), ('Name4', 1)]
-
-  result = dataset.select_groups(leaf_path='age', bins=[20, 50, 60])
-  assert result.counts == [('1', 2), ('0', 1), (None, 1), ('2', 1)]
-
-  result = dataset.select_groups(leaf_path='active')
-  assert result.counts == [
-    (True, 3),
-    (False, 1),
-    (None, 1),  # Missing "active".
-  ]
-
-
-def test_result_counts(make_test_data: TestDataMaker) -> None:
-  items: list[Item] = [
-    {
-      'active': False
-    },
-    {
-      'active': True
-    },
-    {
-      'active': True
-    },
-    {
-      'active': True
-    },
-    {}  # Missing "active".
-  ]
-  dataset = make_test_data(items, schema=schema({UUID_COLUMN: 'string', 'active': 'boolean'}))
-
-  result = dataset.select_groups(leaf_path='active')
-  assert result.counts == [(True, 3), (False, 1), (None, 1)]
-
-
-def test_list_of_structs(make_test_data: TestDataMaker) -> None:
-  items: list[Item] = [{
-    'list_of_structs': [{
-      'name': 'a'
-    }, {
-      'name': 'b'
-    }]
-  }, {
-    'list_of_structs': [{
-      'name': 'c'
-    }, {
-      'name': 'a'
-    }, {
-      'name': 'd'
-    }]
-  }, {
-    'list_of_structs': [{
-      'name': 'd'
-    }]
-  }]
-  dataset = make_test_data(items)
-
-  result = dataset.select_groups(leaf_path='list_of_structs.*.name')
-  assert result.counts == [('a', 2), ('d', 2), ('b', 1), ('c', 1)]
-
-
-def test_nested_lists(make_test_data: TestDataMaker) -> None:
-  items: list[Item] = [{
-    'nested_list': [[{
-      'name': 'a'
-    }], [{
-      'name': 'b'
-    }]]
-  }, {
-    'nested_list': [[{
-      'name': 'c'
-    }, {
-      'name': 'a'
-    }], [{
-      'name': 'd'
-    }]]
-  }, {
-    'nested_list': [[{
-      'name': 'd'
-    }]]
-  }]
-  dataset = make_test_data(items)
-
-  result = dataset.select_groups(leaf_path='nested_list.*.*.name')
-  assert result.counts == [('a', 2), ('d', 2), ('b', 1), ('c', 1)]
-
-
-def test_nested_struct(make_test_data: TestDataMaker) -> None:
-  items: list[Item] = [
-    {
-      'nested_struct': {
-        'struct': {
-          'name': 'c'
-        }
-      }
-    },
-    {
-      'nested_struct': {
-        'struct': {
-          'name': 'b'
-        }
-      }
-    },
-    {
-      'nested_struct': {
-        'struct': {
-          'name': 'a'
-        }
-      }
-    },
-  ]
-  dataset = make_test_data(items)
-
-  result = dataset.select_groups(leaf_path='nested_struct.struct.name')
-  assert result.counts == [('c', 1), ('b', 1), ('a', 1)]
-
-
-def test_named_bins(make_test_data: TestDataMaker) -> None:
-  items: list[Item] = [{
-    'age': 34,
-  }, {
-    'age': 45,
-  }, {
-    'age': 17,
-  }, {
-    'age': 80
-  }, {
-    'age': 55
-  }, {
-    'age': float('nan')
-  }]
-  dataset = make_test_data(items)
-
-  result = dataset.select_groups(
-    leaf_path='age',
-    bins=[
-      ('young', None, 20),
-      ('adult', 20, 50),
-      ('middle-aged', 50, 65),
-      ('senior', 65, None),
-    ])
-  assert result.counts == [('adult', 2), ('young', 1), ('senior', 1), ('middle-aged', 1), (None, 1)]
-
-
-def test_schema_with_bins(make_test_data: TestDataMaker) -> None:
-  items: list[Item] = [{
-    'age': 34,
-  }, {
-    'age': 45,
-  }, {
-    'age': 17,
-  }, {
-    'age': 80
-  }, {
-    'age': 55
-  }, {
-    'age': float('nan')
-  }]
-  data_schema = schema({
-    UUID_COLUMN: 'string',
-    'age': field(
-      'float32',
-      bins=[
-        ('young', None, 20),
-        ('adult', 20, 50),
-        ('middle-aged', 50, 65),
-        ('senior', 65, None),
-      ])
-  })
-  dataset = make_test_data(items, data_schema)
-
-  result = dataset.select_groups(leaf_path='age')
-  assert result.counts == [('adult', 2), ('young', 1), ('senior', 1), ('middle-aged', 1), (None, 1)]
-
-
-def test_filters(make_test_data: TestDataMaker) -> None:
-  items: list[Item] = [
-    {
-      'name': 'Name1',
-      'age': 34,
-      'active': False
-    },
-    {
-      'name': 'Name2',
-      'age': 45,
-      'active': True
-    },
-    {
-      'age': 17,
-      'active': True
-    },  # Missing "name".
-    {
-      'name': 'Name3',
-      'active': True
-    },  # Missing "age".
-    {
-      'name': 'Name4',
-      'age': 55
-    }  # Missing "active".
-  ]
-  dataset = make_test_data(items)
-
-  # active = True.
-  result = dataset.select_groups(leaf_path='name', filters=[('active', BinaryOp.EQUALS, True)])
-  assert result.counts == [('Name2', 1), (None, 1), ('Name3', 1)]
-
-  # age < 35.
-  result = dataset.select_groups(leaf_path='name', filters=[('age', BinaryOp.LESS, 35)])
-  assert result.counts == [('Name1', 1), (None, 1)]
-
-  # age < 35 and active = True.
-  result = dataset.select_groups(
-    leaf_path='name', filters=[('age', BinaryOp.LESS, 35), ('active', BinaryOp.EQUALS, True)])
-  assert result.counts == [(None, 1)]
-
-
-def test_invalid_leaf(make_test_data: TestDataMaker) -> None:
-  items: list[Item] = [
-    {
-      'nested_struct': {
-        'struct': {
-          'name': 'c'
-        }
-      }
-    },
-    {
-      'nested_struct': {
-        'struct': {
-          'name': 'b'
-        }
-      }
-    },
-    {
-      'nested_struct': {
-        'struct': {
-          'name': 'a'
-        }
-      }
-    },
-  ]
-  dataset = make_test_data(items)
-
-  with pytest.raises(
-      ValueError, match=re.escape("Leaf \"('nested_struct',)\" not found in dataset")):
-    dataset.select_groups(leaf_path='nested_struct')
-
-  with pytest.raises(
-      ValueError, match=re.escape("Leaf \"('nested_struct', 'struct')\" not found in dataset")):
-    dataset.select_groups(leaf_path='nested_struct.struct')
-
-  with pytest.raises(
-      ValueError,
-      match=re.escape("Leaf \"('nested_struct', 'struct', 'wrong_name')\" not found in dataset")):
-    dataset.select_groups(leaf_path='nested_struct.struct.wrong_name')
-
-
-def test_too_many_distinct(make_test_data: TestDataMaker, mocker: MockerFixture) -> None:
-  too_many_distinct = 5
-  mocker.patch(f'{dataset_module.__name__}.TOO_MANY_DISTINCT', too_many_distinct)
-
-  items: list[Item] = [{'feature': str(i)} for i in range(too_many_distinct + 10)]
-  dataset = make_test_data(items)
-
-  res = dataset.select_groups('feature')
-  assert res.too_many_distinct is True
-  assert res.counts == []
-
-
-def test_auto_bins_for_float(make_test_data: TestDataMaker) -> None:
-  items: list[Item] = [{'feature': float(i)} for i in range(5)] + [{'feature': float('nan')}]
-
dataset = make_test_data(items)
|
313 |
-
|
314 |
-
res = dataset.select_groups('feature')
|
315 |
-
assert res.counts == [('0', 1), ('3', 1), ('7', 1), ('11', 1), ('14', 1), (None, 1)]
|
316 |
-
assert res.too_many_distinct is False
|
317 |
-
assert res.bins
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
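For reference, a minimal sketch of the `Dataset.select_groups` behavior the deleted tests above pinned down. It assumes the `make_test_data` fixture from the test utilities (also removed in this push); `result.counts` pairs each group or named bin with its row count, and `None` collects missing and NaN values:

  items = [{'age': 34}, {'age': 45}, {'age': 17}, {'age': 80}]
  dataset = make_test_data(items)  # fixture from the deleted dataset_test_utils
  result = dataset.select_groups(
      leaf_path='age',
      bins=[('young', None, 20), ('adult', 20, 50), ('senior', 50, None)])
  # e.g. result.counts == [('adult', 2), ('young', 1), ('senior', 1)]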
src/data/dataset_select_rows_filter_test.py
DELETED
@@ -1,200 +0,0 @@
-"""Tests for dataset.select_rows(filters=[...])."""
-
-import pytest
-
-from ..schema import UUID_COLUMN, Item, schema
-from .dataset import BinaryFilterTuple, BinaryOp, ListFilterTuple, ListOp, UnaryOp
-from .dataset_test_utils import TestDataMaker
-
-TEST_DATA: list[Item] = [{
-  UUID_COLUMN: '1',
-  'str': 'a',
-  'int': 1,
-  'bool': False,
-  'float': 3.0
-}, {
-  UUID_COLUMN: '2',
-  'str': 'b',
-  'int': 2,
-  'bool': True,
-  'float': 2.0
-}, {
-  UUID_COLUMN: '3',
-  'str': 'b',
-  'int': 2,
-  'bool': True,
-  'float': 1.0
-}, {
-  UUID_COLUMN: '4',
-  'float': float('nan')
-}]
-
-
-def test_filter_by_ids(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data(TEST_DATA)
-
-  id_filter: BinaryFilterTuple = (UUID_COLUMN, BinaryOp.EQUALS, '1')
-  result = dataset.select_rows(filters=[id_filter])
-
-  assert list(result) == [{UUID_COLUMN: '1', 'str': 'a', 'int': 1, 'bool': False, 'float': 3.0}]
-
-  id_filter = (UUID_COLUMN, BinaryOp.EQUALS, '2')
-  result = dataset.select_rows(filters=[id_filter])
-
-  assert list(result) == [{UUID_COLUMN: '2', 'str': 'b', 'int': 2, 'bool': True, 'float': 2.0}]
-
-  id_filter = (UUID_COLUMN, BinaryOp.EQUALS, b'f')
-  result = dataset.select_rows(filters=[id_filter])
-
-  assert list(result) == []
-
-
-def test_filter_greater(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data(TEST_DATA)
-
-  id_filter: BinaryFilterTuple = ('float', BinaryOp.GREATER, 2.0)
-  result = dataset.select_rows(filters=[id_filter])
-
-  assert list(result) == [{UUID_COLUMN: '1', 'str': 'a', 'int': 1, 'bool': False, 'float': 3.0}]
-
-
-def test_filter_greater_equal(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data(TEST_DATA)
-
-  id_filter: BinaryFilterTuple = ('float', BinaryOp.GREATER_EQUAL, 2.0)
-  result = dataset.select_rows(filters=[id_filter])
-
-  assert list(result) == [{
-    UUID_COLUMN: '1',
-    'str': 'a',
-    'int': 1,
-    'bool': False,
-    'float': 3.0
-  }, {
-    UUID_COLUMN: '2',
-    'str': 'b',
-    'int': 2,
-    'bool': True,
-    'float': 2.0
-  }]
-
-
-def test_filter_less(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data(TEST_DATA)
-
-  id_filter: BinaryFilterTuple = ('float', BinaryOp.LESS, 2.0)
-  result = dataset.select_rows(filters=[id_filter])
-
-  assert list(result) == [{UUID_COLUMN: '3', 'str': 'b', 'int': 2, 'bool': True, 'float': 1.0}]
-
-
-def test_filter_less_equal(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data(TEST_DATA)
-
-  id_filter: BinaryFilterTuple = ('float', BinaryOp.LESS_EQUAL, 2.0)
-  result = dataset.select_rows(filters=[id_filter])
-
-  assert list(result) == [{
-    UUID_COLUMN: '2',
-    'str': 'b',
-    'int': 2,
-    'bool': True,
-    'float': 2.0
-  }, {
-    UUID_COLUMN: '3',
-    'str': 'b',
-    'int': 2,
-    'bool': True,
-    'float': 1.0
-  }]
-
-
-def test_filter_not_equal(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data(TEST_DATA)
-
-  id_filter: BinaryFilterTuple = ('float', BinaryOp.NOT_EQUAL, 2.0)
-  result = dataset.select_rows(filters=[id_filter])
-
-  assert list(result) == [
-    {
-      UUID_COLUMN: '1',
-      'str': 'a',
-      'int': 1,
-      'bool': False,
-      'float': 3.0
-    },
-    {
-      UUID_COLUMN: '3',
-      'str': 'b',
-      'int': 2,
-      'bool': True,
-      'float': 1.0
-    },
-    # NaNs are not counted when we are filtering a field.
-  ]
-
-
-def test_filter_by_list_of_ids(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data(TEST_DATA)
-
-  id_filter: ListFilterTuple = (UUID_COLUMN, ListOp.IN, ['1', '2'])
-  result = dataset.select_rows(filters=[id_filter])
-
-  assert list(result) == [{
-    UUID_COLUMN: '1',
-    'str': 'a',
-    'int': 1,
-    'bool': False,
-    'float': 3.0
-  }, {
-    UUID_COLUMN: '2',
-    'str': 'b',
-    'int': 2,
-    'bool': True,
-    'float': 2.0
-  }]
-
-
-def test_filter_by_exists(make_test_data: TestDataMaker) -> None:
-  items: list[Item] = [{
-    UUID_COLUMN: '1',
-    'name': 'A',
-    'info': {
-      'lang': 'en'
-    },
-    'ages': []
-  }, {
-    UUID_COLUMN: '2',
-    'info': {
-      'lang': 'fr'
-    },
-  }, {
-    UUID_COLUMN: '3',
-    'name': 'C',
-    'ages': [[1, 2], [3, 4]]
-  }]
-  dataset = make_test_data(
-    items,
-    schema=schema({
-      UUID_COLUMN: 'string',
-      'name': 'string',
-      'info': {
-        'lang': 'string'
-      },
-      'ages': [['int32']]
-    }))
-
-  exists_filter = ('name', UnaryOp.EXISTS)
-  result = dataset.select_rows(['name'], filters=[exists_filter])
-  assert list(result) == [{UUID_COLUMN: '1', 'name': 'A'}, {UUID_COLUMN: '3', 'name': 'C'}]
-
-  exists_filter = ('info.lang', UnaryOp.EXISTS)
-  result = dataset.select_rows(['name'], filters=[exists_filter])
-  assert list(result) == [{UUID_COLUMN: '1', 'name': 'A'}, {UUID_COLUMN: '2', 'name': None}]
-
-  exists_filter = ('ages.*.*', UnaryOp.EXISTS)
-  result = dataset.select_rows(['name'], filters=[exists_filter])
-  assert list(result) == [{UUID_COLUMN: '3', 'name': 'C'}]
-
-  with pytest.raises(ValueError, match='Unable to filter on path'):
-    dataset.select_rows(['name'], filters=[('info', UnaryOp.EXISTS)])
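A sketch of the filter semantics the deleted file above covered, under the same assumptions (the `make_test_data` fixture plus `BinaryOp`, `UnaryOp` and `ListOp` from `.dataset`):

  dataset = make_test_data(TEST_DATA)
  # Binary comparisons: the NaN row (UUID '4' above) never matches a filter.
  rows = list(dataset.select_rows(filters=[('float', BinaryOp.LESS_EQUAL, 2.0)]))
  # Membership and existence filters compose the same way.
  rows = list(dataset.select_rows(filters=[(UUID_COLUMN, ListOp.IN, ['1', '2'])]))
  rows = list(dataset.select_rows(['name'], filters=[('name', UnaryOp.EXISTS)]))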
src/data/dataset_select_rows_schema_test.py
DELETED
@@ -1,551 +0,0 @@
-"""Tests for `db.select_rows_schema()`."""
-
-from typing import Iterable, Optional, cast
-
-import numpy as np
-import pytest
-from typing_extensions import override
-
-from ..embeddings.vector_store import VectorStore
-from ..schema import PATH_WILDCARD, UUID_COLUMN, Field, Item, RichData, VectorKey, field, schema
-from ..signals.concept_labels import ConceptLabelsSignal
-from ..signals.concept_scorer import ConceptScoreSignal
-from ..signals.semantic_similarity import SemanticSimilaritySignal
-from ..signals.signal import (
-  EMBEDDING_KEY,
-  TextEmbeddingModelSignal,
-  TextEmbeddingSignal,
-  TextSignal,
-  TextSplitterSignal,
-  clear_signal_registry,
-  register_signal,
-)
-from ..signals.substring_search import SubstringSignal
-from .dataset import (
-  Column,
-  ConceptQuery,
-  KeywordQuery,
-  Search,
-  SearchResultInfo,
-  SelectRowsSchemaResult,
-  SelectRowsSchemaUDF,
-  SemanticQuery,
-  SortOrder,
-  SortResult,
-)
-from .dataset_test_utils import (
-  TEST_DATASET_NAME,
-  TEST_NAMESPACE,
-  TestDataMaker,
-  enriched_embedding_span_field,
-)
-from .dataset_utils import lilac_embedding, lilac_span
-
-TEST_DATA: list[Item] = [{
-  UUID_COLUMN: '1',
-  'erased': False,
-  'people': [{
-    'name': 'A',
-    'zipcode': 0,
-    'locations': [{
-      'city': 'city1',
-      'state': 'state1'
-    }, {
-      'city': 'city2',
-      'state': 'state2'
-    }]
-  }]
-}, {
-  UUID_COLUMN: '2',
-  'erased': True,
-  'people': [{
-    'name': 'B',
-    'zipcode': 1,
-    'locations': [{
-      'city': 'city3',
-      'state': 'state3'
-    }, {
-      'city': 'city4'
-    }, {
-      'city': 'city5'
-    }]
-  }, {
-    'name': 'C',
-    'zipcode': 2,
-    'locations': [{
-      'city': 'city1',
-      'state': 'state1'
-    }]
-  }]
-}]
-
-
-class TestSplitter(TextSplitterSignal):
-  """Split documents into sentence by splitting on period."""
-  name = 'test_splitter'
-
-  @override
-  def compute(self, data: Iterable[RichData]) -> Iterable[Item]:
-    for text in data:
-      if not isinstance(text, str):
-        raise ValueError(f'Expected text to be a string, got {type(text)} instead.')
-      sentences = [f'{sentence.strip()}.' for sentence in text.split('.') if sentence]
-      yield [
-        lilac_span(text.index(sentence),
-                   text.index(sentence) + len(sentence)) for sentence in sentences
-      ]
-
-
-EMBEDDINGS: list[tuple[str, list[float]]] = [('hello.', [1.0, 0.0, 0.0]),
-                                             ('hello2.', [1.0, 1.0, 0.0]),
-                                             ('hello world.', [1.0, 1.0, 1.0]),
-                                             ('hello world2.', [2.0, 1.0, 1.0])]
-
-STR_EMBEDDINGS: dict[str, list[float]] = {text: embedding for text, embedding in EMBEDDINGS}
-
-
-class TestEmbedding(TextEmbeddingSignal):
-  """A test embed function."""
-  name = 'test_embedding'
-
-  @override
-  def compute(self, data: Iterable[RichData]) -> Iterable[Item]:
-    """Call the embedding function."""
-    for example in data:
-      yield [lilac_embedding(0, len(example), np.array(STR_EMBEDDINGS[cast(str, example)]))]
-
-
-class TestEmbeddingSumSignal(TextEmbeddingModelSignal):
-  """Sums the embeddings to return a single floating point value."""
-  name = 'test_embedding_sum'
-
-  @override
-  def fields(self) -> Field:
-    return field('float32')
-
-  @override
-  def vector_compute(self, keys: Iterable[VectorKey], vector_store: VectorStore) -> Iterable[Item]:
-    # The signal just sums the values of the embedding.
-    embedding_sums = vector_store.get(keys).sum(axis=1)
-    for embedding_sum in embedding_sums.tolist():
-      yield embedding_sum
-
-
-@pytest.fixture(scope='module', autouse=True)
-def setup_teardown() -> Iterable[None]:
-  # Setup.
-  register_signal(LengthSignal)
-  register_signal(AddSpaceSignal)
-  register_signal(TestSplitter)
-  register_signal(TestEmbedding)
-  register_signal(TestEmbeddingSumSignal)
-
-  # Unit test runs.
-  yield
-
-  # Teardown.
-  clear_signal_registry()
-
-
-class LengthSignal(TextSignal):
-  name = 'length_signal'
-
-  def fields(self) -> Field:
-    return field('int32')
-
-  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
-    for text_content in data:
-      yield len(text_content)
-
-
-class AddSpaceSignal(TextSignal):
-  name = 'add_space_signal'
-
-  def fields(self) -> Field:
-    return field('string')
-
-  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
-    for text_content in data:
-      yield cast(str, text_content) + ' '
-
-
-def test_simple_schema(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data(TEST_DATA)
-  result = dataset.select_rows_schema(combine_columns=True)
-  assert result == SelectRowsSchemaResult(
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'erased': 'boolean',
-      'people': [{
-        'name': 'string',
-        'zipcode': 'int32',
-        'locations': [{
-          'city': 'string',
-          'state': 'string'
-        }]
-      }]
-    }))
-
-
-def test_subselection_with_combine_cols(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data(TEST_DATA)
-
-  result = dataset.select_rows_schema([('people', '*', 'zipcode'),
-                                       ('people', '*', 'locations', '*', 'city')],
-                                      combine_columns=True)
-  assert result == SelectRowsSchemaResult(
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'people': [{
-        'zipcode': 'int32',
-        'locations': [{
-          'city': 'string'
-        }]
-      }]
-    }))
-
-  result = dataset.select_rows_schema([('people', '*', 'name'), ('people', '*', 'locations')],
-                                      combine_columns=True)
-  assert result == SelectRowsSchemaResult(
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'people': [{
-        'name': 'string',
-        'locations': [{
-          'city': 'string',
-          'state': 'string'
-        }]
-      }]
-    }))
-
-  result = dataset.select_rows_schema([('people', '*')], combine_columns=True)
-  assert result == SelectRowsSchemaResult(
-    namespace=TEST_NAMESPACE,
-    dataset_name=TEST_DATASET_NAME,
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'people': [{
-        'name': 'string',
-        'zipcode': 'int32',
-        'locations': [{
-          'city': 'string',
-          'state': 'string'
-        }]
-      }]
-    }))
-
-
-def test_udf_with_combine_cols(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data(TEST_DATA)
-
-  length_signal = LengthSignal()
-  result = dataset.select_rows_schema([('people', '*', 'locations', '*', 'city'),
-                                       Column(('people', '*', 'name'), signal_udf=length_signal)],
-                                      combine_columns=True)
-  assert result == SelectRowsSchemaResult(
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'people': [{
-        'name': {
-          'length_signal': field('int32', length_signal.dict())
-        },
-        'locations': [{
-          'city': 'string'
-        }]
-      }],
-    }),
-    udfs=[
-      SelectRowsSchemaUDF(path=('people', '*', 'name', length_signal.key())),
-    ],
-  )
-
-
-def test_embedding_udf_with_combine_cols(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data(TEST_DATA)
-
-  add_space_signal = AddSpaceSignal()
-  path = ('people', '*', 'name')
-  dataset.compute_signal(add_space_signal, path)
-  result = dataset.select_rows_schema([path, Column(path, signal_udf=add_space_signal)],
-                                      combine_columns=True)
-  assert result == SelectRowsSchemaResult(
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'people': [{
-        'name': field(
-          'string', fields={'add_space_signal': field('string', signal=add_space_signal.dict())})
-      }],
-    }),
-    udfs=[
-      SelectRowsSchemaUDF(path=(*path, add_space_signal.key())),
-    ],
-  )
-
-
-def test_udf_chained_with_combine_cols(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello. hello2.',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'hello world. hello world2.',
-  }])
-
-  test_splitter = TestSplitter()
-  dataset.compute_signal(test_splitter, ('text'))
-  add_space_signal = AddSpaceSignal()
-  result = dataset.select_rows_schema(
-    [('text'), Column(('text'), signal_udf=add_space_signal)], combine_columns=True)
-
-  assert result == SelectRowsSchemaResult(
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'text': field(
-        'string',
-        fields={
-          'add_space_signal': field('string', add_space_signal.dict()),
-          'test_splitter': field(signal=test_splitter.dict(), fields=['string_span'])
-        })
-    }),
-    udfs=[
-      SelectRowsSchemaUDF(path=('text', add_space_signal.key())),
-    ],
-  )
-
-
-def test_udf_embedding_chained_with_combine_cols(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello. hello2.',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'hello world. hello world2.',
-  }])
-
-  test_splitter = TestSplitter()
-  dataset.compute_signal(test_splitter, 'text')
-  test_embedding = TestEmbedding()
-  dataset.compute_signal(test_embedding, ('text', 'test_splitter', '*'))
-
-  embedding_sum_signal = TestEmbeddingSumSignal(embedding='test_embedding')
-  udf_col = Column(('text', 'test_splitter', '*'), signal_udf=embedding_sum_signal)
-  result = dataset.select_rows_schema([('text'), udf_col], combine_columns=True)
-
-  expected_schema = schema({
-    UUID_COLUMN: 'string',
-    'text': field(
-      'string',
-      fields={
-        'test_splitter': field(
-          signal=test_splitter.dict(),
-          fields=[
-            field(
-              'string_span',
-              fields={
-                'test_embedding': field(
-                  signal=test_embedding.dict(),
-                  fields=[
-                    enriched_embedding_span_field(
-                      {'test_embedding_sum': field('float32', embedding_sum_signal.dict())})
-                  ])
-              })
-          ])
-      })
-  })
-  output_path = ('text', 'test_splitter', '*', 'test_embedding', '*', 'embedding',
-                 'test_embedding_sum')
-  assert result == SelectRowsSchemaResult(
-    data_schema=expected_schema,
-    udfs=[SelectRowsSchemaUDF(path=output_path)],
-  )
-
-  # Alias the udf.
-  udf_col.alias = 'udf1'
-  result = dataset.select_rows_schema([('text'), udf_col], combine_columns=True)
-  assert result == SelectRowsSchemaResult(
-    data_schema=expected_schema,
-    udfs=[SelectRowsSchemaUDF(path=output_path, alias='udf1')],
-  )
-
-
-def test_search_keyword_schema(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello world',
-    'text2': 'hello world2',
-  }])
-  query_world = 'world'
-  query_hello = 'hello'
-
-  result = dataset.select_rows_schema(
-    searches=[
-      Search(path='text', query=KeywordQuery(type='keyword', search=query_world)),
-      Search(path='text2', query=KeywordQuery(type='keyword', search=query_hello)),
-    ],
-    combine_columns=True)
-
-  expected_world_signal = SubstringSignal(query=query_world)
-  expected_hello_signal = SubstringSignal(query=query_hello)
-
-  assert result == SelectRowsSchemaResult(
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'text': field(
-        'string',
-        fields={
-          expected_world_signal.key(): field(
-            signal=expected_world_signal.dict(), fields=['string_span'])
-        }),
-      'text2': field(
-        'string',
-        fields={
-          expected_hello_signal.key(): field(
-            signal=expected_hello_signal.dict(), fields=['string_span'])
-        })
-    }),
-    search_results=[
-      SearchResultInfo(
-        search_path=('text',),
-        result_path=('text', expected_world_signal.key(), PATH_WILDCARD),
-      ),
-      SearchResultInfo(
-        search_path=('text2',),
-        result_path=('text2', expected_hello_signal.key(), PATH_WILDCARD),
-      )
-    ],
-    udfs=[
-      SelectRowsSchemaUDF(path=('text', expected_world_signal.key())),
-      SelectRowsSchemaUDF(path=('text2', expected_hello_signal.key())),
-    ],
-  )
-
-
-def test_search_semantic_schema(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello world.',
-  }])
-  query_world = 'world'
-
-  test_embedding = TestEmbedding()
-  dataset.compute_signal(test_embedding, ('text'))
-
-  result = dataset.select_rows_schema(
-    searches=[
-      Search(
-        path='text',
-        query=SemanticQuery(type='semantic', search=query_world, embedding='test_embedding')),
-    ],
-    combine_columns=True)
-
-  test_embedding = TestEmbedding()
-  expected_world_signal = SemanticSimilaritySignal(query=query_world, embedding='test_embedding')
-
-  similarity_score_path = ('text', 'test_embedding', PATH_WILDCARD, EMBEDDING_KEY,
-                           expected_world_signal.key())
-  assert result == SelectRowsSchemaResult(
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'text': field(
-        'string',
-        fields={
-          'test_embedding': field(
-            signal=test_embedding.dict(),
-            fields=[
-              enriched_embedding_span_field(
-                {expected_world_signal.key(): field('float32', expected_world_signal.dict())})
-            ])
-        })
-    }),
-    udfs=[SelectRowsSchemaUDF(path=similarity_score_path)],
-    search_results=[SearchResultInfo(search_path=('text',), result_path=similarity_score_path)],
-    sorts=[SortResult(path=similarity_score_path, order=SortOrder.DESC, search_index=0)])
-
-
-def test_search_concept_schema(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello world.',
-  }])
-
-  test_embedding = TestEmbedding()
-  dataset.compute_signal(test_embedding, ('text'))
-
-  result = dataset.select_rows_schema(
-    searches=[
-      Search(
-        path='text',
-        query=ConceptQuery(
-          type='concept',
-          concept_namespace='test_namespace',
-          concept_name='test_concept',
-          embedding='test_embedding')),
-    ],
-    combine_columns=True)
-
-  test_embedding = TestEmbedding()
-  expected_world_signal = ConceptScoreSignal(
-    namespace='test_namespace', concept_name='test_concept', embedding='test_embedding')
-  expected_labels_signal = ConceptLabelsSignal(
-    namespace='test_namespace', concept_name='test_concept')
-
-  concept_score_path = ('text', 'test_embedding', PATH_WILDCARD, EMBEDDING_KEY,
-                        expected_world_signal.key())
-  concept_labels_path = ('text', expected_labels_signal.key())
-  assert result == SelectRowsSchemaResult(
-    data_schema=schema({
-      UUID_COLUMN: 'string',
-      'text': field(
-        'string',
-        fields={
-          'test_embedding': field(
-            signal=test_embedding.dict(),
-            fields=[
-              enriched_embedding_span_field({
-                expected_world_signal.key(): field(
-                  'float32',
-                  expected_world_signal.dict(),
-                  bins=[('Not in concept', None, 0.5), ('In concept', 0.5, None)])
-              })
-            ]),
-          'test_namespace/test_concept/labels': field(
-            fields=[field('string_span', fields={
-              'label': 'boolean',
-              'draft': 'string'
-            })],
-            signal=expected_labels_signal.dict())
-        })
-    }),
-    udfs=[
-      SelectRowsSchemaUDF(path=concept_labels_path),
-      SelectRowsSchemaUDF(path=concept_score_path)
-    ],
-    search_results=[
-      SearchResultInfo(search_path=('text',), result_path=concept_labels_path),
-      SearchResultInfo(search_path=('text',), result_path=concept_score_path)
-    ],
-    sorts=[SortResult(path=concept_score_path, order=SortOrder.DESC, search_index=0)])
-
-
-def test_search_sort_override(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello world.',
-  }])
-  query_world = 'world'
-
-  test_embedding = TestEmbedding()
-  dataset.compute_signal(test_embedding, ('text'))
-
-  result = dataset.select_rows_schema(
-    searches=[
-      Search(
-        path='text',
-        query=SemanticQuery(type='semantic', search=query_world, embedding='test_embedding')),
-    ],
-    # Explicit sort by overrides the semantic search.
-    sort_by=[('text',)],
-    sort_order=SortOrder.DESC,
-    combine_columns=True)
-
-  assert result.sorts == [SortResult(path=('text',), order=SortOrder.DESC)]
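The shape of a `select_rows_schema` result, sketched from the assertions in the deleted file (all names as defined there):

  length_signal = LengthSignal()
  result = dataset.select_rows_schema(
      [Column(('people', '*', 'name'), signal_udf=length_signal)], combine_columns=True)
  # result.data_schema mirrors the dataset schema with the signal's output nested
  # under the enriched path; result.udfs records where each UDF writes, e.g.
  # SelectRowsSchemaUDF(path=('people', '*', 'name', length_signal.key())).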
src/data/dataset_select_rows_search_test.py
DELETED
@@ -1,393 +0,0 @@
-"""Tests for dataset.select_rows(searches=[...])."""
-
-from typing import Iterable, cast
-
-import numpy as np
-import pytest
-from pytest import approx
-from pytest_mock import MockerFixture
-from sklearn.preprocessing import normalize
-from typing_extensions import override
-
-from ..concepts.concept import ExampleIn, LogisticEmbeddingModel
-from ..concepts.db_concept import ConceptUpdate, DiskConceptDB
-from ..db_manager import set_default_dataset_cls
-from ..schema import UUID_COLUMN, Item, RichData, SignalInputType
-from ..signals.concept_scorer import ConceptScoreSignal
-from ..signals.semantic_similarity import SemanticSimilaritySignal
-from ..signals.signal import TextEmbeddingSignal, clear_signal_registry, register_signal
-from ..signals.substring_search import SubstringSignal
-from .dataset import ConceptQuery, KeywordQuery, ListOp, Search, SemanticQuery, SortOrder
-from .dataset_duckdb import DatasetDuckDB
-from .dataset_test_utils import TestDataMaker, enriched_embedding_span, enriched_item
-from .dataset_utils import lilac_embedding, lilac_span
-
-TEST_DATA: list[Item] = [{
-  UUID_COLUMN: '1',
-  'text': 'hello world',
-  'text2': 'again hello world',
-}, {
-  UUID_COLUMN: '2',
-  'text': 'looking for world in text',
-  'text2': 'again looking for world in text',
-}, {
-  UUID_COLUMN: '3',
-  'text': 'unrelated text',
-  'text2': 'again unrelated text'
-}]
-
-EMBEDDINGS: list[tuple[str, list[float]]] = [
-  ('hello.', [1.0, 0.0, 0.0]),
-  ('hello2.', [1.0, 1.0, 0.0]),
-  ('hello world.', [1.0, 1.0, 1.0]),
-  ('hello world2.', [2.0, 1.0, 1.0]),
-  ('random negative 1', [0, 0, 0.3]),
-  ('random negative 2', [0, 0, 0.4]),
-  ('random negative 3', [0, 0.1, 0.5]),
-  ('random negative 4', [0.1, 0, 0.4]),
-]
-
-STR_EMBEDDINGS: dict[str, list[float]] = {text: embedding for text, embedding in EMBEDDINGS}
-
-
-@pytest.fixture(scope='module', autouse=True)
-def setup_teardown() -> Iterable[None]:
-  # Setup.
-  set_default_dataset_cls(DatasetDuckDB)
-  register_signal(TestEmbedding)
-
-  # Unit test runs.
-  yield
-
-  # Teardown.
-  clear_signal_registry()
-
-
-def test_search_keyword(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data(TEST_DATA)
-
-  query = 'world'
-  result = dataset.select_rows(
-    searches=[Search(path='text', query=KeywordQuery(type='keyword', search=query))],
-    combine_columns=True)
-
-  expected_signal_udf = SubstringSignal(query=query)
-  assert list(result) == [{
-    UUID_COLUMN: '1',
-    'text': enriched_item('hello world', {expected_signal_udf.key(): [lilac_span(6, 11)]}),
-    'text2': 'again hello world'
-  }, {
-    UUID_COLUMN: '2',
-    'text': enriched_item('looking for world in text',
-                          {expected_signal_udf.key(): [lilac_span(12, 17)]}),
-    'text2': 'again looking for world in text',
-  }]
-
-
-def test_search_keyword_special_chars(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'This is 100%',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'This has _underscore_',
-  }])
-
-  query = '100%'
-  result = dataset.select_rows(
-    searches=[Search(path='text', query=KeywordQuery(type='keyword', search=query))],
-    combine_columns=True)
-
-  expected_signal_udf = SubstringSignal(query=query)
-  assert list(result) == [{
-    UUID_COLUMN: '1',
-    'text': enriched_item('This is 100%', {expected_signal_udf.key(): [lilac_span(8, 12)]}),
-  }]
-
-  query = '_underscore_'
-  result = dataset.select_rows(
-    searches=[Search(path='text', query=KeywordQuery(type='keyword', search=query))],
-    combine_columns=True)
-
-  expected_signal_udf = SubstringSignal(query=query)
-  assert list(result) == [{
-    UUID_COLUMN: '2',
-    'text': enriched_item('This has _underscore_',
-                          {expected_signal_udf.key(): [lilac_span(9, 21)]}),
-  }]
-
-
-def test_search_keyword_multiple(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data(TEST_DATA)
-
-  query_world = 'world'
-  query_looking_world = 'looking for world'
-  expected_world_udf = SubstringSignal(query=query_world)
-  expected_again_looking_udf = SubstringSignal(query=query_looking_world)
-
-  result = dataset.select_rows(
-    searches=[
-      Search(path='text', query=KeywordQuery(type='keyword', search=query_world)),
-      Search(path='text2', query=KeywordQuery(type='keyword', search=query_looking_world)),
-    ],
-    combine_columns=True)
-
-  assert list(result) == [{
-    UUID_COLUMN: '2',
-    'text': enriched_item('looking for world in text', {
-      expected_world_udf.key(): [lilac_span(12, 17)],
-    }),
-    'text2': enriched_item('again looking for world in text',
-                           {expected_again_looking_udf.key(): [lilac_span(6, 23)]})
-  }]
-
-
-def test_search_keyword_with_filters(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data(TEST_DATA)
-
-  query = 'world'
-  result = dataset.select_rows(
-    filters=[(UUID_COLUMN, ListOp.IN, ['1', '3'])],
-    searches=[Search(path='text', query=KeywordQuery(type='keyword', search=query))],
-    combine_columns=True)
-
-  expected_signal_udf = SubstringSignal(query=query)
-  assert list(result) == [
-    {
-      UUID_COLUMN: '1',
-      'text': enriched_item('hello world', {expected_signal_udf.key(): [lilac_span(6, 11)]}),
-      'text2': 'again hello world'
-    },
-    # The second row doesn't match the UUID filter.
-  ]
-
-
-class TestEmbedding(TextEmbeddingSignal):
-  """A test embed function."""
-  name = 'test_embedding'
-
-  @override
-  def compute(self, data: Iterable[RichData]) -> Iterable[Item]:
-    """Call the embedding function."""
-    for example in data:
-      embedding = np.array(STR_EMBEDDINGS[cast(str, example)])
-      embedding = normalize([embedding])[0]
-      yield [lilac_embedding(0, len(example), embedding)]
-
-
-def test_semantic_search(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello world.',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'hello world2.',
-  }])
-
-  test_embedding = TestEmbedding()
-  dataset.compute_signal(test_embedding, ('text'))
-
-  query = 'hello2.'
-  result = dataset.select_rows(
-    searches=[
-      Search(
-        path='text', query=SemanticQuery(type='semantic', search=query, embedding='test_embedding'))
-    ],
-    combine_columns=True)
-  expected_signal_udf = SemanticSimilaritySignal(query=query, embedding='test_embedding')
-  assert list(result) == [
-    # Results are sorted by score desc.
-    {
-      UUID_COLUMN: '2',
-      'text': enriched_item(
-        'hello world2.', {
-          test_embedding.key():
-            [enriched_embedding_span(0, 13, {expected_signal_udf.key(): approx(0.916, 1e-3)})]
-        })
-    },
-    {
-      UUID_COLUMN: '1',
-      'text': enriched_item(
-        'hello world.', {
-          test_embedding.key():
-            [enriched_embedding_span(0, 12, {expected_signal_udf.key(): approx(0.885, 1e-3)})]
-        })
-    },
-  ]
-
-
-def test_concept_search(make_test_data: TestDataMaker, mocker: MockerFixture) -> None:
-  concept_model_mock = mocker.spy(LogisticEmbeddingModel, 'fit')
-
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello world.',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'hello world2.',
-  }, {
-    UUID_COLUMN: '3',
-    'text': 'random negative 1',
-  }, {
-    UUID_COLUMN: '4',
-    'text': 'random negative 2',
-  }, {
-    UUID_COLUMN: '5',
-    'text': 'random negative 3',
-  }, {
-    UUID_COLUMN: '6',
-    'text': 'random negative 4',
-  }])
-
-  test_embedding = TestEmbedding()
-  dataset.compute_signal(test_embedding, ('text'))
-
-  concept_db = DiskConceptDB()
-  concept_db.create(namespace='test_namespace', name='test_concept', type=SignalInputType.TEXT)
-  concept_db.edit(
-    'test_namespace', 'test_concept',
-    ConceptUpdate(insert=[
-      ExampleIn(label=False, text='hello world.'),
-      ExampleIn(label=True, text='hello world2.')
-    ]))
-
-  result = dataset.select_rows(
-    searches=[
-      Search(
-        path='text',
-        query=ConceptQuery(
-          type='concept',
-          concept_namespace='test_namespace',
-          concept_name='test_concept',
-          embedding='test_embedding'))
-    ],
-    filters=[(UUID_COLUMN, ListOp.IN, ['1', '2'])],
-    combine_columns=True)
-  expected_signal_udf = ConceptScoreSignal(
-    namespace='test_namespace', concept_name='test_concept', embedding='test_embedding')
-
-  assert list(result) == [
-    # Results are sorted by score desc.
-    {
-      UUID_COLUMN: '2',
-      'text': enriched_item(
-        'hello world2.', {
-          test_embedding.key():
-            [enriched_embedding_span(0, 13, {expected_signal_udf.key(): approx(0.75, abs=0.25)})],
-          'test_namespace/test_concept/labels': [lilac_span(0, 13, {'label': True})]
-        })
-    },
-    {
-      UUID_COLUMN: '1',
-      'text': enriched_item(
-        'hello world.', {
-          test_embedding.key():
-            [enriched_embedding_span(0, 12, {expected_signal_udf.key(): approx(0.25, abs=0.25)})],
-          'test_namespace/test_concept/labels': [lilac_span(0, 12, {'label': False})]
-        })
-    },
-  ]
-
-  (_, embeddings, labels, _) = concept_model_mock.call_args_list[-1].args
-  assert embeddings.shape == (2, 3)
-  assert labels == [
-    # Explicit labels.
-    False,
-    True
-  ]
-
-
-def test_sort_override_search(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello world.',
-    'value': 10
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'hello world2.',
-    'value': 20
-  }])
-
-  test_embedding = TestEmbedding()
-  dataset.compute_signal(test_embedding, ('text'))
-
-  query = 'hello2.'
-  search = Search(
-    path='text', query=SemanticQuery(type='semantic', search=query, embedding='test_embedding'))
-
-  expected_signal_udf = SemanticSimilaritySignal(query=query, embedding='test_embedding')
-  expected_item_1 = {
-    UUID_COLUMN: '1',
-    'text': enriched_item(
-      'hello world.', {
-        test_embedding.key():
-          [enriched_embedding_span(0, 12, {expected_signal_udf.key(): approx(0.885, 1e-3)})]
-      }),
-    'value': 10
-  }
-  expected_item_2 = {
-    UUID_COLUMN: '2',
-    'text': enriched_item(
-      'hello world2.', {
-        test_embedding.key():
-          [enriched_embedding_span(0, 13, {expected_signal_udf.key(): approx(0.916, 1e-3)})]
-      }),
-    'value': 20
-  }
-
-  sort_order = SortOrder.ASC
-  result = dataset.select_rows(
-    searches=[search], sort_by=[('value',)], sort_order=sort_order, combine_columns=True)
-  assert list(result) == [
-    # Results are sorted by score ascending.
-    expected_item_1,
-    expected_item_2
-  ]
-
-  sort_order = SortOrder.DESC
-  result = dataset.select_rows(
-    searches=[search], sort_by=[('text',)], sort_order=sort_order, combine_columns=True)
-  assert list(result) == [
-    # Results are sorted by score descending.
-    expected_item_2,
-    expected_item_1
-  ]
-
-
-def test_search_keyword_and_semantic(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'hello world.',
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'hello world2.',
-  }])
-
-  test_embedding = TestEmbedding()
-  dataset.compute_signal(test_embedding, ('text'))
-
-  query = 'hello2.'
-  keyword_query = 'rld2'
-  result = dataset.select_rows(
-    searches=[
-      Search(
-        path='text', query=SemanticQuery(type='semantic', search=query,
-                                         embedding='test_embedding')),
-      Search(path='text', query=KeywordQuery(type='keyword', search=keyword_query))
-    ],
-    combine_columns=True)
-  expected_semantic_signal = SemanticSimilaritySignal(query=query, embedding='test_embedding')
-  expected_keyword_signal = SubstringSignal(query=keyword_query)
-  assert list(result) == [
-    # Results are sorted by score desc.
-    {
-      UUID_COLUMN: '2',
-      'text': enriched_item(
-        'hello world2.', {
-          test_embedding.key():
-            [enriched_embedding_span(0, 13, {expected_semantic_signal.key(): approx(0.916, 1e-3)})],
-          expected_keyword_signal.key(): [lilac_span(8, 12)],
-        })
-    },
-    # UUID '1' is not returned because it does not match the keyword query.
-  ]
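Keyword and semantic searches combine in a single `select_rows` call, as the deleted tests above assert; a sketch reusing the file's `TestEmbedding` and the fixture dataset:

  result = dataset.select_rows(
      searches=[
          Search(path='text', query=KeywordQuery(type='keyword', search='rld2')),
          Search(
              path='text',
              query=SemanticQuery(type='semantic', search='hello2.', embedding='test_embedding')),
      ],
      combine_columns=True)
  # Rows come back sorted by semantic score (desc); rows without a keyword match are dropped.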
src/data/dataset_select_rows_sort_test.py
DELETED
@@ -1,904 +0,0 @@
-"""Tests for dataset.select_rows(sort_by=...)."""
-
-from typing import Iterable, Optional, Sequence, cast
-
-import numpy as np
-import pytest
-from typing_extensions import override
-
-from ..embeddings.vector_store import VectorStore
-from ..schema import UUID_COLUMN, Field, Item, RichData, VectorKey, field
-from ..signals.signal import (
-  TextEmbeddingModelSignal,
-  TextEmbeddingSignal,
-  TextSignal,
-  clear_signal_registry,
-  register_signal,
-)
-from .dataset import BinaryOp, Column, SortOrder
-from .dataset_test_utils import TestDataMaker, enriched_item
-from .dataset_utils import lilac_embedding
-
-
-class TestSignal(TextSignal):
-  name = 'test_signal'
-
-  def fields(self) -> Field:
-    return field(fields={'len': 'int32', 'is_all_cap': 'boolean'})
-
-  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
-    for text_content in data:
-      yield {'len': len(text_content), 'is_all_cap': text_content.isupper()}
-
-
-class TestPrimitiveSignal(TextSignal):
-  name = 'primitive_signal'
-
-  def fields(self) -> Field:
-    return field('int32')
-
-  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
-    for text_content in data:
-      yield len(text_content) + 1
-
-
-class NestedArraySignal(TextSignal):
-  name = 'nested_array'
-
-  def fields(self) -> Field:
-    return field(fields=[['int32']])
-
-  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
-    for text_content in data:
-      yield [[len(text_content) + 1], [len(text_content)]]
-
-
-@pytest.fixture(scope='module', autouse=True)
-def setup_teardown() -> Iterable[None]:
-  # Setup.
-  register_signal(TestSignal)
-  register_signal(TestPrimitiveSignal)
-  register_signal(NestedArraySignal)
-  register_signal(TopKEmbedding)
-  # Unit test runs.
-  yield
-  # Teardown.
-  clear_signal_registry()
-
-
-def test_sort_by_source_no_alias_no_repeated(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'erased': True,
-    'score': 4.1,
-    'document': {
-      'num_pages': 4,
-      'header': {
-        'title': 'c'
-      }
-    }
-  }, {
-    UUID_COLUMN: '2',
-    'erased': False,
-    'score': 3.5,
-    'document': {
-      'num_pages': 5,
-      'header': {
-        'title': 'b'
-      }
-    },
-  }, {
-    UUID_COLUMN: '3',
-    'erased': True,
-    'score': 3.7,
-    'document': {
-      'num_pages': 3,
-      'header': {
-        'title': 'a'
-      }
-    },
-  }])
-
-  # Sort by bool.
-  result = dataset.select_rows(columns=[UUID_COLUMN], sort_by=['erased'], sort_order=SortOrder.ASC)
-  assert list(result) == [{UUID_COLUMN: '2'}, {UUID_COLUMN: '1'}, {UUID_COLUMN: '3'}]
-  result = dataset.select_rows(columns=[UUID_COLUMN], sort_by=['erased'], sort_order=SortOrder.DESC)
-  assert list(result) == [{UUID_COLUMN: '1'}, {UUID_COLUMN: '3'}, {UUID_COLUMN: '2'}]
-
-  # Sort by float.
-  result = dataset.select_rows(columns=[UUID_COLUMN], sort_by=['score'], sort_order=SortOrder.ASC)
-  assert list(result) == [{UUID_COLUMN: '2'}, {UUID_COLUMN: '3'}, {UUID_COLUMN: '1'}]
-  result = dataset.select_rows(columns=[UUID_COLUMN], sort_by=['score'], sort_order=SortOrder.DESC)
-  assert list(result) == [{UUID_COLUMN: '1'}, {UUID_COLUMN: '3'}, {UUID_COLUMN: '2'}]
-
-  # Sort by nested int.
-  result = dataset.select_rows(
-    columns=[UUID_COLUMN], sort_by=['document.num_pages'], sort_order=SortOrder.ASC)
-  assert list(result) == [{UUID_COLUMN: '3'}, {UUID_COLUMN: '1'}, {UUID_COLUMN: '2'}]
-  result = dataset.select_rows(
-    columns=[UUID_COLUMN], sort_by=['document.num_pages'], sort_order=SortOrder.DESC)
-  assert list(result) == [{UUID_COLUMN: '2'}, {UUID_COLUMN: '1'}, {UUID_COLUMN: '3'}]
-
-  # Sort by double nested string.
-  result = dataset.select_rows(
-    columns=[UUID_COLUMN], sort_by=['document.header.title'], sort_order=SortOrder.ASC)
-  assert list(result) == [{UUID_COLUMN: '3'}, {UUID_COLUMN: '2'}, {UUID_COLUMN: '1'}]
-  result = dataset.select_rows(
-    columns=[UUID_COLUMN], sort_by=['document.header.title'], sort_order=SortOrder.DESC)
-  assert list(result) == [{UUID_COLUMN: '1'}, {UUID_COLUMN: '2'}, {UUID_COLUMN: '3'}]
-
-
-def test_sort_by_signal_no_alias_no_repeated(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'HEY'
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'everyone'
-  }, {
-    UUID_COLUMN: '3',
-    'text': 'HI'
-  }])
-
-  dataset.compute_signal(TestSignal(), 'text')
-
-  # Sort by `signal.len`.
-  result = dataset.select_rows(
-    columns=[UUID_COLUMN], sort_by=['text.test_signal.len'], sort_order=SortOrder.ASC)
-  assert list(result) == [{UUID_COLUMN: '3'}, {UUID_COLUMN: '1'}, {UUID_COLUMN: '2'}]
-  result = dataset.select_rows(
-    columns=[UUID_COLUMN], sort_by=['text.test_signal.len'], sort_order=SortOrder.DESC)
-  assert list(result) == [{UUID_COLUMN: '2'}, {UUID_COLUMN: '1'}, {UUID_COLUMN: '3'}]
-
-  # Sort by `signal.is_all_cap`.
-  result = dataset.select_rows(
-    columns=[UUID_COLUMN], sort_by=['text.test_signal.is_all_cap'], sort_order=SortOrder.ASC)
-  assert list(result) == [{UUID_COLUMN: '2'}, {UUID_COLUMN: '1'}, {UUID_COLUMN: '3'}]
-  result = dataset.select_rows(
-    columns=[UUID_COLUMN], sort_by=['text.test_signal.is_all_cap'], sort_order=SortOrder.DESC)
-  assert list(result) == [{UUID_COLUMN: '1'}, {UUID_COLUMN: '3'}, {UUID_COLUMN: '2'}]
-
-
-def test_sort_by_signal_alias_no_repeated(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'HEY'
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'everyone'
-  }, {
-    UUID_COLUMN: '3',
-    'text': 'HI'
-  }])
-
-  dataset.compute_signal(TestSignal(), 'text')
-
-  # Sort by `signal.len`.
-  signal_alias = Column('text.test_signal', alias='signal')
-  result = dataset.select_rows(
-    columns=[signal_alias], sort_by=['signal.len'], sort_order=SortOrder.ASC)
-  assert list(result) == [{
-    UUID_COLUMN: '3',
-    'signal': {
-      'len': 2,
-      'is_all_cap': True
-    }
-  }, {
-    UUID_COLUMN: '1',
-    'signal': {
-      'len': 3,
-      'is_all_cap': True
-    }
-  }, {
-    UUID_COLUMN: '2',
-    'signal': {
-      'len': 8,
-      'is_all_cap': False
-    }
-  }]
-  result = dataset.select_rows(
-    columns=[signal_alias], sort_by=['signal.len'], sort_order=SortOrder.DESC)
-  assert list(result) == [{
-    UUID_COLUMN: '2',
-    'signal': {
-      'len': 8,
-      'is_all_cap': False
-    }
-  }, {
-    UUID_COLUMN: '1',
-    'signal': {
-      'len': 3,
-      'is_all_cap': True
-    }
-  }, {
-    UUID_COLUMN: '3',
-    'signal': {
-      'len': 2,
-      'is_all_cap': True
-    }
-  }]
-
-
-def test_sort_by_enriched_alias_no_repeated(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'HEY'
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'everyone'
-  }, {
-    UUID_COLUMN: '3',
-    'text': 'HI'
-  }])
-
-  dataset.compute_signal(TestSignal(), 'text')
-
-  # Sort by `document.test_signal.is_all_cap` where 'document' is an alias to 'text'.
-  text_alias = Column('text', alias='document')
-  result = dataset.select_rows(
-    columns=[text_alias], sort_by=['document.test_signal.is_all_cap'], sort_order=SortOrder.ASC)
-  assert list(result) == [{
-    UUID_COLUMN: '2',
-    'document': enriched_item('everyone', {'test_signal': {
-      'len': 8,
-      'is_all_cap': False
-    }})
-  }, {
-    UUID_COLUMN: '1',
-    'document': enriched_item('HEY', {'test_signal': {
-      'len': 3,
-      'is_all_cap': True
-    }})
-  }, {
-    UUID_COLUMN: '3',
-    'document': enriched_item('HI', {'test_signal': {
-      'len': 2,
-      'is_all_cap': True
-    }})
-  }]
-
-  result = dataset.select_rows(
-    columns=[text_alias], sort_by=['document.test_signal.is_all_cap'], sort_order=SortOrder.DESC)
-  assert list(result) == [{
-    UUID_COLUMN: '1',
-    'document': enriched_item('HEY', {'test_signal': {
-      'len': 3,
-      'is_all_cap': True
-    }})
-  }, {
-    UUID_COLUMN: '3',
-    'document': enriched_item('HI', {'test_signal': {
-      'len': 2,
-      'is_all_cap': True
-    }})
-  }, {
-    UUID_COLUMN: '2',
-    'document': enriched_item('everyone', {'test_signal': {
-      'len': 8,
-      'is_all_cap': False
-    }})
-  }]
-
-
-def test_sort_by_udf_alias_no_repeated(make_test_data: TestDataMaker) -> None:
-  dataset = make_test_data([{
-    UUID_COLUMN: '1',
-    'text': 'HEY'
-  }, {
-    UUID_COLUMN: '2',
-    'text': 'everyone'
-  }, {
-    UUID_COLUMN: '3',
-    'text': 'HI'
-  }])
-
-  # Equivalent to: SELECT `TestSignal(text) AS udf`.
-  text_udf = Column('text', signal_udf=TestSignal(), alias='udf')
-  # Sort by `udf.len`, where `udf` is an alias to `TestSignal(text)`.
-  result = dataset.select_rows(['*', text_udf], sort_by=['udf.len'], sort_order=SortOrder.ASC)
|
299 |
-
assert list(result) == [{
|
300 |
-
UUID_COLUMN: '3',
|
301 |
-
'text': 'HI',
|
302 |
-
'udf': {
|
303 |
-
'len': 2,
|
304 |
-
'is_all_cap': True
|
305 |
-
}
|
306 |
-
}, {
|
307 |
-
UUID_COLUMN: '1',
|
308 |
-
'text': 'HEY',
|
309 |
-
'udf': {
|
310 |
-
'len': 3,
|
311 |
-
'is_all_cap': True
|
312 |
-
}
|
313 |
-
}, {
|
314 |
-
UUID_COLUMN: '2',
|
315 |
-
'text': 'everyone',
|
316 |
-
'udf': {
|
317 |
-
'len': 8,
|
318 |
-
'is_all_cap': False
|
319 |
-
}
|
320 |
-
}]
|
321 |
-
|
322 |
-
|
323 |
-
def test_sort_by_udf_no_alias_no_repeated(make_test_data: TestDataMaker) -> None:
|
324 |
-
dataset = make_test_data([{
|
325 |
-
UUID_COLUMN: '1',
|
326 |
-
'text': 'HEY'
|
327 |
-
}, {
|
328 |
-
UUID_COLUMN: '2',
|
329 |
-
'text': 'everyone'
|
330 |
-
}, {
|
331 |
-
UUID_COLUMN: '3',
|
332 |
-
'text': 'HI'
|
333 |
-
}])
|
334 |
-
|
335 |
-
text_udf = Column('text', signal_udf=TestSignal())
|
336 |
-
# Sort by `text.test_signal.len`, produced by executing the udf `TestSignal(text)`.
|
337 |
-
result = dataset.select_rows(['*', text_udf],
|
338 |
-
sort_by=[('text', 'test_signal', 'len')],
|
339 |
-
sort_order=SortOrder.ASC,
|
340 |
-
combine_columns=True)
|
341 |
-
assert list(result) == [{
|
342 |
-
UUID_COLUMN: '3',
|
343 |
-
'text': enriched_item('HI', {'test_signal': {
|
344 |
-
'len': 2,
|
345 |
-
'is_all_cap': True
|
346 |
-
}}),
|
347 |
-
}, {
|
348 |
-
UUID_COLUMN: '1',
|
349 |
-
'text': enriched_item('HEY', {'test_signal': {
|
350 |
-
'len': 3,
|
351 |
-
'is_all_cap': True
|
352 |
-
}}),
|
353 |
-
}, {
|
354 |
-
UUID_COLUMN: '2',
|
355 |
-
'text': enriched_item('everyone', {'test_signal': {
|
356 |
-
'len': 8,
|
357 |
-
'is_all_cap': False
|
358 |
-
}}),
|
359 |
-
}]
|
360 |
-
|
361 |
-
# Sort descending.
|
362 |
-
result = dataset.select_rows(['*', text_udf],
|
363 |
-
sort_by=[('text', 'test_signal', 'len')],
|
364 |
-
sort_order=SortOrder.DESC,
|
365 |
-
combine_columns=True)
|
366 |
-
assert list(result) == [{
|
367 |
-
UUID_COLUMN: '2',
|
368 |
-
'text': enriched_item('everyone', {'test_signal': {
|
369 |
-
'len': 8,
|
370 |
-
'is_all_cap': False
|
371 |
-
}}),
|
372 |
-
}, {
|
373 |
-
UUID_COLUMN: '1',
|
374 |
-
'text': enriched_item('HEY', {'test_signal': {
|
375 |
-
'len': 3,
|
376 |
-
'is_all_cap': True
|
377 |
-
}}),
|
378 |
-
}, {
|
379 |
-
UUID_COLUMN: '3',
|
380 |
-
'text': enriched_item('HI', {'test_signal': {
|
381 |
-
'len': 2,
|
382 |
-
'is_all_cap': True
|
383 |
-
}}),
|
384 |
-
}]
|
385 |
-
|
386 |
-
|
387 |
-
def test_sort_by_primitive_udf_alias_no_repeated(make_test_data: TestDataMaker) -> None:
|
388 |
-
dataset = make_test_data([{
|
389 |
-
UUID_COLUMN: '1',
|
390 |
-
'text': 'HEY'
|
391 |
-
}, {
|
392 |
-
UUID_COLUMN: '2',
|
393 |
-
'text': 'everyone'
|
394 |
-
}, {
|
395 |
-
UUID_COLUMN: '3',
|
396 |
-
'text': 'HI'
|
397 |
-
}])
|
398 |
-
|
399 |
-
# Equivalent to: SELECT `TestPrimitiveSignal(text) AS udf`.
|
400 |
-
text_udf = Column('text', signal_udf=TestPrimitiveSignal(), alias='udf')
|
401 |
-
# Sort by the primitive value returned by the udf.
|
402 |
-
result = dataset.select_rows(['*', text_udf], sort_by=['udf'], sort_order=SortOrder.ASC)
|
403 |
-
assert list(result) == [{
|
404 |
-
UUID_COLUMN: '3',
|
405 |
-
'text': 'HI',
|
406 |
-
'udf': 3
|
407 |
-
}, {
|
408 |
-
UUID_COLUMN: '1',
|
409 |
-
'text': 'HEY',
|
410 |
-
'udf': 4
|
411 |
-
}, {
|
412 |
-
UUID_COLUMN: '2',
|
413 |
-
'text': 'everyone',
|
414 |
-
'udf': 9
|
415 |
-
}]
|
416 |
-
|
417 |
-
|
418 |
-
def test_sort_by_source_non_leaf_errors(make_test_data: TestDataMaker) -> None:
|
419 |
-
dataset = make_test_data([{
|
420 |
-
UUID_COLUMN: '1',
|
421 |
-
'vals': [7, 1]
|
422 |
-
}, {
|
423 |
-
UUID_COLUMN: '2',
|
424 |
-
'vals': [3, 4]
|
425 |
-
}, {
|
426 |
-
UUID_COLUMN: '3',
|
427 |
-
'vals': [9, 0]
|
428 |
-
}])
|
429 |
-
|
430 |
-
# Sort by repeated.
|
431 |
-
with pytest.raises(ValueError, match='Unable to sort by path'):
|
432 |
-
dataset.select_rows(columns=[UUID_COLUMN], sort_by=['vals'], sort_order=SortOrder.ASC)
|
433 |
-
|
434 |
-
|
435 |
-
def test_sort_by_source_no_alias_repeated(make_test_data: TestDataMaker) -> None:
|
436 |
-
dataset = make_test_data([{
|
437 |
-
UUID_COLUMN: '1',
|
438 |
-
'vals': [[{
|
439 |
-
'score': 7
|
440 |
-
}, {
|
441 |
-
'score': 1
|
442 |
-
}], [{
|
443 |
-
'score': 1
|
444 |
-
}, {
|
445 |
-
'score': 7
|
446 |
-
}]]
|
447 |
-
}, {
|
448 |
-
UUID_COLUMN: '2',
|
449 |
-
'vals': [[{
|
450 |
-
'score': 3
|
451 |
-
}, {
|
452 |
-
'score': 4
|
453 |
-
}]]
|
454 |
-
}, {
|
455 |
-
UUID_COLUMN: '3',
|
456 |
-
'vals': [[{
|
457 |
-
'score': 9
|
458 |
-
}, {
|
459 |
-
'score': 0
|
460 |
-
}]]
|
461 |
-
}])
|
462 |
-
|
463 |
-
# Sort by repeated 'vals'.
|
464 |
-
result = dataset.select_rows(
|
465 |
-
columns=[UUID_COLUMN, 'vals'], sort_by=['vals.*.*.score'], sort_order=SortOrder.ASC)
|
466 |
-
assert list(result) == [{
|
467 |
-
UUID_COLUMN: '3',
|
468 |
-
'vals': [[{
|
469 |
-
'score': 9
|
470 |
-
}, {
|
471 |
-
'score': 0
|
472 |
-
}]]
|
473 |
-
}, {
|
474 |
-
UUID_COLUMN: '1',
|
475 |
-
'vals': [[{
|
476 |
-
'score': 7
|
477 |
-
}, {
|
478 |
-
'score': 1
|
479 |
-
}], [{
|
480 |
-
'score': 1
|
481 |
-
}, {
|
482 |
-
'score': 7
|
483 |
-
}]]
|
484 |
-
}, {
|
485 |
-
UUID_COLUMN: '2',
|
486 |
-
'vals': [[{
|
487 |
-
'score': 3
|
488 |
-
}, {
|
489 |
-
'score': 4
|
490 |
-
}]]
|
491 |
-
}]
|
492 |
-
|
493 |
-
result = dataset.select_rows(
|
494 |
-
columns=[UUID_COLUMN, 'vals'], sort_by=['vals.*.*.score'], sort_order=SortOrder.DESC)
|
495 |
-
assert list(result) == [{
|
496 |
-
UUID_COLUMN: '3',
|
497 |
-
'vals': [[{
|
498 |
-
'score': 9
|
499 |
-
}, {
|
500 |
-
'score': 0
|
501 |
-
}]]
|
502 |
-
}, {
|
503 |
-
UUID_COLUMN: '1',
|
504 |
-
'vals': [[{
|
505 |
-
'score': 7
|
506 |
-
}, {
|
507 |
-
'score': 1
|
508 |
-
}], [{
|
509 |
-
'score': 1
|
510 |
-
}, {
|
511 |
-
'score': 7
|
512 |
-
}]]
|
513 |
-
}, {
|
514 |
-
UUID_COLUMN: '2',
|
515 |
-
'vals': [[{
|
516 |
-
'score': 3
|
517 |
-
}, {
|
518 |
-
'score': 4
|
519 |
-
}]]
|
520 |
-
}]
|
521 |
-
|
522 |
-
|
523 |
-
def test_sort_by_source_alias_repeated(make_test_data: TestDataMaker) -> None:
|
524 |
-
dataset = make_test_data([{
|
525 |
-
UUID_COLUMN: '1',
|
526 |
-
'vals': [[7, 1], [1, 7]]
|
527 |
-
}, {
|
528 |
-
UUID_COLUMN: '2',
|
529 |
-
'vals': [[3], [11]]
|
530 |
-
}, {
|
531 |
-
UUID_COLUMN: '3',
|
532 |
-
'vals': [[9, 0]]
|
533 |
-
}])
|
534 |
-
|
535 |
-
# Sort by repeated 'vals'.
|
536 |
-
result = dataset.select_rows(
|
537 |
-
columns=[UUID_COLUMN, Column('vals', alias='scores')],
|
538 |
-
sort_by=['scores.*.*'],
|
539 |
-
sort_order=SortOrder.ASC)
|
540 |
-
assert list(result) == [{
|
541 |
-
UUID_COLUMN: '3',
|
542 |
-
'scores': [[9, 0]]
|
543 |
-
}, {
|
544 |
-
UUID_COLUMN: '1',
|
545 |
-
'scores': [[7, 1], [1, 7]]
|
546 |
-
}, {
|
547 |
-
UUID_COLUMN: '2',
|
548 |
-
'scores': [[3], [11]]
|
549 |
-
}]
|
550 |
-
|
551 |
-
result = dataset.select_rows(
|
552 |
-
columns=[UUID_COLUMN, Column('vals', alias='scores')],
|
553 |
-
sort_by=['scores.*.*'],
|
554 |
-
sort_order=SortOrder.DESC)
|
555 |
-
assert list(result) == [{
|
556 |
-
UUID_COLUMN: '2',
|
557 |
-
'scores': [[3], [11]]
|
558 |
-
}, {
|
559 |
-
UUID_COLUMN: '3',
|
560 |
-
'scores': [[9, 0]]
|
561 |
-
}, {
|
562 |
-
UUID_COLUMN: '1',
|
563 |
-
'scores': [[7, 1], [1, 7]]
|
564 |
-
}]
|
565 |
-
|
566 |
-
|
567 |
-
def test_sort_by_udf_alias_repeated(make_test_data: TestDataMaker) -> None:
|
568 |
-
dataset = make_test_data([{
|
569 |
-
UUID_COLUMN: '1',
|
570 |
-
'text': 'HEY'
|
571 |
-
}, {
|
572 |
-
UUID_COLUMN: '2',
|
573 |
-
'text': 'everyone'
|
574 |
-
}, {
|
575 |
-
UUID_COLUMN: '3',
|
576 |
-
'text': 'HI'
|
577 |
-
}])
|
578 |
-
|
579 |
-
# Equivalent to: SELECT `NestedArraySignal(text) AS udf`.
|
580 |
-
text_udf = Column('text', signal_udf=NestedArraySignal(), alias='udf')
|
581 |
-
# Sort by `udf.*.*`, where `udf` is an alias to `NestedArraySignal(text)`.
|
582 |
-
result = dataset.select_rows(['*', text_udf], sort_by=['udf.*.*'], sort_order=SortOrder.ASC)
|
583 |
-
assert list(result) == [{
|
584 |
-
UUID_COLUMN: '3',
|
585 |
-
'text': 'HI',
|
586 |
-
'udf': [[3], [2]]
|
587 |
-
}, {
|
588 |
-
UUID_COLUMN: '1',
|
589 |
-
'text': 'HEY',
|
590 |
-
'udf': [[4], [3]]
|
591 |
-
}, {
|
592 |
-
UUID_COLUMN: '2',
|
593 |
-
'text': 'everyone',
|
594 |
-
'udf': [[9], [8]]
|
595 |
-
}]
|
596 |
-
result = dataset.select_rows(['*', text_udf], sort_by=['udf.*.*'], sort_order=SortOrder.DESC)
|
597 |
-
assert list(result) == [{
|
598 |
-
UUID_COLUMN: '2',
|
599 |
-
'text': 'everyone',
|
600 |
-
'udf': [[9], [8]]
|
601 |
-
}, {
|
602 |
-
UUID_COLUMN: '1',
|
603 |
-
'text': 'HEY',
|
604 |
-
'udf': [[4], [3]]
|
605 |
-
}, {
|
606 |
-
UUID_COLUMN: '3',
|
607 |
-
'text': 'HI',
|
608 |
-
'udf': [[3], [2]]
|
609 |
-
}]
|
610 |
-
|
611 |
-
|
612 |
-
def test_sort_by_complex_signal_udf_alias_called_on_repeated(make_test_data: TestDataMaker) -> None:
|
613 |
-
dataset = make_test_data([{
|
614 |
-
UUID_COLUMN: '1',
|
615 |
-
'texts': [{
|
616 |
-
'text': 'eardrop'
|
617 |
-
}, {
|
618 |
-
'text': 'I'
|
619 |
-
}]
|
620 |
-
}, {
|
621 |
-
UUID_COLUMN: '2',
|
622 |
-
'texts': [{
|
623 |
-
'text': 'hey'
|
624 |
-
}, {
|
625 |
-
'text': 'CARS'
|
626 |
-
}]
|
627 |
-
}, {
|
628 |
-
UUID_COLUMN: '3',
|
629 |
-
'texts': [{
|
630 |
-
'text': 'everyone'
|
631 |
-
}, {
|
632 |
-
'text': ''
|
633 |
-
}]
|
634 |
-
}])
|
635 |
-
|
636 |
-
# Equivalent to: SELECT `TestSignal(texts.*.text) AS udf`.
|
637 |
-
texts_udf = Column('texts.*.text', signal_udf=TestSignal(), alias='udf')
|
638 |
-
# Sort by `udf.len`, where `udf` is an alias to `TestSignal(texts.*.text)`.
|
639 |
-
result = dataset.select_rows(['*', texts_udf],
|
640 |
-
sort_by=['udf.len'],
|
641 |
-
sort_order=SortOrder.ASC,
|
642 |
-
combine_columns=True)
|
643 |
-
assert list(result) == [{
|
644 |
-
UUID_COLUMN: '3',
|
645 |
-
'texts': [{
|
646 |
-
'text': enriched_item('everyone', {'test_signal': {
|
647 |
-
'len': 8,
|
648 |
-
'is_all_cap': False
|
649 |
-
}})
|
650 |
-
}, {
|
651 |
-
'text': enriched_item('', {'test_signal': {
|
652 |
-
'len': 0,
|
653 |
-
'is_all_cap': False
|
654 |
-
}})
|
655 |
-
}]
|
656 |
-
}, {
|
657 |
-
UUID_COLUMN: '1',
|
658 |
-
'texts': [{
|
659 |
-
'text': enriched_item('eardrop', {'test_signal': {
|
660 |
-
'len': 7,
|
661 |
-
'is_all_cap': False
|
662 |
-
}})
|
663 |
-
}, {
|
664 |
-
'text': enriched_item('I', {'test_signal': {
|
665 |
-
'len': 1,
|
666 |
-
'is_all_cap': True
|
667 |
-
}})
|
668 |
-
}]
|
669 |
-
}, {
|
670 |
-
UUID_COLUMN: '2',
|
671 |
-
'texts': [{
|
672 |
-
'text': enriched_item('hey', {'test_signal': {
|
673 |
-
'len': 3,
|
674 |
-
'is_all_cap': False
|
675 |
-
}})
|
676 |
-
}, {
|
677 |
-
'text': enriched_item('CARS', {'test_signal': {
|
678 |
-
'len': 4,
|
679 |
-
'is_all_cap': True
|
680 |
-
}})
|
681 |
-
}]
|
682 |
-
}]
|
683 |
-
|
684 |
-
|
685 |
-
def test_sort_by_primitive_signal_udf_alias_called_on_repeated(
|
686 |
-
make_test_data: TestDataMaker) -> None:
|
687 |
-
dataset = make_test_data([{
|
688 |
-
UUID_COLUMN: '1',
|
689 |
-
'texts': [{
|
690 |
-
'text': 'eardrop'
|
691 |
-
}, {
|
692 |
-
'text': 'I'
|
693 |
-
}]
|
694 |
-
}, {
|
695 |
-
UUID_COLUMN: '2',
|
696 |
-
'texts': [{
|
697 |
-
'text': 'hey'
|
698 |
-
}, {
|
699 |
-
'text': 'CARS'
|
700 |
-
}]
|
701 |
-
}, {
|
702 |
-
UUID_COLUMN: '3',
|
703 |
-
'texts': [{
|
704 |
-
'text': 'everyone'
|
705 |
-
}, {
|
706 |
-
'text': ''
|
707 |
-
}]
|
708 |
-
}])
|
709 |
-
|
710 |
-
# Equivalent to: SELECT `TestPrimitiveSignal(texts.*.text) AS udf`.
|
711 |
-
texts_udf = Column('texts.*.text', signal_udf=TestPrimitiveSignal(), alias='udf')
|
712 |
-
# Sort by `udf`, where `udf` is an alias to `TestPrimitiveSignal(texts.*.text)`.
|
713 |
-
result = dataset.select_rows(['*', texts_udf],
|
714 |
-
sort_by=['udf'],
|
715 |
-
sort_order=SortOrder.ASC,
|
716 |
-
combine_columns=True)
|
717 |
-
assert list(result) == [{
|
718 |
-
UUID_COLUMN: '3',
|
719 |
-
'texts': [{
|
720 |
-
'text': enriched_item('everyone', {'primitive_signal': 9})
|
721 |
-
}, {
|
722 |
-
'text': enriched_item('', {'primitive_signal': 1})
|
723 |
-
}]
|
724 |
-
}, {
|
725 |
-
UUID_COLUMN: '1',
|
726 |
-
'texts': [{
|
727 |
-
'text': enriched_item('eardrop', {'primitive_signal': 8})
|
728 |
-
}, {
|
729 |
-
'text': enriched_item('I', {'primitive_signal': 2})
|
730 |
-
}]
|
731 |
-
}, {
|
732 |
-
UUID_COLUMN: '2',
|
733 |
-
'texts': [{
|
734 |
-
'text': enriched_item('hey', {'primitive_signal': 4})
|
735 |
-
}, {
|
736 |
-
'text': enriched_item('CARS', {'primitive_signal': 5})
|
737 |
-
}]
|
738 |
-
}]
|
739 |
-
result = dataset.select_rows(['*', texts_udf],
|
740 |
-
sort_by=['udf'],
|
741 |
-
sort_order=SortOrder.DESC,
|
742 |
-
combine_columns=True)
|
743 |
-
assert list(result) == [{
|
744 |
-
UUID_COLUMN: '3',
|
745 |
-
'texts': [{
|
746 |
-
'text': enriched_item('everyone', {'primitive_signal': 9})
|
747 |
-
}, {
|
748 |
-
'text': enriched_item('', {'primitive_signal': 1})
|
749 |
-
}]
|
750 |
-
}, {
|
751 |
-
UUID_COLUMN: '1',
|
752 |
-
'texts': [{
|
753 |
-
'text': enriched_item('eardrop', {'primitive_signal': 8})
|
754 |
-
}, {
|
755 |
-
'text': enriched_item('I', {'primitive_signal': 2})
|
756 |
-
}]
|
757 |
-
}, {
|
758 |
-
UUID_COLUMN: '2',
|
759 |
-
'texts': [{
|
760 |
-
'text': enriched_item('hey', {'primitive_signal': 4})
|
761 |
-
}, {
|
762 |
-
'text': enriched_item('CARS', {'primitive_signal': 5})
|
763 |
-
}]
|
764 |
-
}]
|
765 |
-
|
766 |
-
|
767 |
-
class TopKEmbedding(TextEmbeddingSignal):
|
768 |
-
"""A test embed function."""
|
769 |
-
name = 'topk_embedding'
|
770 |
-
|
771 |
-
def compute(self, data: Iterable[RichData]) -> Iterable[Item]:
|
772 |
-
"""Call the embedding function."""
|
773 |
-
for example in data:
|
774 |
-
example = cast(str, example)
|
775 |
-
emb_spans: list[Item] = []
|
776 |
-
for i, score in enumerate(example.split('_')):
|
777 |
-
start, end = i * 2, i * 2 + 1
|
778 |
-
vector = np.array([int(score)])
|
779 |
-
emb_spans.append(lilac_embedding(start, end, vector))
|
780 |
-
yield emb_spans
|
781 |
-
|
782 |
-
|
783 |
-
class TopKSignal(TextEmbeddingModelSignal):
|
784 |
-
"""Compute scores along a given concept for documents."""
|
785 |
-
name = 'topk_signal'
|
786 |
-
|
787 |
-
_query = np.array([1])
|
788 |
-
|
789 |
-
def fields(self) -> Field:
|
790 |
-
return field('float32')
|
791 |
-
|
792 |
-
@override
|
793 |
-
def vector_compute(self, keys: Iterable[VectorKey],
|
794 |
-
vector_store: VectorStore) -> Iterable[Optional[Item]]:
|
795 |
-
text_embeddings = vector_store.get(keys)
|
796 |
-
dot_products = text_embeddings.dot(self._query).reshape(-1)
|
797 |
-
return dot_products.tolist()
|
798 |
-
|
799 |
-
@override
|
800 |
-
def vector_compute_topk(
|
801 |
-
self,
|
802 |
-
topk: int,
|
803 |
-
vector_store: VectorStore,
|
804 |
-
keys: Optional[Iterable[VectorKey]] = None) -> Sequence[tuple[VectorKey, Optional[Item]]]:
|
805 |
-
return vector_store.topk(self._query, topk, keys)
|
806 |
-
|
807 |
-
|
808 |
-
def test_sort_by_topk_embedding_udf(make_test_data: TestDataMaker) -> None:
|
809 |
-
dataset = make_test_data([{
|
810 |
-
UUID_COLUMN: '1',
|
811 |
-
'scores': '8_1',
|
812 |
-
}, {
|
813 |
-
UUID_COLUMN: '2',
|
814 |
-
'scores': '3_5'
|
815 |
-
}, {
|
816 |
-
UUID_COLUMN: '3',
|
817 |
-
'scores': '9_7'
|
818 |
-
}])
|
819 |
-
|
820 |
-
dataset.compute_signal(TopKEmbedding(), 'scores')
|
821 |
-
|
822 |
-
# Equivalent to: SELECT `TopKSignal(scores, embedding='...') AS udf`.
|
823 |
-
text_udf = Column('scores', signal_udf=TopKSignal(embedding='topk_embedding'), alias='udf')
|
824 |
-
# Sort by `udf`, where `udf` is an alias to `TopKSignal(scores, embedding='...')`.
|
825 |
-
result = dataset.select_rows(['*', text_udf], sort_by=['udf'], sort_order=SortOrder.DESC, limit=3)
|
826 |
-
assert list(result) == [{
|
827 |
-
UUID_COLUMN: '3',
|
828 |
-
'scores': enriched_item(
|
829 |
-
'9_7', {'topk_embedding': [lilac_embedding(0, 1, None),
|
830 |
-
lilac_embedding(2, 3, None)]}),
|
831 |
-
'udf': [9.0, 7.0]
|
832 |
-
}, {
|
833 |
-
UUID_COLUMN: '1',
|
834 |
-
'scores': enriched_item(
|
835 |
-
'8_1', {'topk_embedding': [lilac_embedding(0, 1, None),
|
836 |
-
lilac_embedding(2, 3, None)]}),
|
837 |
-
'udf': [8.0, 1.0]
|
838 |
-
}]
|
839 |
-
|
840 |
-
# Same but set limit to 4.
|
841 |
-
result = dataset.select_rows(['*', text_udf], sort_by=['udf'], sort_order=SortOrder.DESC, limit=4)
|
842 |
-
assert list(result) == [{
|
843 |
-
UUID_COLUMN: '3',
|
844 |
-
'scores': enriched_item(
|
845 |
-
'9_7', {'topk_embedding': [lilac_embedding(0, 1, None),
|
846 |
-
lilac_embedding(2, 3, None)]}),
|
847 |
-
'udf': [9.0, 7.0]
|
848 |
-
}, {
|
849 |
-
UUID_COLUMN: '1',
|
850 |
-
'scores': enriched_item(
|
851 |
-
'8_1', {'topk_embedding': [lilac_embedding(0, 1, None),
|
852 |
-
lilac_embedding(2, 3, None)]}),
|
853 |
-
'udf': [8.0, 1.0]
|
854 |
-
}, {
|
855 |
-
UUID_COLUMN: '2',
|
856 |
-
'scores': enriched_item(
|
857 |
-
'3_5', {'topk_embedding': [lilac_embedding(0, 1, None),
|
858 |
-
lilac_embedding(2, 3, None)]}),
|
859 |
-
'udf': [3.0, 5.0]
|
860 |
-
}]
|
861 |
-
|
862 |
-
|
863 |
-
def test_sort_by_topk_udf_with_filter(make_test_data: TestDataMaker) -> None:
|
864 |
-
dataset = make_test_data([{
|
865 |
-
UUID_COLUMN: '1',
|
866 |
-
'scores': '8_1',
|
867 |
-
'active': True
|
868 |
-
}, {
|
869 |
-
UUID_COLUMN: '2',
|
870 |
-
'scores': '3_5',
|
871 |
-
'active': True
|
872 |
-
}, {
|
873 |
-
UUID_COLUMN: '3',
|
874 |
-
'scores': '9_7',
|
875 |
-
'active': False
|
876 |
-
}])
|
877 |
-
|
878 |
-
dataset.compute_signal(TopKEmbedding(), 'scores')
|
879 |
-
|
880 |
-
# Equivalent to: SELECT `TopKSignal(scores, embedding='...') AS udf`.
|
881 |
-
text_udf = Column('scores', signal_udf=TopKSignal(embedding='topk_embedding'), alias='udf')
|
882 |
-
# Sort by `udf`, where `udf` is an alias to `TopKSignal(scores, embedding='...')`.
|
883 |
-
result = dataset.select_rows(['*', text_udf],
|
884 |
-
sort_by=['udf'],
|
885 |
-
filters=[('active', BinaryOp.EQUALS, True)],
|
886 |
-
sort_order=SortOrder.DESC,
|
887 |
-
limit=2)
|
888 |
-
# We make sure that '3' is not in the result, because it is not active, even though it has the
|
889 |
-
# highest topk score.
|
890 |
-
assert list(result) == [{
|
891 |
-
UUID_COLUMN: '1',
|
892 |
-
'active': True,
|
893 |
-
'scores': enriched_item(
|
894 |
-
'8_1', {'topk_embedding': [lilac_embedding(0, 1, None),
|
895 |
-
lilac_embedding(2, 3, None)]}),
|
896 |
-
'udf': [8.0, 1.0]
|
897 |
-
}, {
|
898 |
-
UUID_COLUMN: '2',
|
899 |
-
'active': True,
|
900 |
-
'scores': enriched_item(
|
901 |
-
'3_5', {'topk_embedding': [lilac_embedding(0, 1, None),
|
902 |
-
lilac_embedding(2, 3, None)]}),
|
903 |
-
'udf': [3.0, 5.0]
|
904 |
-
}]
|
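
Aside, not part of the commit: every deleted sort test above reduces to one call shape on `Dataset.select_rows`. A minimal usage sketch, assuming a lilac `dataset` object like the one the `make_test_data` fixture builds, with `TestSignal` registered as in the deleted file:

# Sketch: sort rows by a sub-field of a signal computed as a UDF, per the tests above.
# Assumes `dataset`, `Column`, `SortOrder` and `TestSignal` from the deleted file's context.
udf = Column('text', signal_udf=TestSignal(), alias='udf')
rows = dataset.select_rows(['*', udf], sort_by=['udf.len'], sort_order=SortOrder.DESC, limit=2)
# Each row carries the original columns plus the aliased signal output, largest `len` first.
print(list(rows))
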
src/data/dataset_select_rows_udf_test.py
DELETED
@@ -1,404 +0,0 @@
"""Tests for dataset.select_rows(udf_col)."""

from typing import Iterable, Optional, cast

import numpy as np
import pytest
from typing_extensions import override

from ..embeddings.vector_store import VectorStore
from ..schema import UUID_COLUMN, VALUE_KEY, Field, Item, RichData, VectorKey, field
from ..signals.signal import (
    TextEmbeddingModelSignal,
    TextEmbeddingSignal,
    TextSignal,
    TextSplitterSignal,
    clear_signal_registry,
    register_signal,
)
from .dataset import BinaryFilterTuple, BinaryOp, Column, val
from .dataset_test_utils import TestDataMaker, enriched_item
from .dataset_utils import lilac_embedding, lilac_span

EMBEDDINGS: list[tuple[str, list[float]]] = [('hello.', [1.0, 0.0, 0.0]),
                                             ('hello2.', [1.0, 1.0, 0.0]),
                                             ('hello world.', [1.0, 1.0, 1.0]),
                                             ('hello world2.', [2.0, 1.0, 1.0])]

STR_EMBEDDINGS: dict[str, list[float]] = {text: embedding for text, embedding in EMBEDDINGS}


class TestEmbedding(TextEmbeddingSignal):
  """A test embed function."""
  name = 'test_embedding'

  @override
  def compute(self, data: Iterable[RichData]) -> Iterable[Item]:
    """Call the embedding function."""
    for example in data:
      yield [lilac_embedding(0, len(example), np.array(STR_EMBEDDINGS[cast(str, example)]))]


class LengthSignal(TextSignal):
  name = 'length_signal'

  _call_count: int = 0

  def fields(self) -> Field:
    return field('int32')

  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
    for text_content in data:
      self._call_count += 1
      yield len(text_content)


class TestSignal(TextSignal):
  name = 'test_signal'

  @override
  def fields(self) -> Field:
    return field(fields={'len': 'int32', 'flen': 'float32'})

  @override
  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
    return [{'len': len(text_content), 'flen': float(len(text_content))} for text_content in data]


class TestEmbeddingSumSignal(TextEmbeddingModelSignal):
  """Sums the embeddings to return a single floating point value."""
  name = 'test_embedding_sum'

  @override
  def fields(self) -> Field:
    return field('float32')

  @override
  def vector_compute(self, keys: Iterable[VectorKey], vector_store: VectorStore) -> Iterable[Item]:
    # The signal just sums the values of the embedding.
    embedding_sums = vector_store.get(keys).sum(axis=1)
    for embedding_sum in embedding_sums.tolist():
      yield embedding_sum


class ComputedKeySignal(TextSignal):
  name = 'computed_key'

  @override
  def fields(self) -> Field:
    return field('int64')

  @override
  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
    for text in data:
      yield 1

  def key(self, is_computed_signal: Optional[bool] = False) -> str:
    return f'key_{is_computed_signal}'


@pytest.fixture(scope='module', autouse=True)
def setup_teardown() -> Iterable[None]:
  # Setup.
  register_signal(LengthSignal)
  register_signal(TestSplitter)
  register_signal(TestEmbedding)
  register_signal(TestSignal)
  register_signal(TestEmbeddingSumSignal)
  register_signal(ComputedKeySignal)

  # Unit test runs.
  yield
  # Teardown.
  clear_signal_registry()


def test_udf(make_test_data: TestDataMaker) -> None:
  dataset = make_test_data([{
    UUID_COLUMN: '1',
    'text': 'hello'
  }, {
    UUID_COLUMN: '2',
    'text': 'everybody'
  }])

  signal_col = Column('text', signal_udf=TestSignal())
  result = dataset.select_rows(['text', signal_col])

  assert list(result) == [{
    UUID_COLUMN: '1',
    'text': 'hello',
    'test_signal(text)': {
      'len': 5,
      'flen': 5.0
    }
  }, {
    UUID_COLUMN: '2',
    'text': 'everybody',
    'test_signal(text)': {
      'len': 9,
      'flen': 9.0
    }
  }]


def test_udf_with_filters(make_test_data: TestDataMaker) -> None:
  dataset = make_test_data([{
    UUID_COLUMN: '1',
    'text': 'hello'
  }, {
    UUID_COLUMN: '2',
    'text': 'everybody'
  }])

  signal_col = Column('text', signal_udf=TestSignal())
  # Filter by source feature.
  filters: list[BinaryFilterTuple] = [('text', BinaryOp.EQUALS, 'everybody')]
  result = dataset.select_rows(['text', signal_col], filters=filters)
  assert list(result) == [{
    UUID_COLUMN: '2',
    'text': 'everybody',
    'test_signal(text)': {
      'len': 9,
      'flen': 9.0
    }
  }]


def test_udf_with_uuid_filter(make_test_data: TestDataMaker) -> None:

  dataset = make_test_data([{
    UUID_COLUMN: '1',
    'text': 'hello'
  }, {
    UUID_COLUMN: '2',
    'text': 'everybody'
  }])

  # Filter by a specific UUID.
  filters: list[BinaryFilterTuple] = [(UUID_COLUMN, BinaryOp.EQUALS, '1')]
  udf_col = Column('text', signal_udf=LengthSignal())
  result = dataset.select_rows(['text', udf_col], filters=filters)
  assert list(result) == [{UUID_COLUMN: '1', 'text': 'hello', 'length_signal(text)': 5}]
  assert cast(LengthSignal, udf_col.signal_udf)._call_count == 1

  filters = [(UUID_COLUMN, BinaryOp.EQUALS, '2')]
  result = dataset.select_rows(['text', udf_col], filters=filters)
  assert list(result) == [{UUID_COLUMN: '2', 'text': 'everybody', 'length_signal(text)': 9}]
  assert cast(LengthSignal, udf_col.signal_udf)._call_count == 1 + 1

  # No filters.
  result = dataset.select_rows(['text', udf_col])
  assert list(result) == [{
    UUID_COLUMN: '1',
    'text': 'hello',
    'length_signal(text)': 5
  }, {
    UUID_COLUMN: '2',
    'text': 'everybody',
    'length_signal(text)': 9
  }]
  assert cast(LengthSignal, udf_col.signal_udf)._call_count == 2 + 2


def test_udf_with_uuid_filter_repeated(make_test_data: TestDataMaker) -> None:

  dataset = make_test_data([{
    UUID_COLUMN: '1',
    'text': ['hello', 'hi']
  }, {
    UUID_COLUMN: '2',
    'text': ['everybody', 'bye', 'test']
  }])

  # Filter by a specific UUID.
  filters: list[BinaryFilterTuple] = [(UUID_COLUMN, BinaryOp.EQUALS, '1')]
  udf_col = Column(('text', '*'), signal_udf=LengthSignal())
  result = dataset.select_rows(['text', udf_col], filters=filters)
  assert list(result) == [{
    UUID_COLUMN: '1',
    'text': ['hello', 'hi'],
    'length_signal(text)': [5, 2]
  }]
  assert cast(LengthSignal, udf_col.signal_udf)._call_count == 2

  # Filter by a specific UUID.
  filters = [(UUID_COLUMN, BinaryOp.EQUALS, '2')]
  result = dataset.select_rows(['text', udf_col], filters=filters)
  assert list(result) == [{
    UUID_COLUMN: '2',
    'text': ['everybody', 'bye', 'test'],
    'length_signal(text)': [9, 3, 4]
  }]
  assert cast(LengthSignal, udf_col.signal_udf)._call_count == 2 + 3


def test_udf_deeply_nested(make_test_data: TestDataMaker) -> None:
  dataset = make_test_data([{
    UUID_COLUMN: '1',
    'text': [['hello'], ['hi', 'bye']]
  }, {
    UUID_COLUMN: '2',
    'text': [['everybody', 'bye'], ['test']]
  }])

  udf_col = Column(('text', '*', '*'), signal_udf=LengthSignal())
  result = dataset.select_rows([udf_col])
  assert list(result) == [{
    UUID_COLUMN: '1',
    'length_signal(text.*)': [[5], [2, 3]]
  }, {
    UUID_COLUMN: '2',
    'length_signal(text.*)': [[9, 3], [4]]
  }]
  assert cast(LengthSignal, udf_col.signal_udf)._call_count == 6


def test_udf_with_embedding(make_test_data: TestDataMaker) -> None:
  dataset = make_test_data([{
    UUID_COLUMN: '1',
    'text': 'hello.',
  }, {
    UUID_COLUMN: '2',
    'text': 'hello2.',
  }])

  dataset.compute_signal(TestEmbedding(), 'text')

  signal_col = Column('text', signal_udf=TestEmbeddingSumSignal(embedding='test_embedding'))
  result = dataset.select_rows([val('text'), signal_col])

  expected_result: list[Item] = [{
    UUID_COLUMN: '1',
    f'text.{VALUE_KEY}': 'hello.',
    'test_embedding_sum(text.test_embedding.*.embedding)': [1.0]
  }, {
    UUID_COLUMN: '2',
    f'text.{VALUE_KEY}': 'hello2.',
    'test_embedding_sum(text.test_embedding.*.embedding)': [2.0]
  }]
  assert list(result) == expected_result

  # Select rows with alias.
  signal_col = Column(
    'text', signal_udf=TestEmbeddingSumSignal(embedding='test_embedding'), alias='emb_sum')
  result = dataset.select_rows([val('text'), signal_col])
  expected_result = [{
    UUID_COLUMN: '1',
    f'text.{VALUE_KEY}': 'hello.',
    'emb_sum': [1.0]
  }, {
    UUID_COLUMN: '2',
    f'text.{VALUE_KEY}': 'hello2.',
    'emb_sum': [2.0]
  }]
  assert list(result) == expected_result


def test_udf_with_nested_embedding(make_test_data: TestDataMaker) -> None:
  dataset = make_test_data([{
    UUID_COLUMN: '1',
    'text': ['hello.', 'hello world.'],
  }, {
    UUID_COLUMN: '2',
    'text': ['hello world2.', 'hello2.'],
  }])

  dataset.compute_signal(TestEmbedding(), ('text', '*'))

  signal_col = Column(('text', '*'), signal_udf=TestEmbeddingSumSignal(embedding='test_embedding'))
  result = dataset.select_rows([val(('text', '*')), signal_col])
  expected_result = [{
    UUID_COLUMN: '1',
    f'text.*.{VALUE_KEY}': ['hello.', 'hello world.'],
    'test_embedding_sum(text.*.test_embedding.*.embedding)': [[1.0], [3.0]]
  }, {
    UUID_COLUMN: '2',
    f'text.*.{VALUE_KEY}': ['hello world2.', 'hello2.'],
    'test_embedding_sum(text.*.test_embedding.*.embedding)': [[4.0], [2.0]]
  }]
  assert list(result) == expected_result


def test_udf_throws_without_precomputing(make_test_data: TestDataMaker) -> None:
  dataset = make_test_data([{
    UUID_COLUMN: '1',
    'text': 'hello.',
  }, {
    UUID_COLUMN: '2',
    'text': 'hello2.',
  }])

  # Embedding is not precomputed, yet we ask for the embedding.

  signal_col = Column('text', signal_udf=TestEmbeddingSumSignal(embedding='test_embedding'))

  with pytest.raises(ValueError, match='Embedding signal "test_embedding" is not computed'):
    dataset.select_rows([val('text'), signal_col])


class TestSplitter(TextSplitterSignal):
  """Split documents into sentence by splitting on period."""
  name = 'test_splitter'

  @override
  def compute(self, data: Iterable[RichData]) -> Iterable[Item]:
    for text in data:
      if not isinstance(text, str):
        raise ValueError(f'Expected text to be a string, got {type(text)} instead.')
      result: list[Item] = []
      for sentence in text.split('.'):
        start = text.index(sentence)
        end = start + len(sentence)
        result.append(lilac_span(start, end))
      yield result


def test_udf_after_precomputed_split(make_test_data: TestDataMaker) -> None:
  dataset = make_test_data([{
    UUID_COLUMN: '1',
    'text': 'sentence 1. sentence 2 is longer',
  }, {
    UUID_COLUMN: '2',
    'text': 'sentence 1 is longer. sent2 is short',
  }])
  dataset.compute_signal(TestSplitter(), 'text')
  udf = Column('text', signal_udf=LengthSignal())
  result = dataset.select_rows(['*', udf], combine_columns=True)
  assert list(result) == [{
    UUID_COLUMN: '1',
    'text': enriched_item('sentence 1. sentence 2 is longer', {
      'length_signal': 32,
      'test_splitter': [lilac_span(0, 10), lilac_span(11, 32)]
    })
  }, {
    UUID_COLUMN: '2',
    'text': enriched_item('sentence 1 is longer. sent2 is short', {
      'length_signal': 36,
      'test_splitter': [lilac_span(0, 20), lilac_span(21, 36)]
    })
  }]


def test_is_computed_signal_key(make_test_data: TestDataMaker) -> None:
  dataset = make_test_data([{
    UUID_COLUMN: '1',
    'text': 'hello.',
  }, {
    UUID_COLUMN: '2',
    'text': 'hello2.',
  }])

  signal_col = Column('text', signal_udf=ComputedKeySignal())
  # Filter by source feature.
  filters: list[BinaryFilterTuple] = [('text', BinaryOp.EQUALS, 'everybody')]
  result = dataset.select_rows(['text', signal_col])
  assert list(result) == [{
    UUID_COLUMN: '1',
    'text': 'hello.',
    'key_False(text)': 1
  }, {
    UUID_COLUMN: '2',
    'text': 'hello2.',
    'key_False(text)': 1
  }]
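
A brief sketch of the UDF pattern the file above tested: wrap a Signal in a Column and `select_rows` computes it on the fly, without a precomputed column. `WordCountSignal` below is hypothetical, shaped like the deleted file's own `LengthSignal`, and the imports mirror that file's header:

# Sketch only: a hypothetical signal computed as a UDF at query time.
from typing import Iterable, Optional

from ..schema import Field, Item, RichData, field
from ..signals.signal import TextSignal
from .dataset import Column

class WordCountSignal(TextSignal):
  """Hypothetical signal: counts whitespace-separated words."""
  name = 'word_count'

  def fields(self) -> Field:
    return field('int32')

  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
    for text in data:
      yield len(str(text).split())

# `dataset` is assumed to come from the same `make_test_data` fixture as above.
rows = dataset.select_rows(['text', Column('text', signal_udf=WordCountSignal())])
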
src/data/dataset_stats_test.py
DELETED
@@ -1,125 +0,0 @@
"""Tests for dataset.stats()."""

from typing import Any, cast

import pytest
from pytest_mock import MockerFixture

from ..schema import UUID_COLUMN, Item, schema
from . import dataset_duckdb
from .dataset import StatsResult
from .dataset_test_utils import TestDataMaker

SIMPLE_ITEMS: list[Item] = [{
  UUID_COLUMN: '1',
  'str': 'a',
  'int': 1,
  'bool': False,
  'float': 3.0,
}, {
  UUID_COLUMN: '2',
  'str': 'b',
  'int': 2,
  'bool': True,
  'float': 2.0
}, {
  UUID_COLUMN: '3',
  'str': 'b',
  'int': 2,
  'bool': True,
  'float': 1.0
}, {
  UUID_COLUMN: '4',
  'float': float('nan')
}]


def test_simple_stats(make_test_data: TestDataMaker) -> None:
  dataset = make_test_data(SIMPLE_ITEMS)

  result = dataset.stats(leaf_path='str')
  assert result == StatsResult(
    path=('str',), total_count=3, approx_count_distinct=2, avg_text_length=1)

  result = dataset.stats(leaf_path='float')
  assert result == StatsResult(
    path=('float',), total_count=4, approx_count_distinct=4, min_val=1.0, max_val=3.0)

  result = dataset.stats(leaf_path='bool')
  assert result == StatsResult(path=('bool',), total_count=3, approx_count_distinct=2)

  result = dataset.stats(leaf_path='int')
  assert result == StatsResult(
    path=('int',), total_count=3, approx_count_distinct=2, min_val=1, max_val=2)


def test_nested_stats(make_test_data: TestDataMaker) -> None:
  nested_items: list[Item] = [
    {
      'name': 'Name1',
      'addresses': [{
        'zips': [5, 8]
      }]
    },
    {
      'name': 'Name2',
      'addresses': [{
        'zips': [3]
      }, {
        'zips': [11, 8]
      }]
    },
    {
      'name': 'Name2',
      'addresses': []
    },  # No addresses.
    {
      'name': 'Name2',
      'addresses': [{
        'zips': []
      }]
    }  # No zips in the first address.
  ]
  nested_schema = schema({
    UUID_COLUMN: 'string',
    'name': 'string',
    'addresses': [{
      'zips': ['int32']
    }]
  })
  dataset = make_test_data(nested_items, schema=nested_schema)

  result = dataset.stats(leaf_path='name')
  assert result == StatsResult(
    path=('name',), total_count=4, approx_count_distinct=2, avg_text_length=5)

  result = dataset.stats(leaf_path='addresses.*.zips.*')
  assert result == StatsResult(
    path=('addresses', '*', 'zips', '*'),
    total_count=5,
    approx_count_distinct=4,
    min_val=3,
    max_val=11)


def test_stats_approximation(make_test_data: TestDataMaker, mocker: MockerFixture) -> None:
  sample_size = 5
  mocker.patch(f'{dataset_duckdb.__name__}.SAMPLE_SIZE_DISTINCT_COUNT', sample_size)

  nested_items: list[Item] = [{'feature': str(i)} for i in range(sample_size * 10)]
  nested_schema = schema({UUID_COLUMN: 'string', 'feature': 'string'})
  dataset = make_test_data(nested_items, schema=nested_schema)

  result = dataset.stats(leaf_path='feature')
  assert result == StatsResult(
    path=('feature',), total_count=50, approx_count_distinct=50, avg_text_length=1)


def test_error_handling(make_test_data: TestDataMaker) -> None:
  dataset = make_test_data(SIMPLE_ITEMS)

  with pytest.raises(ValueError, match='leaf_path must be provided'):
    dataset.stats(cast(Any, None))

  with pytest.raises(ValueError, match='Leaf "\\(\'unknown\',\\)" not found in dataset'):
    dataset.stats(leaf_path='unknown')
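
For reference, the stats API the file above exercised reduces to a single call per leaf path. A minimal sketch, assuming a `dataset` built as in those tests:

# Sketch only: column statistics for a nested repeated leaf, per the tests above.
result = dataset.stats(leaf_path='addresses.*.zips.*')
# StatsResult fields the deleted tests asserted on:
print(result.total_count, result.approx_count_distinct, result.min_val, result.max_val)
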
src/data/dataset_test.py
DELETED
@@ -1,860 +0,0 @@
"""Implementation-agnostic tests of the Dataset DB API."""

from typing import Iterable, Optional, cast

import numpy as np
import pytest
from typing_extensions import override

from ..schema import UUID_COLUMN, VALUE_KEY, Field, Item, RichData, field, schema
from ..signals.signal import TextEmbeddingSignal, TextSignal, clear_signal_registry, register_signal
from .dataset import Column, DatasetManifest, val
from .dataset_test_utils import TEST_DATASET_NAME, TEST_NAMESPACE, TestDataMaker, enriched_item
from .dataset_utils import lilac_embedding

SIMPLE_ITEMS: list[Item] = [{
  UUID_COLUMN: '1',
  'str': 'a',
  'int': 1,
  'bool': False,
  'float': 3.0
}, {
  UUID_COLUMN: '2',
  'str': 'b',
  'int': 2,
  'bool': True,
  'float': 2.0
}, {
  UUID_COLUMN: '3',
  'str': 'b',
  'int': 2,
  'bool': True,
  'float': 1.0
}]

EMBEDDINGS: list[tuple[str, list[float]]] = [('hello.', [1.0, 0.0, 0.0]),
                                             ('hello2.', [1.0, 1.0, 0.0]),
                                             ('hello world.', [1.0, 1.0, 1.0]),
                                             ('hello world2.', [2.0, 1.0, 1.0])]

STR_EMBEDDINGS: dict[str, list[float]] = {text: embedding for text, embedding in EMBEDDINGS}


class TestEmbedding(TextEmbeddingSignal):
  """A test embed function."""
  name = 'test_embedding'

  @override
  def compute(self, data: Iterable[RichData]) -> Iterable[Item]:
    """Call the embedding function."""
    for example in data:
      yield [lilac_embedding(0, len(example), np.array(STR_EMBEDDINGS[cast(str, example)]))]


class LengthSignal(TextSignal):
  name = 'length_signal'

  _call_count: int = 0

  def fields(self) -> Field:
    return field('int32')

  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
    for text_content in data:
      self._call_count += 1
      yield len(text_content)


class TestSignal(TextSignal):
  name = 'test_signal'

  @override
  def fields(self) -> Field:
    return field(fields={'len': 'int32', 'flen': 'float32'})

  @override
  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
    return [{'len': len(text_content), 'flen': float(len(text_content))} for text_content in data]


@pytest.fixture(scope='module', autouse=True)
def setup_teardown() -> Iterable[None]:
  # Setup.
  register_signal(TestSignal)
  register_signal(LengthSignal)
  register_signal(SignalWithQuoteInIt)
  register_signal(SignalWithDoubleQuoteInIt)

  # Unit test runs.
  yield

  # Teardown.
  clear_signal_registry()


def test_select_all_columns(make_test_data: TestDataMaker) -> None:
  dataset = make_test_data(SIMPLE_ITEMS)

  result = dataset.select_rows()
  assert list(result) == SIMPLE_ITEMS


def test_select_subcols_with_dot_seperator(make_test_data: TestDataMaker) -> None:
  items: list[Item] = [{
    UUID_COLUMN: '1',
    'people': [{
      'name': 'A',
      'address': {
        'zip': 1
      }
    }, {
      'name': 'B',
      'address': {
        'zip': 2
      }
    }]
  }, {
    UUID_COLUMN: '2',
    'people': [{
      'name': 'C',
      'address': {
        'zip': 3
      }
    }]
  }]
  dataset = make_test_data(items)

  result = dataset.select_rows(['people.*.name', 'people.*.address.zip'])
  assert list(result) == [{
    UUID_COLUMN: '1',
    'people.*.name': ['A', 'B'],
    'people.*.address.zip': [1, 2]
  }, {
    UUID_COLUMN: '2',
    'people.*.name': ['C'],
    'people.*.address.zip': [3]
  }]

  result = dataset.select_rows(['people.*.address.zip'], combine_columns=True)
  assert list(result) == [{
    UUID_COLUMN: '1',
    'people': [{
      'address': {
        'zip': 1
      }
    }, {
      'address': {
        'zip': 2
      }
    }]
  }, {
    UUID_COLUMN: '2',
    'people': [{
      'address': {
        'zip': 3
      }
    }]
  }]

  result = dataset.select_rows(['people'])
  assert list(result) == items


def test_select_subcols_with_escaped_dot(make_test_data: TestDataMaker) -> None:
  items: list[Item] = [{
    UUID_COLUMN: '1',
    'people.new': [{
      'name': 'A'
    }, {
      'name': 'B'
    }]
  }, {
    UUID_COLUMN: '2',
    'people.new': [{
      'name': 'C'
    }]
  }]
  dataset = make_test_data(items)

  result = dataset.select_rows(['"people.new".*.name'])
  assert list(result) == [{
    UUID_COLUMN: '1',
    'people.new.*.name': ['A', 'B'],
  }, {
    UUID_COLUMN: '2',
    'people.new.*.name': ['C'],
  }]

  # Escape name even though it does not need to be.
  result = dataset.select_rows(['"people.new".*."name"'])
  assert list(result) == [{
    UUID_COLUMN: '1',
    'people.new.*.name': ['A', 'B'],
  }, {
    UUID_COLUMN: '2',
    'people.new.*.name': ['C'],
  }]


def test_select_star(make_test_data: TestDataMaker) -> None:
  items: list[Item] = [{
    UUID_COLUMN: '1',
    'name': 'A',
    'info': {
      'age': 40
    }
  }, {
    UUID_COLUMN: '2',
    'name': 'B',
    'info': {
      'age': 42
    }
  }]
  dataset = make_test_data(items)

  # Select *.
  result = dataset.select_rows(['*'])
  assert list(result) == items

  # Select (*,).
  result = dataset.select_rows([('*',)])
  assert list(result) == items

  # Select *, plus a redundant `info` column.
  result = dataset.select_rows(['*', 'info'])
  assert list(result) == [{
    UUID_COLUMN: '1',
    'name': 'A',
    'info': {
      'age': 40
    },
    'info_2': {
      'age': 40
    },
  }, {
    UUID_COLUMN: '2',
    'name': 'B',
    'info': {
      'age': 42
    },
    'info_2': {
      'age': 42
    },
  }]

  # Select * plus an inner `info.age` column.
  result = dataset.select_rows(['*', ('info', 'age')])
  assert list(result) == [{
    UUID_COLUMN: '1',
    'name': 'A',
    'info': {
      'age': 40
    },
    'info.age': 40
  }, {
    UUID_COLUMN: '2',
    'name': 'B',
    'info': {
      'age': 42
    },
    'info.age': 42
  }]


def test_select_star_with_combine_cols(make_test_data: TestDataMaker) -> None:
  items: list[Item] = [{
    UUID_COLUMN: '1',
    'name': 'A',
    'info': {
      'age': 40
    }
  }, {
    UUID_COLUMN: '2',
    'name': 'B',
    'info': {
      'age': 42
    }
  }]
  dataset = make_test_data(items)

  # Select *.
  result = dataset.select_rows(['*'], combine_columns=True)
  assert list(result) == items

  # Select *, plus a redundant `info` column.
  result = dataset.select_rows(['*', 'info'], combine_columns=True)
  assert list(result) == items

  # Select * plus an inner `info.age` column.
  result = dataset.select_rows(['*', ('info', 'age')], combine_columns=True)
  assert list(result) == items

  # Select *, plus redundant `name`, plus a udf.
  udf = Column('name', signal_udf=TestSignal())
  result = dataset.select_rows(['*', 'name', udf], combine_columns=True)

  assert list(result) == [{
    UUID_COLUMN: '1',
    'name': enriched_item('A', {'test_signal': {
      'len': 1,
      'flen': 1.0
    }}),
    'info': {
      'age': 40
    }
  }, {
    UUID_COLUMN: '2',
    'name': enriched_item('B', {'test_signal': {
      'len': 1,
      'flen': 1.0
    }}),
    'info': {
      'age': 42
    }
  }]


def test_select_ids(make_test_data: TestDataMaker) -> None:
  dataset = make_test_data(SIMPLE_ITEMS)

  result = dataset.select_rows([UUID_COLUMN])

  assert list(result) == [{UUID_COLUMN: '1'}, {UUID_COLUMN: '2'}, {UUID_COLUMN: '3'}]


def test_select_ids_with_limit_and_offset(make_test_data: TestDataMaker) -> None:
  items: list[Item] = [{UUID_COLUMN: str(i)} for i in range(10, 20)]
  dataset = make_test_data(items)

  result = dataset.select_rows([UUID_COLUMN], offset=1, limit=3)
  assert list(result) == [{UUID_COLUMN: '11'}, {UUID_COLUMN: '12'}, {UUID_COLUMN: '13'}]

  result = dataset.select_rows([UUID_COLUMN], offset=7, limit=2)
  assert list(result) == [{UUID_COLUMN: '17'}, {UUID_COLUMN: '18'}]

  result = dataset.select_rows([UUID_COLUMN], offset=9, limit=200)
  assert list(result) == [{UUID_COLUMN: '19'}]

  result = dataset.select_rows([UUID_COLUMN], offset=10, limit=200)
  assert list(result) == []


def test_columns(make_test_data: TestDataMaker) -> None:
  dataset = make_test_data(SIMPLE_ITEMS)

  result = dataset.select_rows(['str', 'float'])

  assert list(result) == [{
    UUID_COLUMN: '1',
    'str': 'a',
    'float': 3.0
  }, {
    UUID_COLUMN: '2',
    'str': 'b',
    'float': 2.0
  }, {
    UUID_COLUMN: '3',
    'str': 'b',
    'float': 1.0
  }]


def test_merge_values(make_test_data: TestDataMaker) -> None:
  dataset = make_test_data([{
    UUID_COLUMN: '1',
    'text': 'hello'
  }, {
    UUID_COLUMN: '2',
    'text': 'everybody'
  }])
  test_signal = TestSignal()
  dataset.compute_signal(test_signal, 'text')
  length_signal = LengthSignal()
  dataset.compute_signal(length_signal, 'text')

  result = dataset.select_rows(['text'])
  assert list(result) == [{
    UUID_COLUMN: '1',
    'text': enriched_item('hello', {
      'length_signal': 5,
      'test_signal': {
        'len': 5,
        'flen': 5.0
      }
    })
  }, {
    UUID_COLUMN: '2',
    'text': enriched_item('everybody', {
      'length_signal': 9,
      'test_signal': {
        'len': 9,
        'flen': 9.0
      }
    }),
  }]

  # Test subselection.
  result = dataset.select_rows(
    [val('text'), ('text', 'test_signal', 'flen'), ('text', 'test_signal', 'len')])
  assert list(result) == [{
    UUID_COLUMN: '1',
    f'text.{VALUE_KEY}': 'hello',
    'text.test_signal.flen': 5.0,
    'text.test_signal.len': 5
  }, {
    UUID_COLUMN: '2',
    f'text.{VALUE_KEY}': 'everybody',
    'text.test_signal.flen': 9.0,
    'text.test_signal.len': 9
  }]

  # Test subselection with combine_columns=True.
  result = dataset.select_rows(
    ['text', ('text', 'test_signal', 'flen'), ('text', 'test_signal', 'len')], combine_columns=True)
  assert list(result) == [{
    UUID_COLUMN: '1',
    'text': enriched_item('hello', {
      'length_signal': 5,
      'test_signal': {
        'len': 5,
        'flen': 5.0
      }
    })
  }, {
    UUID_COLUMN: '2',
    'text': enriched_item('everybody', {
      'length_signal': 9,
      'test_signal': {
        'len': 9,
        'flen': 9.0
      }
    }),
  }]

  # Test subselection with aliasing.
  result = dataset.select_rows(
    columns=[val('text'), Column(('text', 'test_signal', 'len'), alias='metadata')])
  assert list(result) == [{
    UUID_COLUMN: '1',
    f'text.{VALUE_KEY}': 'hello',
    'metadata': 5
  }, {
    UUID_COLUMN: '2',
    f'text.{VALUE_KEY}': 'everybody',
    'metadata': 9
  }]

  result = dataset.select_rows(columns=[Column(('text'), alias='text_enrichment')])
  assert list(result) == [{
    UUID_COLUMN: '1',
    'text_enrichment': enriched_item('hello', {
      'length_signal': 5,
      'test_signal': {
        'len': 5,
'flen': 5.0
|
455 |
-
}
|
456 |
-
})
|
457 |
-
}, {
|
458 |
-
UUID_COLUMN: '2',
|
459 |
-
'text_enrichment': enriched_item('everybody', {
|
460 |
-
'length_signal': 9,
|
461 |
-
'test_signal': {
|
462 |
-
'len': 9,
|
463 |
-
'flen': 9.0
|
464 |
-
}
|
465 |
-
})
|
466 |
-
}]
|
467 |
-
|
468 |
-
|
469 |
-
def test_merge_array_values(make_test_data: TestDataMaker) -> None:
|
470 |
-
dataset = make_test_data([{
|
471 |
-
UUID_COLUMN: '1',
|
472 |
-
'texts': ['hello', 'everybody']
|
473 |
-
}, {
|
474 |
-
UUID_COLUMN: '2',
|
475 |
-
'texts': ['a', 'bc', 'def']
|
476 |
-
}])
|
477 |
-
|
478 |
-
test_signal = TestSignal()
|
479 |
-
dataset.compute_signal(test_signal, ('texts', '*'))
|
480 |
-
length_signal = LengthSignal()
|
481 |
-
dataset.compute_signal(length_signal, ('texts', '*'))
|
482 |
-
|
483 |
-
assert dataset.manifest() == DatasetManifest(
|
484 |
-
namespace=TEST_NAMESPACE,
|
485 |
-
dataset_name=TEST_DATASET_NAME,
|
486 |
-
data_schema=schema({
|
487 |
-
UUID_COLUMN: 'string',
|
488 |
-
'texts': [
|
489 |
-
field(
|
490 |
-
'string',
|
491 |
-
fields={
|
492 |
-
'length_signal': field('int32', length_signal.dict()),
|
493 |
-
'test_signal': field(
|
494 |
-
signal=test_signal.dict(), fields={
|
495 |
-
'len': 'int32',
|
496 |
-
'flen': 'float32'
|
497 |
-
})
|
498 |
-
})
|
499 |
-
],
|
500 |
-
}),
|
501 |
-
num_items=2)
|
502 |
-
|
503 |
-
result = dataset.select_rows(['texts'])
|
504 |
-
assert list(result) == [{
|
505 |
-
UUID_COLUMN: '1',
|
506 |
-
'texts': [
|
507 |
-
enriched_item('hello', {
|
508 |
-
'length_signal': 5,
|
509 |
-
'test_signal': {
|
510 |
-
'len': 5,
|
511 |
-
'flen': 5.0
|
512 |
-
}
|
513 |
-
}),
|
514 |
-
enriched_item('everybody', {
|
515 |
-
'length_signal': 9,
|
516 |
-
'test_signal': {
|
517 |
-
'len': 9,
|
518 |
-
'flen': 9.0
|
519 |
-
}
|
520 |
-
})
|
521 |
-
],
|
522 |
-
}, {
|
523 |
-
UUID_COLUMN: '2',
|
524 |
-
'texts': [
|
525 |
-
enriched_item('a', {
|
526 |
-
'length_signal': 1,
|
527 |
-
'test_signal': {
|
528 |
-
'len': 1,
|
529 |
-
'flen': 1.0
|
530 |
-
}
|
531 |
-
}),
|
532 |
-
enriched_item('bc', {
|
533 |
-
'length_signal': 2,
|
534 |
-
'test_signal': {
|
535 |
-
'len': 2,
|
536 |
-
'flen': 2.0
|
537 |
-
}
|
538 |
-
}),
|
539 |
-
enriched_item('def', {
|
540 |
-
'length_signal': 3,
|
541 |
-
'test_signal': {
|
542 |
-
'len': 3,
|
543 |
-
'flen': 3.0
|
544 |
-
}
|
545 |
-
})
|
546 |
-
],
|
547 |
-
}]
|
548 |
-
|
549 |
-
# Test subselection.
|
550 |
-
result = dataset.select_rows(
|
551 |
-
[val(('texts', '*')), ('texts', '*', 'length_signal'), ('texts', '*', 'test_signal', 'flen')])
|
552 |
-
assert list(result) == [{
|
553 |
-
UUID_COLUMN: '1',
|
554 |
-
f'texts.*.{VALUE_KEY}': ['hello', 'everybody'],
|
555 |
-
'texts.*.test_signal.flen': [5.0, 9.0],
|
556 |
-
'texts.*.length_signal': [5, 9]
|
557 |
-
}, {
|
558 |
-
UUID_COLUMN: '2',
|
559 |
-
f'texts.*.{VALUE_KEY}': ['a', 'bc', 'def'],
|
560 |
-
'texts.*.test_signal.flen': [1.0, 2.0, 3.0],
|
561 |
-
'texts.*.length_signal': [1, 2, 3]
|
562 |
-
}]
|
563 |
-
|
564 |
-
|
565 |
-
def test_combining_columns(make_test_data: TestDataMaker) -> None:
|
566 |
-
dataset = make_test_data([{
|
567 |
-
UUID_COLUMN: '1',
|
568 |
-
'text': 'hello',
|
569 |
-
'extra': {
|
570 |
-
'text': {
|
571 |
-
'length_signal': 5,
|
572 |
-
'test_signal': {
|
573 |
-
'len': 5,
|
574 |
-
'flen': 5.0
|
575 |
-
}
|
576 |
-
}
|
577 |
-
}
|
578 |
-
}, {
|
579 |
-
UUID_COLUMN: '2',
|
580 |
-
'text': 'everybody',
|
581 |
-
'extra': {
|
582 |
-
'text': {
|
583 |
-
'length_signal': 9,
|
584 |
-
'test_signal': {
|
585 |
-
'len': 9,
|
586 |
-
'flen': 9.0
|
587 |
-
}
|
588 |
-
}
|
589 |
-
}
|
590 |
-
}])
|
591 |
-
|
592 |
-
# Sub-select text and test_signal.
|
593 |
-
result = dataset.select_rows(['text', ('extra', 'text', 'test_signal')], combine_columns=True)
|
594 |
-
assert list(result) == [{
|
595 |
-
UUID_COLUMN: '1',
|
596 |
-
'text': 'hello',
|
597 |
-
'extra': {
|
598 |
-
'text': {
|
599 |
-
'test_signal': {
|
600 |
-
'len': 5,
|
601 |
-
'flen': 5.0
|
602 |
-
}
|
603 |
-
}
|
604 |
-
}
|
605 |
-
}, {
|
606 |
-
UUID_COLUMN: '2',
|
607 |
-
'text': 'everybody',
|
608 |
-
'extra': {
|
609 |
-
'text': {
|
610 |
-
'test_signal': {
|
611 |
-
'len': 9,
|
612 |
-
'flen': 9.0
|
613 |
-
}
|
614 |
-
}
|
615 |
-
}
|
616 |
-
}]
|
617 |
-
|
618 |
-
# Sub-select text and length_signal.
|
619 |
-
result = dataset.select_rows(['text', ('extra', 'text', 'length_signal')], combine_columns=True)
|
620 |
-
assert list(result) == [{
|
621 |
-
UUID_COLUMN: '1',
|
622 |
-
'text': 'hello',
|
623 |
-
'extra': {
|
624 |
-
'text': {
|
625 |
-
'length_signal': 5
|
626 |
-
}
|
627 |
-
}
|
628 |
-
}, {
|
629 |
-
UUID_COLUMN: '2',
|
630 |
-
'text': 'everybody',
|
631 |
-
'extra': {
|
632 |
-
'text': {
|
633 |
-
'length_signal': 9
|
634 |
-
}
|
635 |
-
}
|
636 |
-
}]
|
637 |
-
|
638 |
-
# Sub-select length_signal only.
|
639 |
-
result = dataset.select_rows([('extra', 'text', 'length_signal')], combine_columns=True)
|
640 |
-
assert list(result) == [{
|
641 |
-
UUID_COLUMN: '1',
|
642 |
-
'extra': {
|
643 |
-
'text': {
|
644 |
-
'length_signal': 5
|
645 |
-
}
|
646 |
-
}
|
647 |
-
}, {
|
648 |
-
UUID_COLUMN: '2',
|
649 |
-
'extra': {
|
650 |
-
'text': {
|
651 |
-
'length_signal': 9
|
652 |
-
}
|
653 |
-
}
|
654 |
-
}]
|
655 |
-
|
656 |
-
# Aliases are ignored when combing columns.
|
657 |
-
len_col = Column(('extra', 'text', 'length_signal'), alias='hello')
|
658 |
-
result = dataset.select_rows([len_col], combine_columns=True)
|
659 |
-
assert list(result) == [{
|
660 |
-
UUID_COLUMN: '1',
|
661 |
-
'extra': {
|
662 |
-
'text': {
|
663 |
-
'length_signal': 5
|
664 |
-
}
|
665 |
-
}
|
666 |
-
}, {
|
667 |
-
UUID_COLUMN: '2',
|
668 |
-
'extra': {
|
669 |
-
'text': {
|
670 |
-
'length_signal': 9
|
671 |
-
}
|
672 |
-
}
|
673 |
-
}]
|
674 |
-
|
675 |
-
# Works with UDFs and aliases are ignored.
|
676 |
-
udf_col = Column('text', alias='ignored', signal_udf=LengthSignal())
|
677 |
-
result = dataset.select_rows(['text', udf_col], combine_columns=True)
|
678 |
-
assert list(result) == [{
|
679 |
-
UUID_COLUMN: '1',
|
680 |
-
'text': enriched_item('hello', {'length_signal': 5})
|
681 |
-
}, {
|
682 |
-
UUID_COLUMN: '2',
|
683 |
-
'text': enriched_item('everybody', {'length_signal': 9})
|
684 |
-
}]
|
685 |
-
|
686 |
-
|
687 |
-
def test_source_joined_with_named_signal(make_test_data: TestDataMaker) -> None:
|
688 |
-
dataset = make_test_data(SIMPLE_ITEMS)
|
689 |
-
assert dataset.manifest() == DatasetManifest(
|
690 |
-
namespace=TEST_NAMESPACE,
|
691 |
-
dataset_name=TEST_DATASET_NAME,
|
692 |
-
data_schema=schema({
|
693 |
-
UUID_COLUMN: 'string',
|
694 |
-
'str': 'string',
|
695 |
-
'int': 'int32',
|
696 |
-
'bool': 'boolean',
|
697 |
-
'float': 'float32',
|
698 |
-
}),
|
699 |
-
num_items=3)
|
700 |
-
|
701 |
-
test_signal = TestSignal()
|
702 |
-
dataset.compute_signal(test_signal, 'str')
|
703 |
-
|
704 |
-
# Check the enriched dataset manifest has 'text' enriched.
|
705 |
-
assert dataset.manifest() == DatasetManifest(
|
706 |
-
namespace=TEST_NAMESPACE,
|
707 |
-
dataset_name=TEST_DATASET_NAME,
|
708 |
-
data_schema=schema({
|
709 |
-
UUID_COLUMN: 'string',
|
710 |
-
'str': field(
|
711 |
-
'string',
|
712 |
-
fields={
|
713 |
-
'test_signal': field(
|
714 |
-
signal=test_signal.dict(), fields={
|
715 |
-
'len': 'int32',
|
716 |
-
'flen': 'float32'
|
717 |
-
})
|
718 |
-
}),
|
719 |
-
'int': 'int32',
|
720 |
-
'bool': 'boolean',
|
721 |
-
'float': 'float32',
|
722 |
-
}),
|
723 |
-
num_items=3)
|
724 |
-
|
725 |
-
# Select both columns, without val() on str.
|
726 |
-
result = dataset.select_rows(['str', Column(('str', 'test_signal'), alias='test_signal_on_str')])
|
727 |
-
|
728 |
-
assert list(result) == [{
|
729 |
-
UUID_COLUMN: '1',
|
730 |
-
'str': enriched_item('a', {'test_signal': {
|
731 |
-
'len': 1,
|
732 |
-
'flen': 1.0
|
733 |
-
}}),
|
734 |
-
'test_signal_on_str': {
|
735 |
-
'len': 1,
|
736 |
-
'flen': 1.0
|
737 |
-
}
|
738 |
-
}, {
|
739 |
-
UUID_COLUMN: '2',
|
740 |
-
'str': enriched_item('b', {'test_signal': {
|
741 |
-
'len': 1,
|
742 |
-
'flen': 1.0
|
743 |
-
}}),
|
744 |
-
'test_signal_on_str': {
|
745 |
-
'len': 1,
|
746 |
-
'flen': 1.0
|
747 |
-
}
|
748 |
-
}, {
|
749 |
-
UUID_COLUMN: '3',
|
750 |
-
'str': enriched_item('b', {'test_signal': {
|
751 |
-
'len': 1,
|
752 |
-
'flen': 1.0
|
753 |
-
}}),
|
754 |
-
'test_signal_on_str': {
|
755 |
-
'len': 1,
|
756 |
-
'flen': 1.0
|
757 |
-
}
|
758 |
-
}]
|
759 |
-
|
760 |
-
# Select both columns, with val() on str.
|
761 |
-
result = dataset.select_rows(
|
762 |
-
[val('str'), Column(('str', 'test_signal'), alias='test_signal_on_str')])
|
763 |
-
|
764 |
-
assert list(result) == [{
|
765 |
-
UUID_COLUMN: '1',
|
766 |
-
f'str.{VALUE_KEY}': 'a',
|
767 |
-
'test_signal_on_str': {
|
768 |
-
'len': 1,
|
769 |
-
'flen': 1.0
|
770 |
-
}
|
771 |
-
}, {
|
772 |
-
UUID_COLUMN: '2',
|
773 |
-
f'str.{VALUE_KEY}': 'b',
|
774 |
-
'test_signal_on_str': {
|
775 |
-
'len': 1,
|
776 |
-
'flen': 1.0
|
777 |
-
}
|
778 |
-
}, {
|
779 |
-
UUID_COLUMN: '3',
|
780 |
-
f'str.{VALUE_KEY}': 'b',
|
781 |
-
'test_signal_on_str': {
|
782 |
-
'len': 1,
|
783 |
-
'flen': 1.0
|
784 |
-
}
|
785 |
-
}]
|
786 |
-
|
787 |
-
|
788 |
-
def test_invalid_column_paths(make_test_data: TestDataMaker) -> None:
|
789 |
-
dataset = make_test_data([{
|
790 |
-
UUID_COLUMN: '1',
|
791 |
-
'text': enriched_item('hello', {'test_signal': {
|
792 |
-
'len': 5
|
793 |
-
}}),
|
794 |
-
'text2': [
|
795 |
-
enriched_item('hello', {'test_signal': {
|
796 |
-
'len': 5
|
797 |
-
}}),
|
798 |
-
enriched_item('hi', {'test_signal': {
|
799 |
-
'len': 2
|
800 |
-
}})
|
801 |
-
],
|
802 |
-
}])
|
803 |
-
|
804 |
-
with pytest.raises(ValueError, match='Path part "invalid" not found in the dataset'):
|
805 |
-
dataset.select_rows([('text', 'test_signal', 'invalid')])
|
806 |
-
|
807 |
-
with pytest.raises(ValueError, match='Selecting a specific index of a repeated field'):
|
808 |
-
dataset.select_rows([('text2', '4', 'test_signal')])
|
809 |
-
|
810 |
-
|
811 |
-
def test_signal_with_quote(make_test_data: TestDataMaker) -> None:
|
812 |
-
dataset = make_test_data([{
|
813 |
-
UUID_COLUMN: '1',
|
814 |
-
'text': 'hello',
|
815 |
-
}, {
|
816 |
-
UUID_COLUMN: '2',
|
817 |
-
'text': 'world',
|
818 |
-
}])
|
819 |
-
dataset.compute_signal(SignalWithQuoteInIt(), 'text')
|
820 |
-
dataset.compute_signal(SignalWithDoubleQuoteInIt(), 'text')
|
821 |
-
result = dataset.select_rows(['text'])
|
822 |
-
assert list(result) == [{
|
823 |
-
UUID_COLUMN: '1',
|
824 |
-
'text': enriched_item('hello', {
|
825 |
-
"test'signal": True,
|
826 |
-
'test"signal': True
|
827 |
-
})
|
828 |
-
}, {
|
829 |
-
UUID_COLUMN: '2',
|
830 |
-
'text': enriched_item('world', {
|
831 |
-
"test'signal": True,
|
832 |
-
'test"signal': True
|
833 |
-
}),
|
834 |
-
}]
|
835 |
-
|
836 |
-
|
837 |
-
class SignalWithQuoteInIt(TextSignal):
|
838 |
-
name = "test'signal"
|
839 |
-
|
840 |
-
@override
|
841 |
-
def fields(self) -> Field:
|
842 |
-
return field('boolean')
|
843 |
-
|
844 |
-
@override
|
845 |
-
def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
|
846 |
-
for d in data:
|
847 |
-
yield True
|
848 |
-
|
849 |
-
|
850 |
-
class SignalWithDoubleQuoteInIt(TextSignal):
|
851 |
-
name = 'test"signal'
|
852 |
-
|
853 |
-
@override
|
854 |
-
def fields(self) -> Field:
|
855 |
-
return field('boolean')
|
856 |
-
|
857 |
-
@override
|
858 |
-
def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
|
859 |
-
for d in data:
|
860 |
-
yield True
|
src/data/dataset_utils.py CHANGED
@@ -73,7 +73,7 @@ def lilac_embedding(start: int, end: int, embedding: Optional[np.ndarray]) -> It
 Tflatten = TypeVar('Tflatten', object, np.ndarray)
 
 
-def _flatten(input: Union[Iterable, object], is_primitive_predicate: Callable[[object],
+def _flatten(input: Union[Iterator, object], is_primitive_predicate: Callable[[object],
                                                                               bool]) -> Generator:
   """Flattens a nested iterable."""
   if is_primitive_predicate(input):
@@ -83,13 +83,13 @@ def _flatten(input: Union[Iterable, object], is_primitive_predicate: Callable[[o
   elif is_primitive(input):
     yield input
   else:
-    for elem in cast(Iterable, input):
+    for elem in cast(Iterator, input):
       yield from _flatten(elem, is_primitive_predicate)
 
 
-def flatten(input: Union[Iterable, Tflatten],
-            is_primitive_predicate: Callable[[object], bool] = is_primitive) -> Iterable[Tflatten]:
-  """Flattens a nested iterable.
+def flatten(input: Union[Iterator, Iterable, Tflatten],
+            is_primitive_predicate: Callable[[object], bool] = is_primitive) -> Iterator[Tflatten]:
+  """Flattens a nested iterator.
 
   Primitives and dictionaries are not flattened. The user can also provide a predicate to determine
   what is a primitive.
@@ -97,7 +97,7 @@ def flatten(input: Union[Iterable, Tflatten],
   return _flatten(input, is_primitive_predicate)
 
 
-def count_primitives(input: Iterable) -> int:
+def count_primitives(input: Union[Iterable, Iterator]) -> int:
   """Iterate through each element of the input, flattening each one, computing a count.
 
   Sum the final set of counts. This is the important iterable not to exhaust.
@@ -128,7 +128,8 @@ def _unflatten(flat_input: Iterator[list[object]],
   return [_unflatten(flat_input, orig_elem) for orig_elem in values]
 
 
-def unflatten(flat_input: Iterable, original_input: Union[Iterable, object]) -> list:
+def unflatten(flat_input: Union[Iterable, Iterator], original_input: Union[Iterable,
+                                                                           object]) -> list:
   """Unflattens a flattened iterable according to the original iterable's structure."""
   return cast(list, _unflatten(iter(flat_input), original_input))
 
@@ -234,23 +235,27 @@ def write_item_embeddings_to_disk(keys: Iterable[str], embeddings: Iterable[obje
     return isinstance(input, np.ndarray)
 
   flat_keys = flatten_keys(keys, embeddings, is_primitive_predicate=embedding_predicate)
+  flat_embeddings = flatten(embeddings, is_primitive_predicate=embedding_predicate)
+
   embedding_vectors: list[np.ndarray] = []
-  for lilac_embedding in flatten(embeddings, is_primitive_predicate=embedding_predicate):
+  embedding_keys: list[VectorKey] = []
+  for key, lilac_embedding in zip(flat_keys, flat_embeddings):
+    if not key or not lilac_embedding or EMBEDDING_KEY not in lilac_embedding:
+      # Sparse embeddings may not have an embedding for every key.
+      continue
+
     # We use squeeze here because embedding functions can return outer dimensions of 1.
-    embedding_vector = lilac_embedding[EMBEDDING_KEY].squeeze()
-    if embedding_vector.ndim != 1:
-      raise ValueError(f'Expected embeddings to be 1-dimensional, got {embedding_vector.ndim} '
-                       f'with shape {embedding_vector.shape}.')
-    embedding_vectors.append(embedding_vector)
+    embedding_vectors.append(lilac_embedding[EMBEDDING_KEY].reshape(-1))
+    embedding_keys.append(key)
 
-  embedding_matrix = np.array(embedding_vectors)
+  embedding_vectors = np.array(embedding_vectors)
 
   # Write the embedding index and the ordered UUID column to disk so they can be joined later.
 
   with open_file(output_path_prefix + _EMBEDDINGS_SUFFIX, 'wb') as f:
-    np.save(f, embedding_matrix, allow_pickle=False)
+    np.save(f, embedding_vectors, allow_pickle=False)
   with open_file(output_path_prefix + _KEYS_SUFFIX, 'wb') as f:
-    pickle.dump(flat_keys, f)
+    pickle.dump(embedding_keys, f)
 
   return output_path_prefix
 
@@ -314,34 +319,63 @@ def parquet_filename(prefix: str, shard_index: int, num_shards: int) -> str:
 
 
 def _flatten_keys(uuid: str, nested_input: Iterable, location: list[int],
-                  is_primitive_predicate: Callable[[object], bool]) -> list[VectorKey]:
-  if is_primitive_predicate(nested_input) or is_primitive(nested_input):
-    return [(uuid, *location)]
-
-  result: list[VectorKey] = []
-  if isinstance(nested_input, dict):
-    for value in nested_input.values():
-      result.extend(_flatten_keys(uuid, value, location, is_primitive_predicate))
-  else:
-    for i, input in enumerate(nested_input):
-      result.extend(_flatten_keys(uuid, input, [*location, i], is_primitive_predicate))
-  return result
+                  is_primitive_predicate: Callable[[object], bool]) -> Iterator[VectorKey]:
+  if is_primitive_predicate(nested_input) or is_primitive(nested_input) or isinstance(
+      nested_input, dict):
+    yield (uuid, *location)
+    return
+
+  for i, input in enumerate(nested_input):
+    yield from _flatten_keys(uuid, input, [*location, i], is_primitive_predicate)
 
 
 def flatten_keys(
     uuids: Iterable[str],
     nested_input: Iterable,
-    is_primitive_predicate: Callable[[object], bool] = is_primitive) -> list[VectorKey]:
+    is_primitive_predicate: Callable[[object],
                                     bool] = is_primitive) -> Iterator[Optional[VectorKey]]:
   """Flatten the uuid keys of a nested input."""
-  result: list[VectorKey] = []
   for uuid, input in zip(uuids, nested_input):
-    result.extend(_flatten_keys(uuid, input, [], is_primitive_predicate))
-  return result
+    if input is None:
+      yield None
+      continue
+    yield from _flatten_keys(uuid, input, [], is_primitive_predicate)
 
 
 def embedding_index_filename_prefix(output_dir: str, shard_index: int, num_shards: int) -> str:
   """Return the filename prefix for the embedding index."""
   npy_filename = f'embeddings-{shard_index:05d}-of-{num_shards:05d}'
   return os.path.join(output_dir, npy_filename)
+
+
+Tin = TypeVar('Tin')
+Tout = TypeVar('Tout')
+
+
+def sparse_to_dense_compute(
+    sparse_input: Iterator[Optional[Tin]],
+    func: Callable[[Iterable[Tin]], Iterable[Tout]]) -> Iterator[Optional[Tout]]:
+  """Densifies the input before calling the provided `func` and sparsifies the output."""
+  empty_mask: list[bool] = []
+
+  def densify(x: Iterator[Optional[Tin]]) -> Iterator[Tin]:
+    nonlocal empty_mask
+    for i, value in enumerate(x):
+      empty_mask.append(value is None)
+      if value is not None:
+        yield value
+
+  dense_input = densify(sparse_input)
+  dense_output = iter(func(dense_input))
+  index = 0
+
+  while True:
+    try:
+      out = next(dense_output)
+      yield (None if empty_mask[index] else out)
+      index += 1
+    except StopIteration:
+      while index < len(empty_mask):
+        yield None
+        index += 1
      return
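
The new `sparse_to_dense_compute` helper is what lets downstream compute functions skip missing rows without ever seeing a `None`. A minimal sketch of the round trip, assuming the helper behaves as in the hunk above; `upper_all` is a hypothetical compute function used only for illustration:

from typing import Iterable, Iterator

def upper_all(values: Iterable[str]) -> Iterator[str]:
  # Stand-in compute function: it only ever receives dense (non-None) values.
  for value in values:
    yield value.upper()

# The trailing sparse rows never reach `upper_all` and come back as None,
# so the output stays aligned one-to-one with the input.
sparse = iter(['hello', 'world', None, None])
assert list(sparse_to_dense_compute(sparse, upper_all)) == ['HELLO', 'WORLD', None, None]
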
src/data/dataset_utils_test.py DELETED
@@ -1,114 +0,0 @@
-"""Tests for dataset utils."""
-from ..schema import PathTuple
-from .dataset_utils import count_primitives, flatten, unflatten, wrap_in_dicts
-
-
-def test_flatten() -> None:
-  a = [[1, 2], [[3]], [4, 5, 5]]
-  result = list(flatten(a))
-  assert result == [1, 2, 3, 4, 5, 5]
-
-
-def test_flatten_primitive() -> None:
-  result = list(flatten('hello'))
-  assert result == ['hello']
-
-
-def test_unflatten() -> None:
-  a = [[1, 2], [[3]], [4, 5, 5]]
-  flat_a = list(flatten(a))
-  result = unflatten(flat_a, a)
-  assert result == [[1, 2], [[3]], [4, 5, 5]]
-
-
-def test_count_nested() -> None:
-  a = [[1, 2], [[3]], [4, 5, 6]]
-  assert 6 == count_primitives(a)
-
-
-def test_wrap_in_dicts_with_spec_of_one_repeated() -> None:
-  a = [[1, 2], [3], [4, 5, 5]]
-  spec: list[PathTuple] = [('a', 'b', 'c'), ('d',)]  # Corresponds to a.b.c.*.d.
-  result = wrap_in_dicts(a, spec)
-  assert result == [{
-    'a': {
-      'b': {
-        'c': [{
-          'd': 1
-        }, {
-          'd': 2
-        }]
-      }
-    }
-  }, {
-    'a': {
-      'b': {
-        'c': [{
-          'd': 3
-        }]
-      }
-    }
-  }, {
-    'a': {
-      'b': {
-        'c': [{
-          'd': 4
-        }, {
-          'd': 5
-        }, {
-          'd': 5
-        }]
-      }
-    }
-  }]
-
-
-def test_wrap_in_dicts_with_spec_of_double_repeated() -> None:
-  a = [[[1, 2], [3, 4, 5]], [[6]], [[7], [8], [9, 10]]]
-  spec: list[PathTuple] = [('a', 'b'), tuple(), ('c',)]  # Corresponds to a.b.*.*.c.
-  result = wrap_in_dicts(a, spec)
-  assert result == [{
-    'a': {
-      'b': [[{
-        'c': 1
-      }, {
-        'c': 2
-      }], [{
-        'c': 3
-      }, {
-        'c': 4
-      }, {
-        'c': 5
-      }]]
-    }
-  }, {
-    'a': {
-      'b': [[{
-        'c': 6
-      }]]
-    }
-  }, {
-    'a': {
-      'b': [[{
-        'c': 7
-      }], [{
-        'c': 8
-      }], [{
-        'c': 9
-      }, {
-        'c': 10
-      }]]
-    }
-  }]
-
-
-def test_unflatten_primitive() -> None:
-  original = 'hello'
-  result = unflatten(['hello'], original)
-  assert result == 'hello'
-
-
-def test_unflatten_primitive_list() -> None:
-  original = ['hello', 'world']
-  result = unflatten(['hello', 'world'], original)
-  assert result == ['hello', 'world']
src/data/sources/csv_source_test.py DELETED
@@ -1,42 +0,0 @@
-"""Tests for the CSV source."""
-import csv
-import os
-import pathlib
-
-from ...schema import schema
-from .csv_source import LINE_NUMBER_COLUMN, CSVDataset
-from .source import SourceSchema
-
-
-def test_csv(tmp_path: pathlib.Path) -> None:
-  csv_rows = [{'x': 1, 'y': 'ten'}, {'x': 2, 'y': 'twenty'}]
-
-  filename = 'test-dataset.csv'
-  filepath = os.path.join(tmp_path, filename)
-  with open(filepath, 'w') as f:
-    writer = csv.DictWriter(f, fieldnames=list(csv_rows[0].keys()))
-    writer.writeheader()
-    writer.writerows(csv_rows)
-
-  source = CSVDataset(filepaths=[filepath])
-  source.setup()
-
-  source_schema = source.source_schema()
-  assert source_schema == SourceSchema(
-    fields=schema({
-      LINE_NUMBER_COLUMN: 'int64',
-      'x': 'int64',
-      'y': 'string'
-    }).fields, num_items=2)
-
-  items = list(source.process())
-
-  assert items == [{
-    LINE_NUMBER_COLUMN: 0,
-    'x': 1,
-    'y': 'ten'
-  }, {
-    LINE_NUMBER_COLUMN: 1,
-    'x': 2,
-    'y': 'twenty'
-  }]
src/data/sources/huggingface_source_test.py DELETED
@@ -1,170 +0,0 @@
-"""Tests for the pandas source."""
-import os
-import pathlib
-
-# mypy: disable-error-code="attr-defined"
-from datasets import Dataset, Features, Sequence, Value
-
-from ...schema import schema
-from .huggingface_source import HF_SPLIT_COLUMN, HuggingFaceDataset
-from .source import SourceSchema
-
-
-def test_hf(tmp_path: pathlib.Path) -> None:
-  dataset = Dataset.from_list([{'x': 1, 'y': 'ten'}, {'x': 2, 'y': 'twenty'}])
-
-  dataset_name = os.path.join(tmp_path, 'hf-test-dataset')
-  dataset.save_to_disk(dataset_name)
-
-  source = HuggingFaceDataset(dataset_name=dataset_name, load_from_disk=True)
-
-  items = source.process()
-  source.setup()
-
-  source_schema = source.source_schema()
-  assert source_schema == SourceSchema(
-    fields=schema({
-      HF_SPLIT_COLUMN: 'string',
-      'x': 'int64',
-      'y': 'string'
-    }).fields, num_items=2)
-
-  items = list(source.process())
-
-  assert items == [{
-    HF_SPLIT_COLUMN: 'default',
-    'x': 1,
-    'y': 'ten'
-  }, {
-    HF_SPLIT_COLUMN: 'default',
-    'x': 2,
-    'y': 'twenty'
-  }]
-
-
-def test_hf_sequence(tmp_path: pathlib.Path) -> None:
-  dataset = Dataset.from_list([{
-    'scalar': 1,
-    'seq': [1, 0],
-    'seq_dict': {
-      'x': [1, 2, 3],
-      'y': ['four', 'five', 'six']
-    }
-  }, {
-    'scalar': 2,
-    'seq': [2, 0],
-    'seq_dict': {
-      'x': [10, 20, 30],
-      'y': ['forty', 'fifty', 'sixty']
-    }
-  }],
-    features=Features({
-      'scalar': Value(dtype='int64'),
-      'seq': Sequence(feature=Value(dtype='int64')),
-      'seq_dict': Sequence(feature={
-        'x': Value(dtype='int64'),
-        'y': Value(dtype='string')
-      })
-    }))
-
-  dataset_name = os.path.join(tmp_path, 'hf-test-dataset')
-  dataset.save_to_disk(dataset_name)
-
-  source = HuggingFaceDataset(dataset_name=dataset_name, load_from_disk=True)
-
-  items = source.process()
-  source.setup()
-
-  source_schema = source.source_schema()
-  assert source_schema == SourceSchema(
-    fields=schema({
-      HF_SPLIT_COLUMN: 'string',
-      'scalar': 'int64',
-      'seq': ['int64'],
-      'seq_dict': {
-        'x': ['int64'],
-        'y': ['string'],
-      },
-    }).fields,
-    num_items=2)
-
-  items = list(source.process())
-
-  assert items == [{
-    HF_SPLIT_COLUMN: 'default',
-    'scalar': 1,
-    'seq': [1, 0],
-    'seq_dict': {
-      'x': [1, 2, 3],
-      'y': ['four', 'five', 'six']
-    }
-  }, {
-    HF_SPLIT_COLUMN: 'default',
-    'scalar': 2,
-    'seq': [2, 0],
-    'seq_dict': {
-      'x': [10, 20, 30],
-      'y': ['forty', 'fifty', 'sixty']
-    }
-  }]
-
-
-def test_hf_list(tmp_path: pathlib.Path) -> None:
-  dataset = Dataset.from_list([{
-    'scalar': 1,
-    'list': [{
-      'x': 1,
-      'y': 'two'
-    }]
-  }, {
-    'scalar': 2,
-    'list': [{
-      'x': 3,
-      'y': 'four'
-    }]
-  }],
-    features=Features({
-      'scalar': Value(dtype='int64'),
-      'list': [{
-        'x': Value(dtype='int64'),
-        'y': Value(dtype='string')
-      }]
-    }))
-
-  dataset_name = os.path.join(tmp_path, 'hf-test-dataset')
-  dataset.save_to_disk(dataset_name)
-
-  source = HuggingFaceDataset(dataset_name=dataset_name, load_from_disk=True)
-
-  items = source.process()
-  source.setup()
-
-  source_schema = source.source_schema()
-  assert source_schema == SourceSchema(
-    fields=schema({
-      HF_SPLIT_COLUMN: 'string',
-      'scalar': 'int64',
-      'list': [{
-        'x': 'int64',
-        'y': 'string',
-      }],
-    }).fields,
-    num_items=2)
-
-  items = list(source.process())
-
-  assert items == [{
-    HF_SPLIT_COLUMN: 'default',
-    'scalar': 1,
-    'list': [{
-      'x': 1,
-      'y': 'two'
-    }]
-  }, {
-    HF_SPLIT_COLUMN: 'default',
-    'scalar': 2,
-    'list': [{
-      'x': 3,
-      'y': 'four'
-    }]
-  }]
src/data/sources/json_source_test.py DELETED
@@ -1,74 +0,0 @@
-"""Tests for the JSON source."""
-import json
-import os
-import pathlib
-
-from ...schema import schema
-from .json_source import ROW_ID_COLUMN, JSONDataset
-from .source import SourceSchema
-
-
-def test_simple_json(tmp_path: pathlib.Path) -> None:
-  json_records = [{'x': 1, 'y': 'ten'}, {'x': 2, 'y': 'twenty'}]
-
-  filename = 'test-dataset.jsonl'
-  filepath = os.path.join(tmp_path, filename)
-  with open(filepath, 'w') as f:
-    f.write(json.dumps(json_records))
-
-  source = JSONDataset(filepaths=[filepath])
-  source.setup()
-
-  source_schema = source.source_schema()
-  assert source_schema == SourceSchema(
-    fields=schema({
-      ROW_ID_COLUMN: 'int64',
-      'x': 'int64',
-      'y': 'string'
-    }).fields, num_items=2)
-
-  items = list(source.process())
-
-  assert items == [{
-    ROW_ID_COLUMN: 0,
-    'x': 1,
-    'y': 'ten'
-  }, {
-    ROW_ID_COLUMN: 1,
-    'x': 2,
-    'y': 'twenty'
-  }]
-
-
-def test_simple_jsonl(tmp_path: pathlib.Path) -> None:
-  json_records = [{'x': 1, 'y': 'ten'}, {'x': 2, 'y': 'twenty'}]
-  json_lines = [json.dumps(record) + '\n' for record in json_records]
-
-  filename = 'test-dataset.jsonl'
-  filepath = os.path.join(tmp_path, filename)
-  with open(filepath, 'w') as f:
-    f.writelines(json_lines)
-
-  source = JSONDataset(dataset_name='test_dataset', filepaths=[filepath])
-  source.setup()
-
-  source_schema = source.source_schema()
-
-  assert source_schema == SourceSchema(
-    fields=schema({
-      ROW_ID_COLUMN: 'int64',
-      'x': 'int64',
-      'y': 'string'
-    }).fields, num_items=2)
-
-  items = list(source.process())
-
-  assert items == [{
-    ROW_ID_COLUMN: 0,
-    'x': 1,
-    'y': 'ten'
-  }, {
-    ROW_ID_COLUMN: 1,
-    'x': 2,
-    'y': 'twenty'
-  }]
src/data/sources/pandas_source_test.py DELETED
@@ -1,91 +0,0 @@
-"""Tests for the pandas source."""
-
-import pandas as pd
-
-from ...schema import schema
-from .pandas_source import PANDAS_INDEX_COLUMN, PandasDataset
-from .source import SourceSchema
-
-
-def test_simple_dataframe() -> None:
-  df = pd.DataFrame.from_records([{
-    'name': 'a',
-    'age': 1
-  }, {
-    'name': 'b',
-    'age': 2
-  }, {
-    'name': 'c',
-    'age': 3
-  }])
-
-  source = PandasDataset(df)
-  source.setup()
-
-  source_schema = source.source_schema()
-  assert source_schema == SourceSchema(
-    fields=schema({
-      PANDAS_INDEX_COLUMN: 'int64',
-      'name': 'string',
-      'age': 'int64'
-    }).fields,
-    num_items=3)
-
-  items = list(source.process())
-
-  assert items == [{
-    PANDAS_INDEX_COLUMN: 0,
-    'name': 'a',
-    'age': 1
-  }, {
-    PANDAS_INDEX_COLUMN: 1,
-    'name': 'b',
-    'age': 2
-  }, {
-    PANDAS_INDEX_COLUMN: 2,
-    'name': 'c',
-    'age': 3
-  }]
-
-
-def test_simple_dataframe_with_index() -> None:
-  df = pd.DataFrame.from_records([{
-    'name': 'a',
-    'age': 1
-  }, {
-    'name': 'b',
-    'age': 2
-  }, {
-    'name': 'c',
-    'age': 3
-  }],
-    index=['id1', 'id2', 'id3'])
-
-  source = PandasDataset(df)
-  source.setup()
-
-  source_schema = source.source_schema()
-  assert source_schema == SourceSchema(
-    fields=schema({
-      PANDAS_INDEX_COLUMN: 'string',
-      'name': 'string',
-      'age': 'int64'
-    }).fields,
-    num_items=3)
-
-  items = list(source.process())
-
-  # The PANDAS_INDEX_COLUMN aligns with the pandas index.
-  assert items == [{
-    PANDAS_INDEX_COLUMN: 'id1',
-    'name': 'a',
-    'age': 1
-  }, {
-    PANDAS_INDEX_COLUMN: 'id2',
-    'name': 'b',
-    'age': 2
-  }, {
-    PANDAS_INDEX_COLUMN: 'id3',
-    'name': 'c',
-    'age': 3
-  }]
src/data/sources/source_registry_test.py DELETED
@@ -1,55 +0,0 @@
-"""A source to compute semantic search for a document."""
-from typing import Iterable, cast
-
-import pytest
-from typing_extensions import override
-
-from ...schema import Item
-from .source import Source, SourceSchema
-from .source_registry import clear_source_registry, get_source_cls, register_source, resolve_source
-
-
-class TestSource(Source):
-  """A test source."""
-  name = 'test_source'
-
-  @override
-  def setup(self) -> None:
-    pass
-
-  @override
-  def source_schema(self) -> SourceSchema:
-    """Return the source schema."""
-    return cast(SourceSchema, None)
-
-  @override
-  def process(self) -> Iterable[Item]:
-    yield None
-
-
-@pytest.fixture(scope='module', autouse=True)
-def setup_teardown() -> Iterable[None]:
-  # Setup.
-  register_source(TestSource)
-
-  # Unit test runs.
-  yield
-
-  # Teardown.
-  clear_source_registry()
-
-
-def test_get_source_cls() -> None:
-  """Test getting a source."""
-  assert TestSource == get_source_cls('test_source')
-
-
-def test_resolve_source() -> None:
-  """Test resolving a source."""
-  test_source = TestSource()
-
-  # sources pass through.
-  assert resolve_source(test_source) == test_source
-
-  # Dicts resolve to the base class.
-  assert resolve_source(test_source.dict()) == test_source
src/data_loader_test.py DELETED
@@ -1,74 +0,0 @@
-"""Tests for data_loader.py."""
-
-import os
-import pathlib
-import uuid
-from typing import Iterable
-
-from pytest_mock import MockerFixture
-from typing_extensions import override
-
-from .data.dataset_duckdb import read_source_manifest
-from .data.dataset_utils import parquet_filename
-from .data.sources.source import Source, SourceSchema
-from .data_loader import process_source
-from .schema import PARQUET_FILENAME_PREFIX, UUID_COLUMN, Item, SourceManifest, schema
-from .test_utils import fake_uuid, read_items
-from .utils import DATASETS_DIR_NAME
-
-
-class TestSource(Source):
-  """A test source."""
-  name = 'test_source'
-
-  @override
-  def setup(self) -> None:
-    pass
-
-  @override
-  def source_schema(self) -> SourceSchema:
-    """Return the source schema."""
-    return SourceSchema(fields=schema({'x': 'int64', 'y': 'string'}).fields, num_items=2)
-
-  @override
-  def process(self) -> Iterable[Item]:
-    return [{'x': 1, 'y': 'ten'}, {'x': 2, 'y': 'twenty'}]
-
-
-def test_data_loader(tmp_path: pathlib.Path, mocker: MockerFixture) -> None:
-  mock_uuid = mocker.patch.object(uuid, 'uuid4', autospec=True)
-  mock_uuid.side_effect = [fake_uuid(b'1'), fake_uuid(b'2')]
-
-  source = TestSource()
-  setup_mock = mocker.spy(TestSource, 'setup')
-
-  output_dir, num_items = process_source(tmp_path, 'test_namespace', 'test_dataset', source)
-
-  assert setup_mock.call_count == 1
-
-  assert output_dir == os.path.join(tmp_path, DATASETS_DIR_NAME, 'test_namespace', 'test_dataset')
-  assert num_items == 2
-
-  source_manifest = read_source_manifest(output_dir)
-
-  assert source_manifest == SourceManifest(
-    files=[parquet_filename(PARQUET_FILENAME_PREFIX, 0, 1)],
-    data_schema=schema({
-      # UUID_COLUMN is generated by the data loader.
-      UUID_COLUMN: 'string',
-      'x': 'int64',
-      'y': 'string'
-    }),
-  )
-
-  items = read_items(output_dir, source_manifest.files, source_manifest.data_schema)
-
-  assert items == [{
-    UUID_COLUMN: fake_uuid(b'1').hex,
-    'x': 1,
-    'y': 'ten'
-  }, {
-    UUID_COLUMN: fake_uuid(b'2').hex,
-    'x': 2,
-    'y': 'twenty'
-  }]
src/embeddings/embedding.py CHANGED
@@ -57,7 +57,7 @@ def compute_split_embeddings(docs: Iterable[str],
   pool = ThreadPoolExecutor()
 
   def _splitter(doc: str) -> list[TextChunk]:
-    if doc is None:
+    if not doc:
       return []
     if split_fn:
       return split_fn(doc)
@@ -65,15 +65,19 @@ def compute_split_embeddings(docs: Iterable[str],
     # Return a single chunk that spans the entire document.
     return [(doc, (0, len(doc)))]
 
+  num_docs = 0
+
   def _flat_split_batch_docs(docs: Iterable[str]) -> Generator[tuple[int, TextChunk], None, None]:
     """Split a batch of documents into chunks and yield them."""
+    nonlocal num_docs
     for i, doc in enumerate(docs):
-      chunks = _splitter(doc)
+      num_docs += 1
+      chunks = _splitter(doc)
       for chunk in chunks:
        yield (i, chunk)
 
   doc_chunks = _flat_split_batch_docs(docs)
-  items_to_yield: list[Item] = []
+  items_to_yield: Optional[list[Item]] = None
   current_index = 0
 
   mega_batch_size = batch_size * num_parallel_requests
@@ -81,19 +85,27 @@ def compute_split_embeddings(docs: Iterable[str],
   for batch in chunks(doc_chunks, mega_batch_size):
     texts = [text for _, (text, _) in batch]
     embeddings: list[np.ndarray] = []
+
     for x in list(pool.map(lambda x: embed_fn(x), chunks(texts, batch_size))):
       embeddings.extend(x)
     matrix = normalize(np.array(embeddings)).astype(np.float16)
     # np.split returns a shallow copy of each embedding so we don't increase the mem footprint.
     embeddings_batch = cast(list[np.ndarray], np.split(matrix, matrix.shape[0]))
     for (index, (_, (start, end))), embedding in zip(batch, embeddings_batch):
+      embedding = embedding.reshape(-1)
       if index == current_index:
+        if items_to_yield is None:
+          items_to_yield = []
         items_to_yield.append(lilac_embedding(start, end, embedding))
       else:
         yield items_to_yield
+        current_index += 1
+        while current_index < index:
+          yield None
+          current_index += 1
         items_to_yield = [lilac_embedding(start, end, embedding)]
-        current_index = index
 
-
-  if items_to_yield:
+  while current_index < num_docs:
     yield items_to_yield
+    items_to_yield = None
+    current_index += 1
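
After this change `compute_split_embeddings` yields exactly one entry per input document: a list of chunk embeddings for documents that produced chunks, and `None` for documents that produced none, so the output stays aligned with the input order. A standalone sketch of the alignment rule the rewritten loop implements; `align_by_doc` and its tagged-chunk input are illustrative stand-ins, not part of the library:

from typing import Iterator, Optional

def align_by_doc(tagged: list[tuple[int, str]], num_docs: int) -> Iterator[Optional[list[str]]]:
  """Group chunk outputs by source-document index, yielding None for docs with no chunks."""
  current = 0
  group: Optional[list[str]] = None
  for index, item in tagged:
    if index == current:
      # Still collecting chunks for the current document.
      group = group or []
      group.append(item)
    else:
      # The batch moved on: flush the finished document, then emit None
      # gaps for every document in between that produced no chunks.
      yield group
      current += 1
      while current < index:
        yield None
        current += 1
      group = [item]
  # Flush the last document and pad out trailing chunkless documents.
  while current < num_docs:
    yield group
    group = None
    current += 1

# Documents 0 and 2 produced chunks; document 1 was empty:
assert list(align_by_doc([(0, 'a'), (0, 'b'), (2, 'c')], num_docs=3)) == [['a', 'b'], None, ['c']]
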