Spaces:
Running
Running
Commit
•
1ca4d24
1
Parent(s):
9e5fdea
Add components for sample image return
Browse files
metadata used for filtering with readme describing it
query file with filter and return code
Include script used to upload sample images to S3
Co-authored-by: Matthew Thompson <thompsonmj@users.noreply.huggingface.co>
- .gitattributes +1 -1
- components/metadata.csv +3 -0
- components/metadata_readme.md +11 -0
- components/query.py +116 -0
- components/sync_samples_to_s3.bash +34 -0
.gitattributes
CHANGED
@@ -33,7 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
-
|
37 |
*.json filter=lfs diff=lfs merge=lfs -text
|
38 |
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
39 |
*.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
36 |
*.json filter=lfs diff=lfs merge=lfs -text
|
37 |
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
38 |
*.png filter=lfs diff=lfs merge=lfs -text
|
39 |
+
components/metadata.csv filter=lfs diff=lfs merge=lfs -text
|
components/metadata.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d8576f6ca106f35387506369a70df01fb92192a740c3b5da2a12ad8303976aad
|
3 |
+
size 233934143
|
components/metadata_readme.md
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Bioclip Demo
|
3 |
+
emoji: π
|
4 |
+
colorFrom: indigo
|
5 |
+
colorTo: purple
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 4.36.1
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
---
|
components/query.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io
|
2 |
+
import boto3
|
3 |
+
import requests
|
4 |
+
import numpy as np
|
5 |
+
import polars as pl
|
6 |
+
from PIL import Image
|
7 |
+
from botocore.config import Config
|
8 |
+
import logging
|
9 |
+
|
10 |
+
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# S3 client used to reach the sample images; the region is pinned through a
# botocore Config object.
my_config = Config(region_name="us-east-1")
s3_client = boto3.client("s3", config=my_config)

# Base URL for EOL pages; an EOL page ID is appended to build a full link.
EOL_URL = "https://eol.org/pages/"

# Taxonomic ranks, ordered from broadest to most specific. Functions in this
# module refer to a rank by its index in this list.
RANKS = [
    "kingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species",
]
|
21 |
+
|
22 |
+
def get_sample(df, pred_taxon, rank):
    '''
    Function to retrieve a sample image of the predicted taxon and EOL page link for more info.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
    pred_taxon : str
        Predicted taxon of the uploaded image.
    rank : int
        Index of rank in RANKS chosen for prediction.

    Returns:
    --------
    img : PIL.Image or None
        Sample image of predicted taxon for display. None when no sample exists
        for the taxon or when any step of the retrieval fails.
    eol_page : str
        HTML snippet linking to the EOL page for the taxon (may be a lower rank,
        e.g., species sample). When img is None this is instead a plain-text
        message explaining why no image is available.
    '''
    logger.info(f"Getting sample for taxon: {pred_taxon} at rank: {rank}")
    try:
        filepath, eol_page_id, full_name, is_exact = get_sample_data(df, pred_taxon, rank)
    except Exception as e:
        logger.error(f"Error retrieving sample data: {e}")
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."
    if filepath is None:
        logger.warning(f"No sample image found for taxon: {pred_taxon}")
        return None, f"Sorry, our EOL images do not include {pred_taxon}."

    # Get sample image of selected individual
    try:
        img_src = s3_client.generate_presigned_url(
            'get_object',
            Params={'Bucket': 'treeoflife-10m-sample-images', 'Key': filepath},
        )
        # Bound the wait so a stalled connection cannot hang the caller forever.
        img_resp = requests.get(img_src, timeout=10)
        # Fail on HTTP errors here; otherwise the error body would be handed to
        # PIL and reported as an unrelated "cannot identify image" failure.
        img_resp.raise_for_status()
        img = Image.open(io.BytesIO(img_resp.content))
        full_eol_url = EOL_URL + eol_page_id
        if is_exact:
            eol_page = f"<p>Check out the EOL entry for {pred_taxon} to learn more: <a href={full_eol_url} target='_blank'>{full_eol_url}</a>.</p>"
        else:
            eol_page = f"<p>Check out an example EOL entry within {pred_taxon} to learn more: {full_name} <a href={full_eol_url} target='_blank'>{full_eol_url}</a>.</p>"
        logger.info(f"Successfully retrieved sample image and EOL page for {pred_taxon}")
        return img, eol_page
    except Exception as e:
        # Best-effort boundary: report the problem to the caller as a message
        # rather than raising.
        logger.error(f"Error retrieving sample image: {e}")
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."
|
70 |
+
|
71 |
+
def get_sample_data(df, pred_taxon, rank):
    '''
    Function to randomly select a sample individual of the given taxon and return
    its image filepath, EOL page ID, and taxonomic name.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
    pred_taxon : str
        Predicted taxon of the uploaded image, as space-separated names ordered as in RANKS.
    rank : int
        Index of rank in RANKS chosen for prediction.

    Returns:
    --------
    filepath : str or None
        Filepath of selected sample image for predicted taxon (None if no row matches).
    eol_page_id : str
        EOL page ID associated with predicted taxon for more information
        (np.nan if no row matches).
    full_name : str
        Full taxonomic name of the selected sample ("" if no row matches).
    is_exact : bool
        Flag indicating if the match is exact (i.e., with empty lower ranks).

    Raises:
    -------
    IndexError
        If pred_taxon has fewer than rank + 1 space-separated names, or rank is
        beyond the last entry of RANKS.
    '''
    # Split once instead of on every loop iteration.
    taxon_names = pred_taxon.split(" ")
    for idx in range(rank + 1):
        df = df.filter(pl.col(RANKS[idx]) == taxon_names[idx])

    if df.shape[0] == 0:
        return None, np.nan, "", False

    # First, try to find entries with empty lower ranks
    exact_df = df
    for lower_rank in RANKS[rank + 1:]:
        exact_df = exact_df.filter((pl.col(lower_rank).is_null()) | (pl.col(lower_rank) == ""))

    is_exact = exact_df.shape[0] > 0
    # If no exact matches, fall back to any entry with the specified rank
    df_filtered = (exact_df if is_exact else df).sample()

    # Exact matches are named down to the requested rank only; fallback samples
    # also include whatever lower ranks they have. Null or empty rank values are
    # skipped: str.join raises TypeError on None, and a sample that is only
    # partially identified below the requested rank has null lower ranks.
    name_ranks = RANKS[:rank + 1] if is_exact else RANKS
    full_name = " ".join(part for part in df_filtered.select(name_ranks).row(0) if part)

    return df_filtered["file_path"][0], df_filtered["eol_page_id"].cast(pl.String)[0], full_name, is_exact
|
components/sync_samples_to_s3.bash
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash

# Sync every immediate subdirectory of <BASE_DIR> to a same-named prefix in the
# sample-image S3 bucket.
#
# The usage notes below sit in a quoted heredoc fed to the no-op ':' builtin,
# so the shell neither expands nor executes them.
: <<'COMMENT'
Usage:
bash sync_samples_to_s3.bash <BASE_DIR>

Dependencies:
- awscli (https://aws.amazon.com/cli/)
Credentials to export as environment variables:
- AWS_ACCESS_KEY_ID
- AWS_SECRET_ACCESS_KEY
COMMENT

# Check if a valid directory is provided as an argument
if [ -z "$1" ]; then
    echo "Usage: $0 <BASE_DIR>"
    exit 1
fi

if [ ! -d "$1" ]; then
    echo "Error: $1 is not a valid directory"
    exit 1
fi

BASE_DIR="$1"
S3_BUCKET="s3://treeoflife-10m-sample-images"

# Records whether any sync failed, so the exit code reflects the whole run
# rather than only the last loop iteration.
status=0

# Loop through all directories and sync them to S3.
# BASE_DIR is quoted so paths containing spaces or glob characters are not
# word-split or re-expanded; the trailing /* is left unquoted so it still globs.
for dir in "$BASE_DIR"/*; do
    if [ -d "$dir" ]; then
        dir_name=$(basename "$dir")
        # Keep going after a failure so one bad directory does not block the rest.
        if ! aws s3 sync "$dir" "$S3_BUCKET/$dir_name/"; then
            echo "Error: failed to sync $dir" >&2
            status=1
        fi
    fi
done

exit "$status"
|