egrace479 thompsonmj committed on
Commit
1ca4d24
•
1 Parent(s): 9e5fdea

Add components for sample image return

Browse files

metadata used for filtering with readme describing it
query file with filter and return code
Include script used to upload sample images to S3

Co-authored-by: Matthew Thompson <thompsonmj@users.noreply.huggingface.co>

.gitattributes CHANGED
@@ -33,7 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
-
37
  *.json filter=lfs diff=lfs merge=lfs -text
38
  *.jpeg filter=lfs diff=lfs merge=lfs -text
39
  *.png filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
36
  *.json filter=lfs diff=lfs merge=lfs -text
37
  *.jpeg filter=lfs diff=lfs merge=lfs -text
38
  *.png filter=lfs diff=lfs merge=lfs -text
39
+ components/metadata.csv filter=lfs diff=lfs merge=lfs -text
components/metadata.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8576f6ca106f35387506369a70df01fb92192a740c3b5da2a12ad8303976aad
3
+ size 233934143
components/metadata_readme.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Bioclip Demo
3
+ emoji: 🐘
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 4.36.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
components/query.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import boto3
3
+ import requests
4
+ import numpy as np
5
+ import polars as pl
6
+ from PIL import Image
7
+ from botocore.config import Config
8
+ import logging
9
+
10
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# S3 client used to fetch sample images; the client is pinned to us-east-1.
my_config = Config(region_name="us-east-1")
s3_client = boto3.client("s3", config=my_config)

# Base URL for EOL pages; an EOL page ID is appended to build the full link.
EOL_URL = "https://eol.org/pages/"

# Rank names used as DataFrame column names; the integer `rank` arguments in
# this module are indices into this list.
RANKS = [
    "kingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species",
]
21
+
22
def get_sample(df, pred_taxon, rank):
    '''
    Retrieve a sample image of the predicted taxon and an HTML blurb linking to an EOL page.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
    pred_taxon : str
        Predicted taxon of the uploaded image.
    rank : int
        Index of rank in RANKS chosen for prediction.

    Returns:
    --------
    img : PIL.Image or None
        Sample image of predicted taxon for display; None when no sample exists
        or when an error occurred while retrieving it.
    eol_page : str
        HTML paragraph linking to the EOL page for the taxon (may be a lower rank,
        e.g., species sample). When img is None this is instead a plain-text
        message describing why no image is available.
    '''
    # Upper bound (in seconds) on the image download so a stalled connection
    # cannot hang the caller indefinitely.
    request_timeout = 10

    logger.info("Getting sample for taxon: %s at rank: %s", pred_taxon, rank)
    try:
        filepath, eol_page_id, full_name, is_exact = get_sample_data(df, pred_taxon, rank)
    except Exception as e:
        # Boundary handler: report the problem to the user instead of crashing.
        logger.exception("Error retrieving sample data: %s", e)
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."
    if filepath is None:
        logger.warning("No sample image found for taxon: %s", pred_taxon)
        return None, f"Sorry, our EOL images do not include {pred_taxon}."

    # Get sample image of selected individual
    try:
        img_src = s3_client.generate_presigned_url(
            'get_object',
            Params={'Bucket': 'treeoflife-10m-sample-images', 'Key': filepath},
        )
        img_resp = requests.get(img_src, timeout=request_timeout)
        # Fail on HTTP errors here; otherwise the error body would be handed to
        # PIL and reported as an unidentifiable image.
        img_resp.raise_for_status()
        img = Image.open(io.BytesIO(img_resp.content))
        full_eol_url = EOL_URL + eol_page_id
        if is_exact:
            eol_page = f"<p>Check out the EOL entry for {pred_taxon} to learn more: <a href={full_eol_url} target='_blank'>{full_eol_url}</a>.</p>"
        else:
            eol_page = f"<p>Check out an example EOL entry within {pred_taxon} to learn more: {full_name} <a href={full_eol_url} target='_blank'>{full_eol_url}</a>.</p>"
        logger.info("Successfully retrieved sample image and EOL page for %s", pred_taxon)
        return img, eol_page
    except Exception as e:
        logger.exception("Error retrieving sample image: %s", e)
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."
70
+
71
def get_sample_data(df, pred_taxon, rank):
    '''
    Randomly select a sample individual of the given taxon and return its image path and EOL page ID.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
    pred_taxon : str
        Predicted taxon of the uploaded image, as space-separated rank values
        in RANKS order.
    rank : int
        Index of rank in RANKS chosen for prediction.

    Returns:
    --------
    filepath : str or None
        Filepath of selected sample image for predicted taxon; None if no row matches.
    eol_page_id : str
        EOL page ID associated with the selected sample (np.nan if no row matches).
    full_name : str
        Full taxonomic name of the selected sample (empty string if no row matches).
    is_exact : bool
        Flag indicating if the match is exact (i.e., with empty lower ranks).

    Raises:
    -------
    IndexError
        If pred_taxon has fewer space-separated parts than rank + 1.
    '''
    # Split once instead of on every loop iteration.
    taxon_parts = pred_taxon.split(" ")
    for idx, rank_name in enumerate(RANKS[:rank + 1]):
        df = df.filter(pl.col(rank_name) == taxon_parts[idx])

    if df.height == 0:
        return None, np.nan, "", False

    # First, try to find entries with empty lower ranks
    exact_df = df
    for lower_rank in RANKS[rank + 1:]:
        exact_df = exact_df.filter((pl.col(lower_rank).is_null()) | (pl.col(lower_rank) == ""))

    is_exact = exact_df.height > 0
    # If no exact matches, fall back to any entry within the specified rank.
    sample = exact_df.sample() if is_exact else df.sample()

    # An exact match is named by the ranks down to `rank`; otherwise include the
    # lower ranks too. Null/empty ranks are skipped so that str.join never
    # receives None and the name contains no doubled spaces.
    name_ranks = RANKS[:rank + 1] if is_exact else RANKS
    full_name = " ".join(
        value for value in sample.select(name_ranks).row(0)
        if value is not None and value != ""
    )
    return sample["file_path"][0], sample["eol_page_id"].cast(pl.String)[0], full_name, is_exact
components/sync_samples_to_s3.bash ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Sync every immediate subdirectory of <BASE_DIR> to the sample-image S3 bucket,
# using the subdirectory name as the key prefix.
#
# The no-op ":" command with a quoted heredoc delimiter keeps the usage text
# below inert (no parameter or command expansion is performed on it).
: <<'COMMENT'
Usage:
    bash sync_samples_to_s3.bash <BASE_DIR>

Dependencies:
    - awscli (https://aws.amazon.com/cli/)
Credentials to export as environment variables:
    - AWS_ACCESS_KEY_ID
    - AWS_SECRET_ACCESS_KEY
COMMENT

# Check if a valid directory is provided as an argument
if [ -z "$1" ]; then
    echo "Usage: $0 <BASE_DIR>" >&2
    exit 1
fi

if [ ! -d "$1" ]; then
    echo "Error: $1 is not a valid directory" >&2
    exit 1
fi

BASE_DIR="$1"
S3_BUCKET="s3://treeoflife-10m-sample-images"

# Loop through all directories and sync them to S3.
# BASE_DIR is quoted so paths containing spaces or glob characters are not
# word-split or expanded; only the trailing /* is globbed.
for dir in "$BASE_DIR"/*; do
    if [ -d "$dir" ]; then
        dir_name=$(basename "$dir")
        aws s3 sync "$dir" "$S3_BUCKET/$dir_name/"
    fi
done