bioclip-demo

Sleeping

App Files Community

Samuel Stevens committed on Nov 30, 2023

Commit

a33c93d

•

1 Parent(s): 6ee7e7c

try hierarchical averaging

Browse files

Files changed (5) hide show

app.py +1 -0
examples/Sarcoscypha-coccinea.jpeg +3 -0
lib.py +50 -6
make_txt_embedding.py +48 -5
test_lib.py +57 -0

app.py CHANGED Viewed

@@ -37,6 +37,7 @@ open_domain_examples = [
     ["examples/Ursus-arctos.jpeg", "Species"],
     ["examples/Phoca-vitulina.png", "Species"],
     ["examples/Felis-catus.jpeg", "Genus"],
 ]
 zero_shot_examples = [
     [

     ["examples/Ursus-arctos.jpeg", "Species"],
     ["examples/Phoca-vitulina.png", "Species"],
     ["examples/Felis-catus.jpeg", "Genus"],
+    ["examples/Sarcoscypha-coccinea.jpeg", "Order"],
 ]
 zero_shot_examples = [
     [

examples/Sarcoscypha-coccinea.jpeg ADDED Viewed

Git LFS Details

SHA256: 84dfec1fe373d375cd31f129dfd961dfa9d0b400575f9dd9610a08d900fd1cf9
Pointer size: 131 Bytes
Size of remote file: 409 kB

lib.py CHANGED Viewed

@@ -1,3 +1,13 @@
 import itertools
 import json
@@ -33,12 +43,30 @@ class TaxonomicNode:
         return self._children[first].children(rest)
-    def __iter__(self):
-        yield self.name, self.index
         for child in self._children.values():
-            for name, index in child:
-                yield f"{self.name} {name}", index
     @classmethod
     def from_dict(cls, dct, root):
@@ -82,9 +110,25 @@ class TaxonomicTree:
         return self.kingdoms[first].children(rest)
-    def __iter__(self):
         for kingdom in self.kingdoms.values():
-            yield from kingdom
     def __len__(self):
         return self.size

+"""
+Mostly a TaxonomicTree class that implements a taxonomy and some helpers for easily
+walking and looking in the tree.
+A tree is an arrangement of TaxonomicNodes.
+"""
 import itertools
 import json
         return self._children[first].children(rest)
+    def descendants(self, prefix=None):
+        """Iterates over all values in the subtree that match prefix."""
+        if not prefix:
+            yield (self.name,), self.index
+            for child in self._children.values():
+                for name, i in child.descendants():
+                    yield (self.name, *name), i
+            return
+        first, rest = prefix[0], prefix[1:]
+        if first not in self._children:
+            return
+        for name, i in self._children[first].descendants(rest):
+            yield (self.name, *name), i
+    def values(self):
+        """Iterates over all (name, i) pairs in the tree."""
+        yield (self.name,), self.index
         for child in self._children.values():
+            for name, index in child.values():
+                yield (self.name, *name), index
     @classmethod
     def from_dict(cls, dct, root):
         return self.kingdoms[first].children(rest)
+    def descendants(self, prefix=None):
+        """Iterates over all values in the tree that match prefix."""
+        if not prefix:
+            # Give them all the subnodes
+            for kingdom in self.kingdoms.values():
+                yield from kingdom.descendants()
+            return
+        first, rest = prefix[0], prefix[1:]
+        if first not in self.kingdoms:
+            return
+        yield from self.kingdoms[first].descendants(rest)
+    def values(self):
+        """Iterates over all (name, i) pairs in the tree."""
         for kingdom in self.kingdoms.values():
+            yield from kingdom.values()
     def __len__(self):
         return self.size

make_txt_embedding.py CHANGED Viewed

@@ -6,20 +6,28 @@ import argparse
 import csv
 import json
 import os
 import numpy as np
 import torch
 import torch.nn.functional as F
 from open_clip import create_model, get_tokenizer
 from tqdm import tqdm
 import lib
 from templates import openai_imagenet_template
 model_str = "hf-hub:imageomics/bioclip"
 tokenizer_str = "ViT-B-16"
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 @torch.no_grad()
 def write_txt_features(name_lookup):
@@ -38,7 +46,7 @@ def write_txt_features(name_lookup):
     ):
         # Skip if any non-zero elements
         if all_features[:, indices].any():
-            print(f"Skipping batch {batch}")
             continue
         txts = [
@@ -59,6 +67,41 @@ def write_txt_features(name_lookup):
     np.save(args.out_path, all_features)
 def get_name_lookup(catalog_path, cache_path):
     if os.path.isfile(cache_path):
         with open(cache_path) as fd:
@@ -106,14 +149,14 @@ if __name__ == "__main__":
     args = parser.parse_args()
     name_lookup = get_name_lookup(args.catalog_path, cache_path=args.name_cache_path)
-    print("Got name lookup.")
     model = create_model(model_str, output_dict=True, require_pretrained=True)
     model = model.to(device)
-    print("Created model.")
     model = torch.compile(model)
-    print("Compiled model.")
     tokenizer = get_tokenizer(tokenizer_str)
     write_txt_features(name_lookup)

 import csv
 import json
 import os
+import logging
 import numpy as np
 import torch
 import torch.nn.functional as F
 from open_clip import create_model, get_tokenizer
 from tqdm import tqdm
 import lib
 from templates import openai_imagenet_template
+log_format = "[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s"
+logging.basicConfig(level=logging.INFO, format=log_format)
+logger = logging.getLogger()
 model_str = "hf-hub:imageomics/bioclip"
 tokenizer_str = "ViT-B-16"
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ranks = ("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
 @torch.no_grad()
 def write_txt_features(name_lookup):
     ):
         # Skip if any non-zero elements
         if all_features[:, indices].any():
+            logger.info(f"Skipping batch {batch}")
             continue
         txts = [
     np.save(args.out_path, all_features)
+def convert_txt_features_to_avgs(name_lookup):
+    assert os.path.isfile(args.out_path)
+    # Put that big boy on the GPU. We're going fast.
+    all_features = torch.from_numpy(np.load(args.out_path)).to(device)
+    logger.info("Loaded text features from disk to %s.", device)
+    all_names = [set() for rank in ranks]
+    for name, index in tqdm(name_lookup.values()):
+        i = len(name) - 1
+        all_names[i].add((name, index))
+    zeroed = 0
+    for i, rank in reversed(list(enumerate(ranks))):
+        if rank == "Species":
+            continue
+        for name, index in tqdm(all_names[i], desc=rank):
+            species = tuple(zip(*((d, i) for d, i in name_lookup.descendants(prefix=name) if len(d) >= 7)))
+            if not species:
+                logger.warning("No species for %s.", " ".join(name))
+                all_features[:, index] = 0.0
+                zeroed += 1
+                continue
+            values, indices = species
+            mean = all_features[:, indices].mean(dim=1)
+            all_features[:, index] = F.normalize(mean, dim=0)
+    out_path, ext = os.path.splitext(args.out_path)
+    np.save(f"{out_path}_avgs{ext}", all_features.cpu().numpy())
+    if zeroed:
+        logger.warning("Zeroed out %d nodes because they didn't have any genus or species-level labels.", zeroed)
 def get_name_lookup(catalog_path, cache_path):
     if os.path.isfile(cache_path):
         with open(cache_path) as fd:
     args = parser.parse_args()
     name_lookup = get_name_lookup(args.catalog_path, cache_path=args.name_cache_path)
+    logger.info("Got name lookup.")
     model = create_model(model_str, output_dict=True, require_pretrained=True)
     model = model.to(device)
+    logger.info("Created model.")
     model = torch.compile(model)
+    logger.info("Compiled model.")
     tokenizer = get_tokenizer(tokenizer_str)
     write_txt_features(name_lookup)
+    convert_txt_features_to_avgs(name_lookup)

test_lib.py CHANGED Viewed

@@ -422,3 +422,60 @@ def test_taxonomiclookup_children_of_gorilla():
     )
     expected = set()
     assert actual == expected

     )
     expected = set()
     assert actual == expected
+def test_taxonomictree_descendants_last():
+    lookup = lib.TaxonomicTree()
+    lookup.add(("A", "B", "C", "D", "E", "F", "G"))
+    actual = list(lookup.descendants(("A", "B", "C", "D", "E", "F", "G")))
+    expected = [
+        (("A", "B", "C", "D", "E", "F", "G"), 6),
+    ]
+    assert actual == expected
+def test_taxonomictree_descendants_entire_tree():
+    lookup = lib.TaxonomicTree()
+    lookup.add(("A", "B"))
+    actual = list(lookup.descendants())
+    expected = [
+        (("A",), 0),
+        (("A", "B"), 1),
+    ]
+    assert actual == expected
+def test_taxonomictree_descendants_entire_tree_with_prefix():
+    lookup = lib.TaxonomicTree()
+    lookup.add(("A", "B"))
+    actual = list(lookup.descendants(prefix=("A",)))
+    expected = [
+        (("A",), 0),
+        (("A", "B"), 1),
+    ]
+    assert actual == expected
+def test_taxonomictree_descendants_general():
+    lookup = lib.TaxonomicTree()
+    lookup.add(("A", "B", "C", "D", "E", "F", "G"))
+    actual = list(lookup.descendants(("A", "B", "C", "D")))
+    expected = [
+        (("A", "B", "C", "D"), 3),
+        (("A", "B", "C", "D", "E"), 4),
+        (("A", "B", "C", "D", "E", "F"), 5),
+        (("A", "B", "C", "D", "E", "F", "G"), 6),
+    ]
+    assert actual == expected