fffiloni committed
Commit c705408
1 parent: 172ef2e

Migrated from GitHub

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +11 -0
  2. LightGlue/.flake8 +4 -0
  3. LightGlue/LICENSE +201 -0
  4. LightGlue/README.md +180 -0
  5. LightGlue/assets/DSC_0410.JPG +0 -0
  6. LightGlue/assets/DSC_0411.JPG +0 -0
  7. LightGlue/assets/architecture.svg +0 -0
  8. LightGlue/assets/benchmark.png +0 -0
  9. LightGlue/assets/benchmark_cpu.png +0 -0
  10. LightGlue/assets/easy_hard.jpg +0 -0
  11. LightGlue/assets/sacre_coeur1.jpg +0 -0
  12. LightGlue/assets/sacre_coeur2.jpg +0 -0
  13. LightGlue/assets/teaser.svg +1499 -0
  14. LightGlue/benchmark.py +255 -0
  15. LightGlue/demo.ipynb +0 -0
  16. LightGlue/lightglue/__init__.py +7 -0
  17. LightGlue/lightglue/aliked.py +758 -0
  18. LightGlue/lightglue/disk.py +55 -0
  19. LightGlue/lightglue/dog_hardnet.py +41 -0
  20. LightGlue/lightglue/lightglue.py +655 -0
  21. LightGlue/lightglue/sift.py +216 -0
  22. LightGlue/lightglue/superpoint.py +227 -0
  23. LightGlue/lightglue/utils.py +165 -0
  24. LightGlue/lightglue/viz2d.py +185 -0
  25. LightGlue/pyproject.toml +30 -0
  26. LightGlue/requirements.txt +6 -0
  27. ORIGINAL_README.md +115 -0
  28. cotracker/__init__.py +5 -0
  29. cotracker/build/lib/datasets/__init__.py +5 -0
  30. cotracker/build/lib/datasets/dataclass_utils.py +166 -0
  31. cotracker/build/lib/datasets/dr_dataset.py +161 -0
  32. cotracker/build/lib/datasets/kubric_movif_dataset.py +441 -0
  33. cotracker/build/lib/datasets/tap_vid_datasets.py +209 -0
  34. cotracker/build/lib/datasets/utils.py +106 -0
  35. cotracker/build/lib/evaluation/__init__.py +5 -0
  36. cotracker/build/lib/evaluation/core/__init__.py +5 -0
  37. cotracker/build/lib/evaluation/core/eval_utils.py +138 -0
  38. cotracker/build/lib/evaluation/core/evaluator.py +253 -0
  39. cotracker/build/lib/evaluation/evaluate.py +169 -0
  40. cotracker/build/lib/models/__init__.py +5 -0
  41. cotracker/build/lib/models/build_cotracker.py +33 -0
  42. cotracker/build/lib/models/core/__init__.py +5 -0
  43. cotracker/build/lib/models/core/cotracker/__init__.py +5 -0
  44. cotracker/build/lib/models/core/cotracker/blocks.py +367 -0
  45. cotracker/build/lib/models/core/cotracker/cotracker.py +503 -0
  46. cotracker/build/lib/models/core/cotracker/losses.py +61 -0
  47. cotracker/build/lib/models/core/embeddings.py +120 -0
  48. cotracker/build/lib/models/core/model_utils.py +271 -0
  49. cotracker/build/lib/models/evaluation_predictor.py +104 -0
  50. cotracker/build/lib/utils/__init__.py +5 -0
.gitattributes CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ figure/showcases/image1.gif filter=lfs diff=lfs merge=lfs -text
37
+ figure/showcases/image2.gif filter=lfs diff=lfs merge=lfs -text
38
+ figure/showcases/image29.gif filter=lfs diff=lfs merge=lfs -text
39
+ figure/showcases/image3.gif filter=lfs diff=lfs merge=lfs -text
40
+ figure/showcases/image30.gif filter=lfs diff=lfs merge=lfs -text
41
+ figure/showcases/image31.gif filter=lfs diff=lfs merge=lfs -text
42
+ figure/showcases/image33.gif filter=lfs diff=lfs merge=lfs -text
43
+ figure/showcases/image34.gif filter=lfs diff=lfs merge=lfs -text
44
+ figure/showcases/image35.gif filter=lfs diff=lfs merge=lfs -text
45
+ figure/showcases/image4.gif filter=lfs diff=lfs merge=lfs -text
46
+ figure/teaser.png filter=lfs diff=lfs merge=lfs -text
LightGlue/.flake8 ADDED
@@ -0,0 +1,4 @@
1
+ [flake8]
2
+ max-line-length = 88
3
+ extend-ignore = E203
4
+ exclude = .git,__pycache__,build,.venv/
LightGlue/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2023 ETH Zurich
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
LightGlue/README.md ADDED
@@ -0,0 +1,180 @@
1
+ <p align="center">
2
+ <h1 align="center"><ins>LightGlue</ins> ⚡️<br>Local Feature Matching at Light Speed</h1>
3
+ <p align="center">
4
+ <a href="https://www.linkedin.com/in/philipplindenberger/">Philipp Lindenberger</a>
5
+ ·
6
+ <a href="https://psarlin.com/">Paul-Edouard&nbsp;Sarlin</a>
7
+ ·
8
+ <a href="https://www.microsoft.com/en-us/research/people/mapoll/">Marc&nbsp;Pollefeys</a>
9
+ </p>
10
+ <h2 align="center">
11
+ <p>ICCV 2023</p>
12
+ <a href="https://arxiv.org/pdf/2306.13643.pdf" align="center">Paper</a> |
13
+ <a href="https://colab.research.google.com/github/cvg/LightGlue/blob/main/demo.ipynb" align="center">Colab</a> |
14
+ <a href="https://psarlin.com/assets/LightGlue_ICCV2023_poster_compressed.pdf" align="center">Poster</a> |
15
+ <a href="https://github.com/cvg/glue-factory" align="center">Train your own!</a>
16
+ </h2>
17
+
18
+ </p>
19
+ <p align="center">
20
+ <a href="https://arxiv.org/abs/2306.13643"><img src="assets/easy_hard.jpg" alt="example" width=80%></a>
21
+ <br>
22
+ <em>LightGlue is a deep neural network that matches sparse local features across image pairs.<br>An adaptive mechanism makes it fast for easy pairs (top) and reduces the computational complexity for difficult ones (bottom).</em>
23
+ </p>
24
+
25
+ ##
26
+
27
+ This repository hosts the inference code of LightGlue, a lightweight feature matcher with high accuracy and blazing fast inference. It takes as input a set of keypoints and descriptors for each image and returns the indices of corresponding points. The architecture is based on adaptive pruning techniques, in both network width and depth - [check out the paper for more details](https://arxiv.org/pdf/2306.13643.pdf).
28
+
29
+ We release pretrained weights of LightGlue with [SuperPoint](https://arxiv.org/abs/1712.07629), [DISK](https://arxiv.org/abs/2006.13566), [ALIKED](https://arxiv.org/abs/2304.03608) and [SIFT](https://www.cs.ubc.ca/~lowe/papers/ijcv04.pdf) local features.
30
+ The training and evaluation code can be found in our library [glue-factory](https://github.com/cvg/glue-factory/).
31
+
32
+ ## Installation and demo [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cvg/LightGlue/blob/main/demo.ipynb)
33
+
34
+ Install this repo using pip:
35
+
36
+ ```bash
37
+ git clone https://github.com/cvg/LightGlue.git && cd LightGlue
38
+ python -m pip install -e .
39
+ ```
40
+
41
+ We provide a [demo notebook](demo.ipynb) which shows how to perform feature extraction and matching on an image pair.
42
+
43
+ Here is a minimal script to match two images:
44
+
45
+ ```python
46
+ from lightglue import LightGlue, SuperPoint, DISK, SIFT, ALIKED, DoGHardNet
47
+ from lightglue.utils import load_image, rbd
48
+
49
+ # SuperPoint+LightGlue
50
+ extractor = SuperPoint(max_num_keypoints=2048).eval().cuda() # load the extractor
51
+ matcher = LightGlue(features='superpoint').eval().cuda() # load the matcher
52
+
53
+ # or DISK+LightGlue, ALIKED+LightGlue or SIFT+LightGlue
54
+ extractor = DISK(max_num_keypoints=2048).eval().cuda() # load the extractor
55
+ matcher = LightGlue(features='disk').eval().cuda() # load the matcher
56
+
57
+ # load each image as a torch.Tensor on GPU with shape (3,H,W), normalized in [0,1]
58
+ image0 = load_image('path/to/image_0.jpg').cuda()
59
+ image1 = load_image('path/to/image_1.jpg').cuda()
60
+
61
+ # extract local features
62
+ feats0 = extractor.extract(image0) # auto-resize the image, disable with resize=None
63
+ feats1 = extractor.extract(image1)
64
+
65
+ # match the features
66
+ matches01 = matcher({'image0': feats0, 'image1': feats1})
67
+ feats0, feats1, matches01 = [rbd(x) for x in [feats0, feats1, matches01]] # remove batch dimension
68
+ matches = matches01['matches'] # indices with shape (K,2)
69
+ points0 = feats0['keypoints'][matches[..., 0]] # coordinates in image #0, shape (K,2)
70
+ points1 = feats1['keypoints'][matches[..., 1]] # coordinates in image #1, shape (K,2)
71
+ ```
72
+
73
+ We also provide a convenience method to match a pair of images:
74
+
75
+ ```python
76
+ from lightglue import match_pair
77
+ feats0, feats1, matches01 = match_pair(extractor, matcher, image0, image1)
78
+ ```
79
+
80
+ ##
81
+
82
+ <p align="center">
83
+ <a href="https://arxiv.org/abs/2306.13643"><img src="assets/teaser.svg" alt="Logo" width=50%></a>
84
+ <br>
85
+ <em>LightGlue can adjust its depth (number of layers) and width (number of keypoints) per image pair, with a marginal impact on accuracy.</em>
86
+ </p>
87
+
88
+ ## Advanced configuration
89
+
90
+ <details>
91
+ <summary>[Detail of all parameters - click to expand]</summary>
92
+
93
+ - ```n_layers```: Number of stacked self+cross attention layers. Reduce this value for faster inference at the cost of accuracy (continuous red line in the plot above). Default: 9 (all layers).
94
+ - ```flash```: Enable FlashAttention. Significantly increases the speed and reduces the memory consumption without any impact on accuracy. Default: True (LightGlue automatically detects if FlashAttention is available).
95
+ - ```mp```: Enable mixed precision inference. Default: False (off)
96
+ - ```depth_confidence```: Controls early stopping. A lower value stops more often at earlier layers. Default: 0.95, disable with -1.
97
+ - ```width_confidence```: Controls the iterative point pruning. A lower value prunes more points earlier. Default: 0.99, disable with -1.
98
+ - ```filter_threshold```: Match confidence. Increase this value to obtain fewer but stronger matches. Default: 0.1
99
+
100
+ </details>
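For illustration, a minimal sketch that passes the parameters listed above explicitly to the constructor; the values shown are simply the documented defaults, not tuned recommendations:

```python
from lightglue import LightGlue

# Sketch only: every value below is the documented default from the list above.
matcher = LightGlue(
    features='superpoint',
    n_layers=9,             # stacked self+cross attention layers
    flash=True,             # use FlashAttention when available
    mp=False,               # mixed-precision inference
    depth_confidence=0.95,  # early stopping; set to -1 to disable
    width_confidence=0.99,  # point pruning; set to -1 to disable
    filter_threshold=0.1,   # minimum match confidence
).eval().cuda()
```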
101
+
102
+ The default values give a good trade-off between speed and accuracy. To maximize the accuracy, use all keypoints and disable the adaptive mechanisms:
103
+ ```python
104
+ extractor = SuperPoint(max_num_keypoints=None)
105
+ matcher = LightGlue(features='superpoint', depth_confidence=-1, width_confidence=-1)
106
+ ```
107
+
108
+ To increase the speed with a small drop in accuracy, decrease the number of keypoints and lower the adaptive thresholds:
109
+ ```python
110
+ extractor = SuperPoint(max_num_keypoints=1024)
111
+ matcher = LightGlue(features='superpoint', depth_confidence=0.9, width_confidence=0.95)
112
+ ```
113
+
114
+ The maximum speed is obtained with a combination of:
115
+ - [FlashAttention](https://arxiv.org/abs/2205.14135): automatically used when ```torch >= 2.0``` or if [installed from source](https://github.com/HazyResearch/flash-attention#installation-and-features).
116
+ - PyTorch compilation, available when ```torch >= 2.0```:
117
+ ```python
118
+ matcher = matcher.eval().cuda()
119
+ matcher.compile(mode='reduce-overhead')
120
+ ```
121
+ For inputs with fewer than 1536 keypoints (determined experimentally), this compiles LightGlue but disables point pruning (large overhead). For larger input sizes, it automatically falls back to eager mode with point pruning. Adaptive depth is supported for any input size.
122
+
123
+ ## Benchmark
124
+
125
+
126
+ <p align="center">
127
+ <a><img src="assets/benchmark.png" alt="Logo" width=80%></a>
128
+ <br>
129
+ <em>Benchmark results on GPU (RTX 3080). With compilation and adaptivity, LightGlue runs at 150 FPS @ 1024 keypoints and 50 FPS @ 4096 keypoints per image. This is a 4-10x speedup over SuperGlue. </em>
130
+ </p>
131
+
132
+ <p align="center">
133
+ <a><img src="assets/benchmark_cpu.png" alt="Logo" width=80%></a>
134
+ <br>
135
+ <em>Benchmark results on CPU (Intel i7 10700K). LightGlue runs at 20 FPS @ 512 keypoints. </em>
136
+ </p>
137
+
138
+ Obtain the same plots for your setup using our [benchmark script](benchmark.py):
139
+ ```
140
+ python benchmark.py [--device cuda] [--add_superglue] [--num_keypoints 512 1024 2048 4096] [--compile]
141
+ ```
142
+
143
+ <details>
144
+ <summary>[Performance tip - click to expand]</summary>
145
+
146
+ Note: **Point pruning** introduces an overhead that sometimes outweighs its benefits.
147
+ Point pruning is thus enabled only when there are more than N keypoints in an image, where N is hardware-dependent.
148
+ We provide defaults optimized for current hardware (RTX 30xx GPUs).
149
+ We suggest running the benchmark script and adjusting the thresholds for your hardware by updating `LightGlue.pruning_keypoint_thresholds['cuda']`.
150
+
151
+ </details>
152
+
153
+ ## Training and evaluation
154
+
155
+ With [Glue Factory](https://github.com/cvg/glue-factory), you can train LightGlue with your own local features, on your own dataset!
156
+ You can also evaluate it and other baselines on standard benchmarks like HPatches and MegaDepth.
157
+
158
+ ## Other links
159
+ - [hloc - the visual localization toolbox](https://github.com/cvg/Hierarchical-Localization/): run LightGlue for Structure-from-Motion and visual localization.
160
+ - [LightGlue-ONNX](https://github.com/fabio-sim/LightGlue-ONNX): export LightGlue to the Open Neural Network Exchange (ONNX) format with support for TensorRT and OpenVINO.
161
+ - [Image Matching WebUI](https://github.com/Vincentqyw/image-matching-webui): a web GUI to easily compare different matchers, including LightGlue.
162
+ - [kornia](https://kornia.readthedocs.io) now exposes LightGlue via the interfaces [`LightGlue`](https://kornia.readthedocs.io/en/latest/feature.html#kornia.feature.LightGlue) and [`LightGlueMatcher`](https://kornia.readthedocs.io/en/latest/feature.html#kornia.feature.LightGlueMatcher).
163
+
164
+ ## BibTeX citation
165
+ If you use any ideas from the paper or code from this repo, please consider citing:
166
+
167
+ ```txt
168
+ @inproceedings{lindenberger2023lightglue,
169
+ author = {Philipp Lindenberger and
170
+ Paul-Edouard Sarlin and
171
+ Marc Pollefeys},
172
+ title = {{LightGlue: Local Feature Matching at Light Speed}},
173
+ booktitle = {ICCV},
174
+ year = {2023}
175
+ }
176
+ ```
177
+
178
+
179
+ ## License
180
+ The pre-trained weights of LightGlue and the code provided in this repository are released under the [Apache-2.0 license](./LICENSE). [DISK](https://github.com/cvlab-epfl/disk) follows this license as well but SuperPoint follows [a different, restrictive license](https://github.com/magicleap/SuperPointPretrainedNetwork/blob/master/LICENSE) (this includes its pre-trained weights and its [inference file](./lightglue/superpoint.py)). [ALIKED](https://github.com/Shiaoming/ALIKED) was published under a BSD-3-Clause license.
LightGlue/assets/DSC_0410.JPG ADDED
LightGlue/assets/DSC_0411.JPG ADDED
LightGlue/assets/architecture.svg ADDED
LightGlue/assets/benchmark.png ADDED
LightGlue/assets/benchmark_cpu.png ADDED
LightGlue/assets/easy_hard.jpg ADDED
LightGlue/assets/sacre_coeur1.jpg ADDED
LightGlue/assets/sacre_coeur2.jpg ADDED
LightGlue/assets/teaser.svg ADDED
LightGlue/benchmark.py ADDED
@@ -0,0 +1,255 @@
1
+ # Benchmark script for LightGlue on real images
2
+ import argparse
3
+ import time
4
+ from collections import defaultdict
5
+ from pathlib import Path
6
+
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import torch
10
+ import torch._dynamo
11
+
12
+ from lightglue import LightGlue, SuperPoint
13
+ from lightglue.utils import load_image
14
+
15
+ torch.set_grad_enabled(False)
16
+
17
+
18
+ def measure(matcher, data, device="cuda", r=100):
19
+ timings = np.zeros((r, 1))
20
+ if device.type == "cuda":
21
+ starter = torch.cuda.Event(enable_timing=True)
22
+ ender = torch.cuda.Event(enable_timing=True)
23
+ # warmup
24
+ for _ in range(10):
25
+ _ = matcher(data)
26
+ # measurements
27
+ with torch.no_grad():
28
+ for rep in range(r):
29
+ if device.type == "cuda":
30
+ starter.record()
31
+ _ = matcher(data)
32
+ ender.record()
33
+ # sync gpu
34
+ torch.cuda.synchronize()
35
+ curr_time = starter.elapsed_time(ender)
36
+ else:
37
+ start = time.perf_counter()
38
+ _ = matcher(data)
39
+ curr_time = (time.perf_counter() - start) * 1e3
40
+ timings[rep] = curr_time
41
+ mean_syn = np.sum(timings) / r
42
+ std_syn = np.std(timings)
43
+ return {"mean": mean_syn, "std": std_syn}
44
+
45
+
46
+ def print_as_table(d, title, cnames):
47
+ print()
48
+ header = f"{title:30} " + " ".join([f"{x:>7}" for x in cnames])
49
+ print(header)
50
+ print("-" * len(header))
51
+ for k, l in d.items():
52
+ print(f"{k:30}", " ".join([f"{x:>7.1f}" for x in l]))
53
+
54
+
55
+ if __name__ == "__main__":
56
+ parser = argparse.ArgumentParser(description="Benchmark script for LightGlue")
57
+ parser.add_argument(
58
+ "--device",
59
+ choices=["auto", "cuda", "cpu", "mps"],
60
+ default="auto",
61
+ help="device to benchmark on",
62
+ )
63
+ parser.add_argument("--compile", action="store_true", help="Compile LightGlue runs")
64
+ parser.add_argument(
65
+ "--no_flash", action="store_true", help="disable FlashAttention"
66
+ )
67
+ parser.add_argument(
68
+ "--no_prune_thresholds",
69
+ action="store_true",
70
+ help="disable pruning thresholds (i.e. always do pruning)",
71
+ )
72
+ parser.add_argument(
73
+ "--add_superglue",
74
+ action="store_true",
75
+ help="add SuperGlue to the benchmark (requires hloc)",
76
+ )
77
+ parser.add_argument(
78
+ "--measure", default="time", choices=["time", "log-time", "throughput"]
79
+ )
80
+ parser.add_argument(
81
+ "--repeat", "--r", type=int, default=100, help="repetitions of measurements"
82
+ )
83
+ parser.add_argument(
84
+ "--num_keypoints",
85
+ nargs="+",
86
+ type=int,
87
+ default=[256, 512, 1024, 2048, 4096],
88
+ help="number of keypoints (list separated by spaces)",
89
+ )
90
+ parser.add_argument(
91
+ "--matmul_precision", default="highest", choices=["highest", "high", "medium"]
92
+ )
93
+ parser.add_argument(
94
+ "--save", default=None, type=str, help="path where figure should be saved"
95
+ )
96
+ args = parser.parse_intermixed_args()
97
+
98
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
99
+ if args.device != "auto":
100
+ device = torch.device(args.device)
101
+
102
+ print("Running benchmark on device:", device)
103
+
104
+ images = Path("assets")
105
+ inputs = {
106
+ "easy": (
107
+ load_image(images / "DSC_0411.JPG"),
108
+ load_image(images / "DSC_0410.JPG"),
109
+ ),
110
+ "difficult": (
111
+ load_image(images / "sacre_coeur1.jpg"),
112
+ load_image(images / "sacre_coeur2.jpg"),
113
+ ),
114
+ }
115
+
116
+ configs = {
117
+ "LightGlue-full": {
118
+ "depth_confidence": -1,
119
+ "width_confidence": -1,
120
+ },
121
+ # 'LG-prune': {
122
+ # 'width_confidence': -1,
123
+ # },
124
+ # 'LG-depth': {
125
+ # 'depth_confidence': -1,
126
+ # },
127
+ "LightGlue-adaptive": {},
128
+ }
129
+
130
+ if args.compile:
131
+ configs = {**configs, **{k + "-compile": v for k, v in configs.items()}}
132
+
133
+ sg_configs = {
134
+ # 'SuperGlue': {},
135
+ "SuperGlue-fast": {"sinkhorn_iterations": 5}
136
+ }
137
+
138
+ torch.set_float32_matmul_precision(args.matmul_precision)
139
+
140
+ results = {k: defaultdict(list) for k, v in inputs.items()}
141
+
142
+ extractor = SuperPoint(max_num_keypoints=None, detection_threshold=-1)
143
+ extractor = extractor.eval().to(device)
144
+ figsize = (len(inputs) * 4.5, 4.5)
145
+ fig, axes = plt.subplots(1, len(inputs), sharey=True, figsize=figsize)
146
+ axes = axes if len(inputs) > 1 else [axes]
147
+ fig.canvas.manager.set_window_title(f"LightGlue benchmark ({device.type})")
148
+
149
+ for title, ax in zip(inputs.keys(), axes):
150
+ ax.set_xscale("log", base=2)
151
+ bases = [2**x for x in range(7, 16)]
152
+ ax.set_xticks(bases, bases)
153
+ ax.grid(which="major")
154
+ if args.measure == "log-time":
155
+ ax.set_yscale("log")
156
+ yticks = [10**x for x in range(6)]
157
+ ax.set_yticks(yticks, yticks)
158
+ mpos = [10**x * i for x in range(6) for i in range(2, 10)]
159
+ mlabel = [
160
+ 10**x * i if i in [2, 5] else None
161
+ for x in range(6)
162
+ for i in range(2, 10)
163
+ ]
164
+ ax.set_yticks(mpos, mlabel, minor=True)
165
+ ax.grid(which="minor", linewidth=0.2)
166
+ ax.set_title(title)
167
+
168
+ ax.set_xlabel("# keypoints")
169
+ if args.measure == "throughput":
170
+ ax.set_ylabel("Throughput [pairs/s]")
171
+ else:
172
+ ax.set_ylabel("Latency [ms]")
173
+
174
+ for name, conf in configs.items():
175
+ print("Run benchmark for:", name)
176
+ torch.cuda.empty_cache()
177
+ matcher = LightGlue(features="superpoint", flash=not args.no_flash, **conf)
178
+ if args.no_prune_thresholds:
179
+ matcher.pruning_keypoint_thresholds = {
180
+ k: -1 for k in matcher.pruning_keypoint_thresholds
181
+ }
182
+ matcher = matcher.eval().to(device)
183
+ if name.endswith("compile"):
184
+ import torch._dynamo
185
+
186
+ torch._dynamo.reset() # avoid buffer overflow
187
+ matcher.compile()
188
+ for pair_name, ax in zip(inputs.keys(), axes):
189
+ image0, image1 = [x.to(device) for x in inputs[pair_name]]
190
+ runtimes = []
191
+ for num_kpts in args.num_keypoints:
192
+ extractor.conf.max_num_keypoints = num_kpts
193
+ feats0 = extractor.extract(image0)
194
+ feats1 = extractor.extract(image1)
195
+ runtime = measure(
196
+ matcher,
197
+ {"image0": feats0, "image1": feats1},
198
+ device=device,
199
+ r=args.repeat,
200
+ )["mean"]
201
+ results[pair_name][name].append(
202
+ 1000 / runtime if args.measure == "throughput" else runtime
203
+ )
204
+ ax.plot(
205
+ args.num_keypoints, results[pair_name][name], label=name, marker="o"
206
+ )
207
+ del matcher, feats0, feats1
208
+
209
+ if args.add_superglue:
210
+ from hloc.matchers.superglue import SuperGlue
211
+
212
+ for name, conf in sg_configs.items():
213
+ print("Run benchmark for:", name)
214
+ matcher = SuperGlue(conf)
215
+ matcher = matcher.eval().to(device)
216
+ for pair_name, ax in zip(inputs.keys(), axes):
217
+ image0, image1 = [x.to(device) for x in inputs[pair_name]]
218
+ runtimes = []
219
+ for num_kpts in args.num_keypoints:
220
+ extractor.conf.max_num_keypoints = num_kpts
221
+ feats0 = extractor.extract(image0)
222
+ feats1 = extractor.extract(image1)
223
+ data = {
224
+ "image0": image0[None],
225
+ "image1": image1[None],
226
+ **{k + "0": v for k, v in feats0.items()},
227
+ **{k + "1": v for k, v in feats1.items()},
228
+ }
229
+ data["scores0"] = data["keypoint_scores0"]
230
+ data["scores1"] = data["keypoint_scores1"]
231
+ data["descriptors0"] = (
232
+ data["descriptors0"].transpose(-1, -2).contiguous()
233
+ )
234
+ data["descriptors1"] = (
235
+ data["descriptors1"].transpose(-1, -2).contiguous()
236
+ )
237
+ runtime = measure(matcher, data, device=device, r=args.repeat)[
238
+ "mean"
239
+ ]
240
+ results[pair_name][name].append(
241
+ 1000 / runtime if args.measure == "throughput" else runtime
242
+ )
243
+ ax.plot(
244
+ args.num_keypoints, results[pair_name][name], label=name, marker="o"
245
+ )
246
+ del matcher, data, image0, image1, feats0, feats1
247
+
248
+ for name, runtimes in results.items():
249
+ print_as_table(runtimes, name, args.num_keypoints)
250
+
251
+ axes[0].legend()
252
+ fig.tight_layout()
253
+ if args.save:
254
+ plt.savefig(args.save, dpi=fig.dpi)
255
+ plt.show()
LightGlue/demo.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
LightGlue/lightglue/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ from .aliked import ALIKED # noqa
2
+ from .disk import DISK # noqa
3
+ from .dog_hardnet import DoGHardNet # noqa
4
+ from .lightglue import LightGlue # noqa
5
+ from .sift import SIFT # noqa
6
+ from .superpoint import SuperPoint # noqa
7
+ from .utils import match_pair # noqa
LightGlue/lightglue/aliked.py ADDED
@@ -0,0 +1,758 @@
1
+ # BSD 3-Clause License
2
+
3
+ # Copyright (c) 2022, Zhao Xiaoming
4
+ # All rights reserved.
5
+
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+
31
+ # Authors:
32
+ # Xiaoming Zhao, Xingming Wu, Weihai Chen, Peter C.Y. Chen, Qingsong Xu, and Zhengguo Li
33
+ # Code from https://github.com/Shiaoming/ALIKED
34
+
35
+ from typing import Callable, Optional
36
+
37
+ import torch
38
+ import torch.nn.functional as F
39
+ import torchvision
40
+ from kornia.color import grayscale_to_rgb
41
+ from torch import nn
42
+ from torch.nn.modules.utils import _pair
43
+ from torchvision.models import resnet
44
+
45
+ from .utils import Extractor
46
+
47
+
48
+ def get_patches(
49
+ tensor: torch.Tensor, required_corners: torch.Tensor, ps: int
50
+ ) -> torch.Tensor:
51
+ c, h, w = tensor.shape
52
+ corner = (required_corners - ps / 2 + 1).long()
53
+ corner[:, 0] = corner[:, 0].clamp(min=0, max=w - 1 - ps)
54
+ corner[:, 1] = corner[:, 1].clamp(min=0, max=h - 1 - ps)
55
+ offset = torch.arange(0, ps)
56
+
57
+ kw = {"indexing": "ij"} if torch.__version__ >= "1.10" else {}
58
+ x, y = torch.meshgrid(offset, offset, **kw)
59
+ patches = torch.stack((x, y)).permute(2, 1, 0).unsqueeze(2)
60
+ patches = patches.to(corner) + corner[None, None]
61
+ pts = patches.reshape(-1, 2)
62
+ sampled = tensor.permute(1, 2, 0)[tuple(pts.T)[::-1]]
63
+ sampled = sampled.reshape(ps, ps, -1, c)
64
+ assert sampled.shape[:3] == patches.shape[:3]
65
+ return sampled.permute(2, 3, 0, 1)
66
+
67
+
68
+ def simple_nms(scores: torch.Tensor, nms_radius: int):
69
+ """Fast Non-maximum suppression to remove nearby points"""
70
+
71
+ zeros = torch.zeros_like(scores)
72
+ max_mask = scores == torch.nn.functional.max_pool2d(
73
+ scores, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius
74
+ )
75
+
76
+ for _ in range(2):
77
+ supp_mask = (
78
+ torch.nn.functional.max_pool2d(
79
+ max_mask.float(),
80
+ kernel_size=nms_radius * 2 + 1,
81
+ stride=1,
82
+ padding=nms_radius,
83
+ )
84
+ > 0
85
+ )
86
+ supp_scores = torch.where(supp_mask, zeros, scores)
87
+ new_max_mask = supp_scores == torch.nn.functional.max_pool2d(
88
+ supp_scores, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius
89
+ )
90
+ max_mask = max_mask | (new_max_mask & (~supp_mask))
91
+ return torch.where(max_mask, scores, zeros)
92
+
93
+
94
+ class DKD(nn.Module):
95
+ def __init__(
96
+ self,
97
+ radius: int = 2,
98
+ top_k: int = 0,
99
+ scores_th: float = 0.2,
100
+ n_limit: int = 20000,
101
+ ):
102
+ """
103
+ Args:
104
+ radius: soft detection radius, kernel size is (2 * radius + 1)
105
+ top_k: top_k > 0: return top k keypoints
106
+ scores_th: top_k <= 0 threshold mode:
107
+ scores_th > 0: return keypoints with scores>scores_th
108
+ else: return keypoints with scores > scores.mean()
109
+ n_limit: max number of keypoint in threshold mode
110
+ """
111
+ super().__init__()
112
+ self.radius = radius
113
+ self.top_k = top_k
114
+ self.scores_th = scores_th
115
+ self.n_limit = n_limit
116
+ self.kernel_size = 2 * self.radius + 1
117
+ self.temperature = 0.1 # tuned temperature
118
+ self.unfold = nn.Unfold(kernel_size=self.kernel_size, padding=self.radius)
119
+ # local xy grid
120
+ x = torch.linspace(-self.radius, self.radius, self.kernel_size)
121
+ # (kernel_size*kernel_size) x 2 : (w,h)
122
+ kw = {"indexing": "ij"} if torch.__version__ >= "1.10" else {}
123
+ self.hw_grid = (
124
+ torch.stack(torch.meshgrid([x, x], **kw)).view(2, -1).t()[:, [1, 0]]
125
+ )
126
+
127
+ def forward(
128
+ self,
129
+ scores_map: torch.Tensor,
130
+ sub_pixel: bool = True,
131
+ image_size: Optional[torch.Tensor] = None,
132
+ ):
133
+ """
134
+ :param scores_map: Bx1xHxW
135
+ :param descriptor_map: BxCxHxW
136
+ :param sub_pixel: whether to use sub-pixel keypoint detection
137
+ :return: kpts: list[Nx2,...]; kptscores: list[N,....] normalised position: -1~1
138
+ """
139
+ b, c, h, w = scores_map.shape
140
+ scores_nograd = scores_map.detach()
141
+ nms_scores = simple_nms(scores_nograd, self.radius)
142
+
143
+ # remove border
144
+ nms_scores[:, :, : self.radius, :] = 0
145
+ nms_scores[:, :, :, : self.radius] = 0
146
+ if image_size is not None:
147
+ for i in range(scores_map.shape[0]):
148
+ w, h = image_size[i].long()
149
+ nms_scores[i, :, h.item() - self.radius :, :] = 0
150
+ nms_scores[i, :, :, w.item() - self.radius :] = 0
151
+ else:
152
+ nms_scores[:, :, -self.radius :, :] = 0
153
+ nms_scores[:, :, :, -self.radius :] = 0
154
+
155
+ # detect keypoints without grad
156
+ if self.top_k > 0:
157
+ topk = torch.topk(nms_scores.view(b, -1), self.top_k)
158
+ indices_keypoints = [topk.indices[i] for i in range(b)] # B x top_k
159
+ else:
160
+ if self.scores_th > 0:
161
+ masks = nms_scores > self.scores_th
162
+ if masks.sum() == 0:
163
+ th = scores_nograd.reshape(b, -1).mean(dim=1) # th = self.scores_th
164
+ masks = nms_scores > th.reshape(b, 1, 1, 1)
165
+ else:
166
+ th = scores_nograd.reshape(b, -1).mean(dim=1) # th = self.scores_th
167
+ masks = nms_scores > th.reshape(b, 1, 1, 1)
168
+ masks = masks.reshape(b, -1)
169
+
170
+ indices_keypoints = [] # list, B x (any size)
171
+ scores_view = scores_nograd.reshape(b, -1)
172
+ for mask, scores in zip(masks, scores_view):
173
+ indices = mask.nonzero()[:, 0]
174
+ if len(indices) > self.n_limit:
175
+ kpts_sc = scores[indices]
176
+ sort_idx = kpts_sc.sort(descending=True)[1]
177
+ sel_idx = sort_idx[: self.n_limit]
178
+ indices = indices[sel_idx]
179
+ indices_keypoints.append(indices)
180
+
181
+ wh = torch.tensor([w - 1, h - 1], device=scores_nograd.device)
182
+
183
+ keypoints = []
184
+ scoredispersitys = []
185
+ kptscores = []
186
+ if sub_pixel:
187
+ # detect soft keypoints with grad backpropagation
188
+ patches = self.unfold(scores_map) # B x (kernel**2) x (H*W)
189
+ self.hw_grid = self.hw_grid.to(scores_map) # to device
190
+ for b_idx in range(b):
191
+ patch = patches[b_idx].t() # (H*W) x (kernel**2)
192
+ indices_kpt = indices_keypoints[
193
+ b_idx
194
+ ] # one dimension vector, say its size is M
195
+ patch_scores = patch[indices_kpt] # M x (kernel**2)
196
+ keypoints_xy_nms = torch.stack(
197
+ [indices_kpt % w, torch.div(indices_kpt, w, rounding_mode="trunc")],
198
+ dim=1,
199
+ ) # Mx2
200
+
201
+ # max is detached to prevent undesired backprop loops in the graph
202
+ max_v = patch_scores.max(dim=1).values.detach()[:, None]
203
+ x_exp = (
204
+ (patch_scores - max_v) / self.temperature
205
+ ).exp() # M * (kernel**2), in [0, 1]
206
+
207
+ # \frac{ \sum{(i,j) \times \exp(x/T)} }{ \sum{\exp(x/T)} }
208
+ xy_residual = (
209
+ x_exp @ self.hw_grid / x_exp.sum(dim=1)[:, None]
210
+ ) # Soft-argmax, Mx2
211
+
212
+ hw_grid_dist2 = (
213
+ torch.norm(
214
+ (self.hw_grid[None, :, :] - xy_residual[:, None, :])
215
+ / self.radius,
216
+ dim=-1,
217
+ )
218
+ ** 2
219
+ )
220
+ scoredispersity = (x_exp * hw_grid_dist2).sum(dim=1) / x_exp.sum(dim=1)
221
+
222
+ # compute result keypoints
223
+ keypoints_xy = keypoints_xy_nms + xy_residual
224
+ keypoints_xy = keypoints_xy / wh * 2 - 1 # (w,h) -> (-1~1,-1~1)
225
+
226
+ kptscore = torch.nn.functional.grid_sample(
227
+ scores_map[b_idx].unsqueeze(0),
228
+ keypoints_xy.view(1, 1, -1, 2),
229
+ mode="bilinear",
230
+ align_corners=True,
231
+ )[
232
+ 0, 0, 0, :
233
+ ] # CxN
234
+
235
+ keypoints.append(keypoints_xy)
236
+ scoredispersitys.append(scoredispersity)
237
+ kptscores.append(kptscore)
238
+ else:
239
+ for b_idx in range(b):
240
+ indices_kpt = indices_keypoints[
241
+ b_idx
242
+ ] # one dimension vector, say its size is M
243
+ # To avoid warning: UserWarning: __floordiv__ is deprecated
244
+ keypoints_xy_nms = torch.stack(
245
+ [indices_kpt % w, torch.div(indices_kpt, w, rounding_mode="trunc")],
246
+ dim=1,
247
+ ) # Mx2
248
+ keypoints_xy = keypoints_xy_nms / wh * 2 - 1 # (w,h) -> (-1~1,-1~1)
249
+ kptscore = torch.nn.functional.grid_sample(
250
+ scores_map[b_idx].unsqueeze(0),
251
+ keypoints_xy.view(1, 1, -1, 2),
252
+ mode="bilinear",
253
+ align_corners=True,
254
+ )[
255
+ 0, 0, 0, :
256
+ ] # CxN
257
+ keypoints.append(keypoints_xy)
258
+ scoredispersitys.append(kptscore) # for jit.script compatibility
259
+ kptscores.append(kptscore)
260
+
261
+ return keypoints, scoredispersitys, kptscores
262
+
263
+
264
+ class InputPadder(object):
265
+ """Pads images such that dimensions are divisible by 8"""
266
+
267
+ def __init__(self, h: int, w: int, divis_by: int = 8):
268
+ self.ht = h
269
+ self.wd = w
270
+ pad_ht = (((self.ht // divis_by) + 1) * divis_by - self.ht) % divis_by
271
+ pad_wd = (((self.wd // divis_by) + 1) * divis_by - self.wd) % divis_by
272
+ self._pad = [
273
+ pad_wd // 2,
274
+ pad_wd - pad_wd // 2,
275
+ pad_ht // 2,
276
+ pad_ht - pad_ht // 2,
277
+ ]
278
+
279
+ def pad(self, x: torch.Tensor):
280
+ assert x.ndim == 4
281
+ return F.pad(x, self._pad, mode="replicate")
282
+
283
+ def unpad(self, x: torch.Tensor):
284
+ assert x.ndim == 4
285
+ ht = x.shape[-2]
286
+ wd = x.shape[-1]
287
+ c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]]
288
+ return x[..., c[0] : c[1], c[2] : c[3]]
289
+
290
+
291
+ class DeformableConv2d(nn.Module):
292
+ def __init__(
293
+ self,
294
+ in_channels,
295
+ out_channels,
296
+ kernel_size=3,
297
+ stride=1,
298
+ padding=1,
299
+ bias=False,
300
+ mask=False,
301
+ ):
302
+ super(DeformableConv2d, self).__init__()
303
+
304
+ self.padding = padding
305
+ self.mask = mask
306
+
307
+ self.channel_num = (
308
+ 3 * kernel_size * kernel_size if mask else 2 * kernel_size * kernel_size
309
+ )
310
+ self.offset_conv = nn.Conv2d(
311
+ in_channels,
312
+ self.channel_num,
313
+ kernel_size=kernel_size,
314
+ stride=stride,
315
+ padding=self.padding,
316
+ bias=True,
317
+ )
318
+
319
+ self.regular_conv = nn.Conv2d(
320
+ in_channels=in_channels,
321
+ out_channels=out_channels,
322
+ kernel_size=kernel_size,
323
+ stride=stride,
324
+ padding=self.padding,
325
+ bias=bias,
326
+ )
327
+
328
+ def forward(self, x):
329
+ h, w = x.shape[2:]
330
+ max_offset = max(h, w) / 4.0
331
+
332
+ out = self.offset_conv(x)
333
+ if self.mask:
334
+ o1, o2, mask = torch.chunk(out, 3, dim=1)
335
+ offset = torch.cat((o1, o2), dim=1)
336
+ mask = torch.sigmoid(mask)
337
+ else:
338
+ offset = out
339
+ mask = None
340
+ offset = offset.clamp(-max_offset, max_offset)
341
+ x = torchvision.ops.deform_conv2d(
342
+ input=x,
343
+ offset=offset,
344
+ weight=self.regular_conv.weight,
345
+ bias=self.regular_conv.bias,
346
+ padding=self.padding,
347
+ mask=mask,
348
+ )
349
+ return x
350
+
351
+
352
+ def get_conv(
353
+ inplanes,
354
+ planes,
355
+ kernel_size=3,
356
+ stride=1,
357
+ padding=1,
358
+ bias=False,
359
+ conv_type="conv",
360
+ mask=False,
361
+ ):
362
+ if conv_type == "conv":
363
+ conv = nn.Conv2d(
364
+ inplanes,
365
+ planes,
366
+ kernel_size=kernel_size,
367
+ stride=stride,
368
+ padding=padding,
369
+ bias=bias,
370
+ )
371
+ elif conv_type == "dcn":
372
+ conv = DeformableConv2d(
373
+ inplanes,
374
+ planes,
375
+ kernel_size=kernel_size,
376
+ stride=stride,
377
+ padding=_pair(padding),
378
+ bias=bias,
379
+ mask=mask,
380
+ )
381
+ else:
382
+ raise TypeError
383
+ return conv
384
+
385
+
386
+ class ConvBlock(nn.Module):
387
+ def __init__(
388
+ self,
389
+ in_channels,
390
+ out_channels,
391
+ gate: Optional[Callable[..., nn.Module]] = None,
392
+ norm_layer: Optional[Callable[..., nn.Module]] = None,
393
+ conv_type: str = "conv",
394
+ mask: bool = False,
395
+ ):
396
+ super().__init__()
397
+ if gate is None:
398
+ self.gate = nn.ReLU(inplace=True)
399
+ else:
400
+ self.gate = gate
401
+ if norm_layer is None:
402
+ norm_layer = nn.BatchNorm2d
403
+ self.conv1 = get_conv(
404
+ in_channels, out_channels, kernel_size=3, conv_type=conv_type, mask=mask
405
+ )
406
+ self.bn1 = norm_layer(out_channels)
407
+ self.conv2 = get_conv(
408
+ out_channels, out_channels, kernel_size=3, conv_type=conv_type, mask=mask
409
+ )
410
+ self.bn2 = norm_layer(out_channels)
411
+
412
+ def forward(self, x):
413
+ x = self.gate(self.bn1(self.conv1(x))) # B x in_channels x H x W
414
+ x = self.gate(self.bn2(self.conv2(x))) # B x out_channels x H x W
415
+ return x
416
+
417
+
418
+ # modified based on torchvision\models\resnet.py#27->BasicBlock
419
+ class ResBlock(nn.Module):
420
+ expansion: int = 1
421
+
422
+ def __init__(
423
+ self,
424
+ inplanes: int,
425
+ planes: int,
426
+ stride: int = 1,
427
+ downsample: Optional[nn.Module] = None,
428
+ groups: int = 1,
429
+ base_width: int = 64,
430
+ dilation: int = 1,
431
+ gate: Optional[Callable[..., nn.Module]] = None,
432
+ norm_layer: Optional[Callable[..., nn.Module]] = None,
433
+ conv_type: str = "conv",
434
+ mask: bool = False,
435
+ ) -> None:
436
+ super(ResBlock, self).__init__()
437
+ if gate is None:
438
+ self.gate = nn.ReLU(inplace=True)
439
+ else:
440
+ self.gate = gate
441
+ if norm_layer is None:
442
+ norm_layer = nn.BatchNorm2d
443
+ if groups != 1 or base_width != 64:
444
+ raise ValueError("ResBlock only supports groups=1 and base_width=64")
445
+ if dilation > 1:
446
+ raise NotImplementedError("Dilation > 1 not supported in ResBlock")
447
+ # Both self.conv1 and self.downsample layers
448
+ # downsample the input when stride != 1
449
+ self.conv1 = get_conv(
450
+ inplanes, planes, kernel_size=3, conv_type=conv_type, mask=mask
451
+ )
452
+ self.bn1 = norm_layer(planes)
453
+ self.conv2 = get_conv(
454
+ planes, planes, kernel_size=3, conv_type=conv_type, mask=mask
455
+ )
456
+ self.bn2 = norm_layer(planes)
457
+ self.downsample = downsample
458
+ self.stride = stride
459
+
460
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
461
+ identity = x
462
+
463
+ out = self.conv1(x)
464
+ out = self.bn1(out)
465
+ out = self.gate(out)
466
+
467
+ out = self.conv2(out)
468
+ out = self.bn2(out)
469
+
470
+ if self.downsample is not None:
471
+ identity = self.downsample(x)
472
+
473
+ out += identity
474
+ out = self.gate(out)
475
+
476
+ return out
477
+
478
+
479
+ class SDDH(nn.Module):
480
+ def __init__(
481
+ self,
482
+ dims: int,
483
+ kernel_size: int = 3,
484
+ n_pos: int = 8,
485
+ gate=nn.ReLU(),
486
+ conv2D=False,
487
+ mask=False,
488
+ ):
489
+ super(SDDH, self).__init__()
490
+ self.kernel_size = kernel_size
491
+ self.n_pos = n_pos
492
+ self.conv2D = conv2D
493
+ self.mask = mask
494
+
495
+ self.get_patches_func = get_patches
496
+
497
+ # estimate offsets
498
+ self.channel_num = 3 * n_pos if mask else 2 * n_pos
499
+ self.offset_conv = nn.Sequential(
500
+ nn.Conv2d(
501
+ dims,
502
+ self.channel_num,
503
+ kernel_size=kernel_size,
504
+ stride=1,
505
+ padding=0,
506
+ bias=True,
507
+ ),
508
+ gate,
509
+ nn.Conv2d(
510
+ self.channel_num,
511
+ self.channel_num,
512
+ kernel_size=1,
513
+ stride=1,
514
+ padding=0,
515
+ bias=True,
516
+ ),
517
+ )
518
+
519
+ # sampled feature conv
520
+ self.sf_conv = nn.Conv2d(
521
+ dims, dims, kernel_size=1, stride=1, padding=0, bias=False
522
+ )
523
+
524
+ # convM
525
+ if not conv2D:
526
+ # deformable desc weights
527
+ agg_weights = torch.nn.Parameter(torch.rand(n_pos, dims, dims))
528
+ self.register_parameter("agg_weights", agg_weights)
529
+ else:
530
+ self.convM = nn.Conv2d(
531
+ dims * n_pos, dims, kernel_size=1, stride=1, padding=0, bias=False
532
+ )
533
+
534
+ def forward(self, x, keypoints):
535
+ # x: [B,C,H,W]
536
+ # keypoints: list, [[N_kpts,2], ...] (w,h)
537
+ b, c, h, w = x.shape
538
+ wh = torch.tensor([[w - 1, h - 1]], device=x.device)
539
+ max_offset = max(h, w) / 4.0
540
+
541
+ offsets = []
542
+ descriptors = []
543
+ # get offsets for each keypoint
544
+ for ib in range(b):
545
+ xi, kptsi = x[ib], keypoints[ib]
546
+ kptsi_wh = (kptsi / 2 + 0.5) * wh
547
+ N_kpts = len(kptsi)
548
+
549
+ if self.kernel_size > 1:
550
+ patch = self.get_patches_func(
551
+ xi, kptsi_wh.long(), self.kernel_size
552
+ ) # [N_kpts, C, K, K]
553
+ else:
554
+ kptsi_wh_long = kptsi_wh.long()
555
+ patch = (
556
+ xi[:, kptsi_wh_long[:, 1], kptsi_wh_long[:, 0]]
557
+ .permute(1, 0)
558
+ .reshape(N_kpts, c, 1, 1)
559
+ )
560
+
561
+ offset = self.offset_conv(patch).clamp(
562
+ -max_offset, max_offset
563
+ ) # [N_kpts, 2*n_pos, 1, 1]
564
+ if self.mask:
565
+ offset = (
566
+ offset[:, :, 0, 0].view(N_kpts, 3, self.n_pos).permute(0, 2, 1)
567
+ ) # [N_kpts, n_pos, 3]
568
+ offset = offset[:, :, :-1] # [N_kpts, n_pos, 2]
569
+ mask_weight = torch.sigmoid(offset[:, :, -1]) # [N_kpts, n_pos]
570
+ else:
571
+ offset = (
572
+ offset[:, :, 0, 0].view(N_kpts, 2, self.n_pos).permute(0, 2, 1)
573
+ ) # [N_kpts, n_pos, 2]
574
+ offsets.append(offset) # for visualization
575
+
576
+ # get sample positions
577
+ pos = kptsi_wh.unsqueeze(1) + offset # [N_kpts, n_pos, 2]
578
+ pos = 2.0 * pos / wh[None] - 1
579
+ pos = pos.reshape(1, N_kpts * self.n_pos, 1, 2)
580
+
581
+ # sample features
582
+ features = F.grid_sample(
583
+ xi.unsqueeze(0), pos, mode="bilinear", align_corners=True
584
+ ) # [1,C,(N_kpts*n_pos),1]
585
+ features = features.reshape(c, N_kpts, self.n_pos, 1).permute(
586
+ 1, 0, 2, 3
587
+ ) # [N_kpts, C, n_pos, 1]
588
+ if self.mask:
589
+ features = torch.einsum("ncpo,np->ncpo", features, mask_weight)
590
+
591
+ features = torch.selu_(self.sf_conv(features)).squeeze(
592
+ -1
593
+ ) # [N_kpts, C, n_pos]
594
+ # convM
595
+ if not self.conv2D:
596
+ descs = torch.einsum(
597
+ "ncp,pcd->nd", features, self.agg_weights
598
+ ) # [N_kpts, C]
599
+ else:
600
+ features = features.reshape(N_kpts, -1)[
601
+ :, :, None, None
602
+ ] # [N_kpts, C*n_pos, 1, 1]
603
+ descs = self.convM(features).squeeze() # [N_kpts, C]
604
+
605
+ # normalize
606
+ descs = F.normalize(descs, p=2.0, dim=1)
607
+ descriptors.append(descs)
608
+
609
+ return descriptors, offsets
610
+
611
+
612
+ class ALIKED(Extractor):
613
+ default_conf = {
614
+ "model_name": "aliked-n16",
615
+ "max_num_keypoints": -1,
616
+ "detection_threshold": 0.2,
617
+ "nms_radius": 2,
618
+ }
619
+
620
+ checkpoint_url = "https://github.com/Shiaoming/ALIKED/raw/main/models/{}.pth"
621
+
622
+ n_limit_max = 20000
623
+
624
+ # c1, c2, c3, c4, dim, K, M
625
+ cfgs = {
626
+ "aliked-t16": [8, 16, 32, 64, 64, 3, 16],
627
+ "aliked-n16": [16, 32, 64, 128, 128, 3, 16],
628
+ "aliked-n16rot": [16, 32, 64, 128, 128, 3, 16],
629
+ "aliked-n32": [16, 32, 64, 128, 128, 3, 32],
630
+ }
631
+ preprocess_conf = {
632
+ "resize": 1024,
633
+ }
634
+
635
+ required_data_keys = ["image"]
636
+
637
+ def __init__(self, **conf):
638
+ super().__init__(**conf) # Update with default configuration.
639
+ conf = self.conf
640
+ c1, c2, c3, c4, dim, K, M = self.cfgs[conf.model_name]
641
+ conv_types = ["conv", "conv", "dcn", "dcn"]
642
+ conv2D = False
643
+ mask = False
644
+
645
+ # build model
646
+ self.pool2 = nn.AvgPool2d(kernel_size=2, stride=2)
647
+ self.pool4 = nn.AvgPool2d(kernel_size=4, stride=4)
648
+ self.norm = nn.BatchNorm2d
649
+ self.gate = nn.SELU(inplace=True)
650
+ self.block1 = ConvBlock(3, c1, self.gate, self.norm, conv_type=conv_types[0])
651
+ self.block2 = self.get_resblock(c1, c2, conv_types[1], mask)
652
+ self.block3 = self.get_resblock(c2, c3, conv_types[2], mask)
653
+ self.block4 = self.get_resblock(c3, c4, conv_types[3], mask)
654
+
655
+ self.conv1 = resnet.conv1x1(c1, dim // 4)
656
+ self.conv2 = resnet.conv1x1(c2, dim // 4)
657
+ self.conv3 = resnet.conv1x1(c3, dim // 4)
658
+ self.conv4 = resnet.conv1x1(dim, dim // 4)
659
+ self.upsample2 = nn.Upsample(
660
+ scale_factor=2, mode="bilinear", align_corners=True
661
+ )
662
+ self.upsample4 = nn.Upsample(
663
+ scale_factor=4, mode="bilinear", align_corners=True
664
+ )
665
+ self.upsample8 = nn.Upsample(
666
+ scale_factor=8, mode="bilinear", align_corners=True
667
+ )
668
+ self.upsample32 = nn.Upsample(
669
+ scale_factor=32, mode="bilinear", align_corners=True
670
+ )
671
+ self.score_head = nn.Sequential(
672
+ resnet.conv1x1(dim, 8),
673
+ self.gate,
674
+ resnet.conv3x3(8, 4),
675
+ self.gate,
676
+ resnet.conv3x3(4, 4),
677
+ self.gate,
678
+ resnet.conv3x3(4, 1),
679
+ )
680
+ self.desc_head = SDDH(dim, K, M, gate=self.gate, conv2D=conv2D, mask=mask)
681
+ self.dkd = DKD(
682
+ radius=conf.nms_radius,
683
+ top_k=-1 if conf.detection_threshold > 0 else conf.max_num_keypoints,
684
+ scores_th=conf.detection_threshold,
685
+ n_limit=conf.max_num_keypoints
686
+ if conf.max_num_keypoints > 0
687
+ else self.n_limit_max,
688
+ )
689
+
690
+ state_dict = torch.hub.load_state_dict_from_url(
691
+ self.checkpoint_url.format(conf.model_name), map_location="cpu"
692
+ )
693
+ self.load_state_dict(state_dict, strict=True)
694
+
695
+ def get_resblock(self, c_in, c_out, conv_type, mask):
696
+ return ResBlock(
697
+ c_in,
698
+ c_out,
699
+ 1,
700
+ nn.Conv2d(c_in, c_out, 1),
701
+ gate=self.gate,
702
+ norm_layer=self.norm,
703
+ conv_type=conv_type,
704
+ mask=mask,
705
+ )
706
+
707
+ def extract_dense_map(self, image):
708
+ # Pad images so that height and width are divisible by div_by (2**5 = 32)
709
+ div_by = 2**5
710
+ padder = InputPadder(image.shape[-2], image.shape[-1], div_by)
711
+ image = padder.pad(image)
712
+
713
+ # ================================== feature encoder
714
+ x1 = self.block1(image) # B x c1 x H x W
715
+ x2 = self.pool2(x1)
716
+ x2 = self.block2(x2) # B x c2 x H/2 x W/2
717
+ x3 = self.pool4(x2)
718
+ x3 = self.block3(x3) # B x c3 x H/8 x W/8
719
+ x4 = self.pool4(x3)
720
+ x4 = self.block4(x4) # B x dim x H/32 x W/32
721
+ # ================================== feature aggregation
722
+ x1 = self.gate(self.conv1(x1)) # B x dim//4 x H x W
723
+ x2 = self.gate(self.conv2(x2)) # B x dim//4 x H//2 x W//2
724
+ x3 = self.gate(self.conv3(x3)) # B x dim//4 x H//8 x W//8
725
+ x4 = self.gate(self.conv4(x4)) # B x dim//4 x H//32 x W//32
726
+ x2_up = self.upsample2(x2) # B x dim//4 x H x W
727
+ x3_up = self.upsample8(x3) # B x dim//4 x H x W
728
+ x4_up = self.upsample32(x4) # B x dim//4 x H x W
729
+ x1234 = torch.cat([x1, x2_up, x3_up, x4_up], dim=1)
730
+ # ================================== score head
731
+ score_map = torch.sigmoid(self.score_head(x1234))
732
+ feature_map = torch.nn.functional.normalize(x1234, p=2, dim=1)
733
+
734
+ # Unpads images
735
+ feature_map = padder.unpad(feature_map)
736
+ score_map = padder.unpad(score_map)
737
+
738
+ return feature_map, score_map
739
+
740
+ def forward(self, data: dict) -> dict:
741
+ image = data["image"]
742
+ if image.shape[1] == 1:
743
+ image = grayscale_to_rgb(image)
744
+ feature_map, score_map = self.extract_dense_map(image)
745
+ keypoints, kptscores, scoredispersitys = self.dkd(
746
+ score_map, image_size=data.get("image_size")
747
+ )
748
+ descriptors, offsets = self.desc_head(feature_map, keypoints)
749
+
750
+ _, _, h, w = image.shape
751
+ wh = torch.tensor([w - 1, h - 1], device=image.device)
752
+ # no padding required
753
+ # we can set detection_threshold=-1 and conf.max_num_keypoints > 0
754
+ return {
755
+ "keypoints": wh * (torch.stack(keypoints) + 1) / 2.0, # B x N x 2
756
+ "descriptors": torch.stack(descriptors), # B x N x D
757
+ "keypoint_scores": torch.stack(kptscores), # B x N
758
+ }
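A minimal usage sketch for the ALIKED extractor added above, assuming the LightGlue/ directory is on the Python path and using a demo image shipped with this commit; the keypoint budget is illustrative and `load_image` / `Extractor.extract` are defined in lightglue/utils.py later in this diff:

from lightglue import ALIKED
from lightglue.utils import load_image

extractor = ALIKED(max_num_keypoints=2048).eval()        # illustrative keypoint budget
image = load_image("LightGlue/assets/sacre_coeur1.jpg")  # float tensor [3, H, W] in [0, 1]
feats = extractor.extract(image)                         # adds the batch dim and handles resizing
print(feats["keypoints"].shape)    # [1, N, 2], (x, y) pixel coordinates in the input image
print(feats["descriptors"].shape)  # [1, N, 128], L2-normalized SDDH descriptors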
LightGlue/lightglue/disk.py ADDED
@@ -0,0 +1,55 @@
1
+ import kornia
2
+ import torch
3
+
4
+ from .utils import Extractor
5
+
6
+
7
+ class DISK(Extractor):
8
+ default_conf = {
9
+ "weights": "depth",
10
+ "max_num_keypoints": None,
11
+ "desc_dim": 128,
12
+ "nms_window_size": 5,
13
+ "detection_threshold": 0.0,
14
+ "pad_if_not_divisible": True,
15
+ }
16
+
17
+ preprocess_conf = {
18
+ "resize": 1024,
19
+ "grayscale": False,
20
+ }
21
+
22
+ required_data_keys = ["image"]
23
+
24
+ def __init__(self, **conf) -> None:
25
+ super().__init__(**conf) # Update with default configuration.
26
+ self.model = kornia.feature.DISK.from_pretrained(self.conf.weights)
27
+
28
+ def forward(self, data: dict) -> dict:
29
+ """Compute keypoints, scores, descriptors for image"""
30
+ for key in self.required_data_keys:
31
+ assert key in data, f"Missing key {key} in data"
32
+ image = data["image"]
33
+ if image.shape[1] == 1:
34
+ image = kornia.color.grayscale_to_rgb(image)
35
+ features = self.model(
36
+ image,
37
+ n=self.conf.max_num_keypoints,
38
+ window_size=self.conf.nms_window_size,
39
+ score_threshold=self.conf.detection_threshold,
40
+ pad_if_not_divisible=self.conf.pad_if_not_divisible,
41
+ )
42
+ keypoints = [f.keypoints for f in features]
43
+ scores = [f.detection_scores for f in features]
44
+ descriptors = [f.descriptors for f in features]
45
+ del features
46
+
47
+ keypoints = torch.stack(keypoints, 0)
48
+ scores = torch.stack(scores, 0)
49
+ descriptors = torch.stack(descriptors, 0)
50
+
51
+ return {
52
+ "keypoints": keypoints.to(image).contiguous(),
53
+ "keypoint_scores": scores.to(image).contiguous(),
54
+ "descriptors": descriptors.to(image).contiguous(),
55
+ }
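DISK above is a thin wrapper around kornia's pretrained kornia.feature.DISK, exposed through the same Extractor.extract interface. A short sketch under the same path assumptions as above (keypoint budget illustrative):

from lightglue import DISK
from lightglue.utils import load_image

extractor = DISK(max_num_keypoints=2048).eval()
feats = extractor.extract(load_image("LightGlue/assets/sacre_coeur1.jpg"))
# DISK descriptors are 128-dimensional, matching the "disk" entry
# ("input_dim": 128) of LightGlue.features later in this commit.
assert feats["descriptors"].shape[-1] == 128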
LightGlue/lightglue/dog_hardnet.py ADDED
@@ -0,0 +1,41 @@
1
+ import torch
2
+ from kornia.color import rgb_to_grayscale
3
+ from kornia.feature import HardNet, LAFDescriptor, laf_from_center_scale_ori
4
+
5
+ from .sift import SIFT
6
+
7
+
8
+ class DoGHardNet(SIFT):
9
+ required_data_keys = ["image"]
10
+
11
+ def __init__(self, **conf):
12
+ super().__init__(**conf)
13
+ self.laf_desc = LAFDescriptor(HardNet(True)).eval()
14
+
15
+ def forward(self, data: dict) -> dict:
16
+ image = data["image"]
17
+ if image.shape[1] == 3:
18
+ image = rgb_to_grayscale(image)
19
+ device = image.device
20
+ self.laf_desc = self.laf_desc.to(device)
21
+ self.laf_desc.descriptor = self.laf_desc.descriptor.eval()
22
+ pred = []
23
+ if "image_size" in data.keys():
24
+ im_size = data.get("image_size").long()
25
+ else:
26
+ im_size = None
27
+ for k in range(len(image)):
28
+ img = image[k]
29
+ if im_size is not None:
30
+ w, h = data["image_size"][k]
31
+ img = img[:, : h.to(torch.int32), : w.to(torch.int32)]
32
+ p = self.extract_single_image(img)
33
+ lafs = laf_from_center_scale_ori(
34
+ p["keypoints"].reshape(1, -1, 2),
35
+ 6.0 * p["scales"].reshape(1, -1, 1, 1),
36
+ torch.rad2deg(p["oris"]).reshape(1, -1, 1),
37
+ ).to(device)
38
+ p["descriptors"] = self.laf_desc(img[None], lafs).reshape(-1, 128)
39
+ pred.append(p)
40
+ pred = {k: torch.stack([p[k] for p in pred], 0).to(device) for k in pred[0]}
41
+ return pred
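DoGHardNet above keeps the DoG (SIFT) detector from sift.py below and swaps in kornia's HardNet descriptor, so its output still carries the scales and oris fields that the "doghardnet" LightGlue configuration (add_scale_ori=True) expects. A sketch with the two demo photos from this commit (keypoint budget illustrative):

from lightglue import DoGHardNet, LightGlue
from lightglue.utils import load_image, rbd

extractor = DoGHardNet(max_num_keypoints=2048).eval()
matcher = LightGlue(features="doghardnet").eval()
feats0 = extractor.extract(load_image("LightGlue/assets/DSC_0410.JPG"))
feats1 = extractor.extract(load_image("LightGlue/assets/DSC_0411.JPG"))
out = rbd(matcher({"image0": feats0, "image1": feats1}))
print(out["matches"].shape)  # [K, 2] index pairs into the two keypoint sets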
LightGlue/lightglue/lightglue.py ADDED
@@ -0,0 +1,655 @@
1
+ import warnings
2
+ from pathlib import Path
3
+ from types import SimpleNamespace
4
+ from typing import Callable, List, Optional, Tuple
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from torch import nn
10
+
11
+ try:
12
+ from flash_attn.modules.mha import FlashCrossAttention
13
+ except ModuleNotFoundError:
14
+ FlashCrossAttention = None
15
+
16
+ if FlashCrossAttention or hasattr(F, "scaled_dot_product_attention"):
17
+ FLASH_AVAILABLE = True
18
+ else:
19
+ FLASH_AVAILABLE = False
20
+
21
+ torch.backends.cudnn.deterministic = True
22
+
23
+
24
+ @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
25
+ def normalize_keypoints(
26
+ kpts: torch.Tensor, size: Optional[torch.Tensor] = None
27
+ ) -> torch.Tensor:
28
+ if size is None:
29
+ size = 1 + kpts.max(-2).values - kpts.min(-2).values
30
+ elif not isinstance(size, torch.Tensor):
31
+ size = torch.tensor(size, device=kpts.device, dtype=kpts.dtype)
32
+ size = size.to(kpts)
33
+ shift = size / 2
34
+ scale = size.max(-1).values / 2
35
+ kpts = (kpts - shift[..., None, :]) / scale[..., None, None]
36
+ return kpts
37
+
38
+
39
+ def pad_to_length(x: torch.Tensor, length: int) -> Tuple[torch.Tensor]:
40
+ if length <= x.shape[-2]:
41
+ return x, torch.ones_like(x[..., :1], dtype=torch.bool)
42
+ pad = torch.ones(
43
+ *x.shape[:-2], length - x.shape[-2], x.shape[-1], device=x.device, dtype=x.dtype
44
+ )
45
+ y = torch.cat([x, pad], dim=-2)
46
+ mask = torch.zeros(*y.shape[:-1], 1, dtype=torch.bool, device=x.device)
47
+ mask[..., : x.shape[-2], :] = True
48
+ return y, mask
49
+
50
+
51
+ def rotate_half(x: torch.Tensor) -> torch.Tensor:
52
+ x = x.unflatten(-1, (-1, 2))
53
+ x1, x2 = x.unbind(dim=-1)
54
+ return torch.stack((-x2, x1), dim=-1).flatten(start_dim=-2)
55
+
56
+
57
+ def apply_cached_rotary_emb(freqs: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
58
+ return (t * freqs[0]) + (rotate_half(t) * freqs[1])
59
+
60
+
61
+ class LearnableFourierPositionalEncoding(nn.Module):
62
+ def __init__(self, M: int, dim: int, F_dim: int = None, gamma: float = 1.0) -> None:
63
+ super().__init__()
64
+ F_dim = F_dim if F_dim is not None else dim
65
+ self.gamma = gamma
66
+ self.Wr = nn.Linear(M, F_dim // 2, bias=False)
67
+ nn.init.normal_(self.Wr.weight.data, mean=0, std=self.gamma**-2)
68
+
69
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
70
+ """encode position vector"""
71
+ projected = self.Wr(x)
72
+ cosines, sines = torch.cos(projected), torch.sin(projected)
73
+ emb = torch.stack([cosines, sines], 0).unsqueeze(-3)
74
+ return emb.repeat_interleave(2, dim=-1)
75
+
76
+
77
+ class TokenConfidence(nn.Module):
78
+ def __init__(self, dim: int) -> None:
79
+ super().__init__()
80
+ self.token = nn.Sequential(nn.Linear(dim, 1), nn.Sigmoid())
81
+
82
+ def forward(self, desc0: torch.Tensor, desc1: torch.Tensor):
83
+ """get confidence tokens"""
84
+ return (
85
+ self.token(desc0.detach()).squeeze(-1),
86
+ self.token(desc1.detach()).squeeze(-1),
87
+ )
88
+
89
+
90
+ class Attention(nn.Module):
91
+ def __init__(self, allow_flash: bool) -> None:
92
+ super().__init__()
93
+ if allow_flash and not FLASH_AVAILABLE:
94
+ warnings.warn(
95
+ "FlashAttention is not available. For optimal speed, "
96
+ "consider installing torch >= 2.0 or flash-attn.",
97
+ stacklevel=2,
98
+ )
99
+ self.enable_flash = allow_flash and FLASH_AVAILABLE
100
+ self.has_sdp = hasattr(F, "scaled_dot_product_attention")
101
+ if allow_flash and FlashCrossAttention:
102
+ self.flash_ = FlashCrossAttention()
103
+ if self.has_sdp:
104
+ torch.backends.cuda.enable_flash_sdp(allow_flash)
105
+
106
+ def forward(self, q, k, v, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
107
+ if q.shape[-2] == 0 or k.shape[-2] == 0:
108
+ return q.new_zeros((*q.shape[:-1], v.shape[-1]))
109
+ if self.enable_flash and q.device.type == "cuda":
110
+ # use torch 2.0 scaled_dot_product_attention with flash
111
+ if self.has_sdp:
112
+ args = [x.half().contiguous() for x in [q, k, v]]
113
+ v = F.scaled_dot_product_attention(*args, attn_mask=mask).to(q.dtype)
114
+ return v if mask is None else v.nan_to_num()
115
+ else:
116
+ assert mask is None
117
+ q, k, v = [x.transpose(-2, -3).contiguous() for x in [q, k, v]]
118
+ m = self.flash_(q.half(), torch.stack([k, v], 2).half())
119
+ return m.transpose(-2, -3).to(q.dtype).clone()
120
+ elif self.has_sdp:
121
+ args = [x.contiguous() for x in [q, k, v]]
122
+ v = F.scaled_dot_product_attention(*args, attn_mask=mask)
123
+ return v if mask is None else v.nan_to_num()
124
+ else:
125
+ s = q.shape[-1] ** -0.5
126
+ sim = torch.einsum("...id,...jd->...ij", q, k) * s
127
+ if mask is not None:
128
+ sim = sim.masked_fill(~mask, -float("inf"))
129
+ attn = F.softmax(sim, -1)
130
+ return torch.einsum("...ij,...jd->...id", attn, v)
131
+
132
+
133
+ class SelfBlock(nn.Module):
134
+ def __init__(
135
+ self, embed_dim: int, num_heads: int, flash: bool = False, bias: bool = True
136
+ ) -> None:
137
+ super().__init__()
138
+ self.embed_dim = embed_dim
139
+ self.num_heads = num_heads
140
+ assert self.embed_dim % num_heads == 0
141
+ self.head_dim = self.embed_dim // num_heads
142
+ self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias)
143
+ self.inner_attn = Attention(flash)
144
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
145
+ self.ffn = nn.Sequential(
146
+ nn.Linear(2 * embed_dim, 2 * embed_dim),
147
+ nn.LayerNorm(2 * embed_dim, elementwise_affine=True),
148
+ nn.GELU(),
149
+ nn.Linear(2 * embed_dim, embed_dim),
150
+ )
151
+
152
+ def forward(
153
+ self,
154
+ x: torch.Tensor,
155
+ encoding: torch.Tensor,
156
+ mask: Optional[torch.Tensor] = None,
157
+ ) -> torch.Tensor:
158
+ qkv = self.Wqkv(x)
159
+ qkv = qkv.unflatten(-1, (self.num_heads, -1, 3)).transpose(1, 2)
160
+ q, k, v = qkv[..., 0], qkv[..., 1], qkv[..., 2]
161
+ q = apply_cached_rotary_emb(encoding, q)
162
+ k = apply_cached_rotary_emb(encoding, k)
163
+ context = self.inner_attn(q, k, v, mask=mask)
164
+ message = self.out_proj(context.transpose(1, 2).flatten(start_dim=-2))
165
+ return x + self.ffn(torch.cat([x, message], -1))
166
+
167
+
168
+ class CrossBlock(nn.Module):
169
+ def __init__(
170
+ self, embed_dim: int, num_heads: int, flash: bool = False, bias: bool = True
171
+ ) -> None:
172
+ super().__init__()
173
+ self.heads = num_heads
174
+ dim_head = embed_dim // num_heads
175
+ self.scale = dim_head**-0.5
176
+ inner_dim = dim_head * num_heads
177
+ self.to_qk = nn.Linear(embed_dim, inner_dim, bias=bias)
178
+ self.to_v = nn.Linear(embed_dim, inner_dim, bias=bias)
179
+ self.to_out = nn.Linear(inner_dim, embed_dim, bias=bias)
180
+ self.ffn = nn.Sequential(
181
+ nn.Linear(2 * embed_dim, 2 * embed_dim),
182
+ nn.LayerNorm(2 * embed_dim, elementwise_affine=True),
183
+ nn.GELU(),
184
+ nn.Linear(2 * embed_dim, embed_dim),
185
+ )
186
+ if flash and FLASH_AVAILABLE:
187
+ self.flash = Attention(True)
188
+ else:
189
+ self.flash = None
190
+
191
+ def map_(self, func: Callable, x0: torch.Tensor, x1: torch.Tensor):
192
+ return func(x0), func(x1)
193
+
194
+ def forward(
195
+ self, x0: torch.Tensor, x1: torch.Tensor, mask: Optional[torch.Tensor] = None
196
+ ) -> List[torch.Tensor]:
197
+ qk0, qk1 = self.map_(self.to_qk, x0, x1)
198
+ v0, v1 = self.map_(self.to_v, x0, x1)
199
+ qk0, qk1, v0, v1 = map(
200
+ lambda t: t.unflatten(-1, (self.heads, -1)).transpose(1, 2),
201
+ (qk0, qk1, v0, v1),
202
+ )
203
+ if self.flash is not None and qk0.device.type == "cuda":
204
+ m0 = self.flash(qk0, qk1, v1, mask)
205
+ m1 = self.flash(
206
+ qk1, qk0, v0, mask.transpose(-1, -2) if mask is not None else None
207
+ )
208
+ else:
209
+ qk0, qk1 = qk0 * self.scale**0.5, qk1 * self.scale**0.5
210
+ sim = torch.einsum("bhid, bhjd -> bhij", qk0, qk1)
211
+ if mask is not None:
212
+ sim = sim.masked_fill(~mask, -float("inf"))
213
+ attn01 = F.softmax(sim, dim=-1)
214
+ attn10 = F.softmax(sim.transpose(-2, -1).contiguous(), dim=-1)
215
+ m0 = torch.einsum("bhij, bhjd -> bhid", attn01, v1)
216
+ m1 = torch.einsum("bhji, bhjd -> bhid", attn10.transpose(-2, -1), v0)
217
+ if mask is not None:
218
+ m0, m1 = m0.nan_to_num(), m1.nan_to_num()
219
+ m0, m1 = self.map_(lambda t: t.transpose(1, 2).flatten(start_dim=-2), m0, m1)
220
+ m0, m1 = self.map_(self.to_out, m0, m1)
221
+ x0 = x0 + self.ffn(torch.cat([x0, m0], -1))
222
+ x1 = x1 + self.ffn(torch.cat([x1, m1], -1))
223
+ return x0, x1
224
+
225
+
226
+ class TransformerLayer(nn.Module):
227
+ def __init__(self, *args, **kwargs):
228
+ super().__init__()
229
+ self.self_attn = SelfBlock(*args, **kwargs)
230
+ self.cross_attn = CrossBlock(*args, **kwargs)
231
+
232
+ def forward(
233
+ self,
234
+ desc0,
235
+ desc1,
236
+ encoding0,
237
+ encoding1,
238
+ mask0: Optional[torch.Tensor] = None,
239
+ mask1: Optional[torch.Tensor] = None,
240
+ ):
241
+ if mask0 is not None and mask1 is not None:
242
+ return self.masked_forward(desc0, desc1, encoding0, encoding1, mask0, mask1)
243
+ else:
244
+ desc0 = self.self_attn(desc0, encoding0)
245
+ desc1 = self.self_attn(desc1, encoding1)
246
+ return self.cross_attn(desc0, desc1)
247
+
248
+ # This part is compiled and allows padding inputs
249
+ def masked_forward(self, desc0, desc1, encoding0, encoding1, mask0, mask1):
250
+ mask = mask0 & mask1.transpose(-1, -2)
251
+ mask0 = mask0 & mask0.transpose(-1, -2)
252
+ mask1 = mask1 & mask1.transpose(-1, -2)
253
+ desc0 = self.self_attn(desc0, encoding0, mask0)
254
+ desc1 = self.self_attn(desc1, encoding1, mask1)
255
+ return self.cross_attn(desc0, desc1, mask)
256
+
257
+
258
+ def sigmoid_log_double_softmax(
259
+ sim: torch.Tensor, z0: torch.Tensor, z1: torch.Tensor
260
+ ) -> torch.Tensor:
261
+ """create the log assignment matrix from logits and similarity"""
262
+ b, m, n = sim.shape
263
+ certainties = F.logsigmoid(z0) + F.logsigmoid(z1).transpose(1, 2)
264
+ scores0 = F.log_softmax(sim, 2)
265
+ scores1 = F.log_softmax(sim.transpose(-1, -2).contiguous(), 2).transpose(-1, -2)
266
+ scores = sim.new_full((b, m + 1, n + 1), 0)
267
+ scores[:, :m, :n] = scores0 + scores1 + certainties
268
+ scores[:, :-1, -1] = F.logsigmoid(-z0.squeeze(-1))
269
+ scores[:, -1, :-1] = F.logsigmoid(-z1.squeeze(-1))
270
+ return scores
271
+
272
+
273
+ class MatchAssignment(nn.Module):
274
+ def __init__(self, dim: int) -> None:
275
+ super().__init__()
276
+ self.dim = dim
277
+ self.matchability = nn.Linear(dim, 1, bias=True)
278
+ self.final_proj = nn.Linear(dim, dim, bias=True)
279
+
280
+ def forward(self, desc0: torch.Tensor, desc1: torch.Tensor):
281
+ """build assignment matrix from descriptors"""
282
+ mdesc0, mdesc1 = self.final_proj(desc0), self.final_proj(desc1)
283
+ _, _, d = mdesc0.shape
284
+ mdesc0, mdesc1 = mdesc0 / d**0.25, mdesc1 / d**0.25
285
+ sim = torch.einsum("bmd,bnd->bmn", mdesc0, mdesc1)
286
+ z0 = self.matchability(desc0)
287
+ z1 = self.matchability(desc1)
288
+ scores = sigmoid_log_double_softmax(sim, z0, z1)
289
+ return scores, sim
290
+
291
+ def get_matchability(self, desc: torch.Tensor):
292
+ return torch.sigmoid(self.matchability(desc)).squeeze(-1)
293
+
294
+
295
+ def filter_matches(scores: torch.Tensor, th: float):
296
+ """obtain matches from a log assignment matrix [Bx M+1 x N+1]"""
297
+ max0, max1 = scores[:, :-1, :-1].max(2), scores[:, :-1, :-1].max(1)
298
+ m0, m1 = max0.indices, max1.indices
299
+ indices0 = torch.arange(m0.shape[1], device=m0.device)[None]
300
+ indices1 = torch.arange(m1.shape[1], device=m1.device)[None]
301
+ mutual0 = indices0 == m1.gather(1, m0)
302
+ mutual1 = indices1 == m0.gather(1, m1)
303
+ max0_exp = max0.values.exp()
304
+ zero = max0_exp.new_tensor(0)
305
+ mscores0 = torch.where(mutual0, max0_exp, zero)
306
+ mscores1 = torch.where(mutual1, mscores0.gather(1, m1), zero)
307
+ valid0 = mutual0 & (mscores0 > th)
308
+ valid1 = mutual1 & valid0.gather(1, m1)
309
+ m0 = torch.where(valid0, m0, -1)
310
+ m1 = torch.where(valid1, m1, -1)
311
+ return m0, m1, mscores0, mscores1
312
+
313
+
314
+ class LightGlue(nn.Module):
315
+ default_conf = {
316
+ "name": "lightglue", # just for interfacing
317
+ "input_dim": 256, # input descriptor dimension (autoselected from weights)
318
+ "descriptor_dim": 256,
319
+ "add_scale_ori": False,
320
+ "n_layers": 9,
321
+ "num_heads": 4,
322
+ "flash": True, # enable FlashAttention if available.
323
+ "mp": False, # enable mixed precision
324
+ "depth_confidence": 0.95, # early stopping, disable with -1
325
+ "width_confidence": 0.99, # point pruning, disable with -1
326
+ "filter_threshold": 0.1, # match threshold
327
+ "weights": None,
328
+ }
329
+
330
+ # Point pruning involves an overhead (gather).
331
+ # Therefore, we only activate it if there are enough keypoints.
332
+ pruning_keypoint_thresholds = {
333
+ "cpu": -1,
334
+ "mps": -1,
335
+ "cuda": 1024,
336
+ "flash": 1536,
337
+ }
338
+
339
+ required_data_keys = ["image0", "image1"]
340
+
341
+ version = "v0.1_arxiv"
342
+ url = "https://github.com/cvg/LightGlue/releases/download/{}/{}_lightglue.pth"
343
+
344
+ features = {
345
+ "superpoint": {
346
+ "weights": "superpoint_lightglue",
347
+ "input_dim": 256,
348
+ },
349
+ "disk": {
350
+ "weights": "disk_lightglue",
351
+ "input_dim": 128,
352
+ },
353
+ "aliked": {
354
+ "weights": "aliked_lightglue",
355
+ "input_dim": 128,
356
+ },
357
+ "sift": {
358
+ "weights": "sift_lightglue",
359
+ "input_dim": 128,
360
+ "add_scale_ori": True,
361
+ },
362
+ "doghardnet": {
363
+ "weights": "doghardnet_lightglue",
364
+ "input_dim": 128,
365
+ "add_scale_ori": True,
366
+ },
367
+ }
368
+
369
+ def __init__(self, features="superpoint", **conf) -> None:
370
+ super().__init__()
371
+ self.conf = conf = SimpleNamespace(**{**self.default_conf, **conf})
372
+ if features is not None:
373
+ if features not in self.features:
374
+ raise ValueError(
375
+ f"Unsupported features: {features} not in "
376
+ f"{{{','.join(self.features)}}}"
377
+ )
378
+ for k, v in self.features[features].items():
379
+ setattr(conf, k, v)
380
+
381
+ if conf.input_dim != conf.descriptor_dim:
382
+ self.input_proj = nn.Linear(conf.input_dim, conf.descriptor_dim, bias=True)
383
+ else:
384
+ self.input_proj = nn.Identity()
385
+
386
+ head_dim = conf.descriptor_dim // conf.num_heads
387
+ self.posenc = LearnableFourierPositionalEncoding(
388
+ 2 + 2 * self.conf.add_scale_ori, head_dim, head_dim
389
+ )
390
+
391
+ h, n, d = conf.num_heads, conf.n_layers, conf.descriptor_dim
392
+
393
+ self.transformers = nn.ModuleList(
394
+ [TransformerLayer(d, h, conf.flash) for _ in range(n)]
395
+ )
396
+
397
+ self.log_assignment = nn.ModuleList([MatchAssignment(d) for _ in range(n)])
398
+ self.token_confidence = nn.ModuleList(
399
+ [TokenConfidence(d) for _ in range(n - 1)]
400
+ )
401
+ self.register_buffer(
402
+ "confidence_thresholds",
403
+ torch.Tensor(
404
+ [self.confidence_threshold(i) for i in range(self.conf.n_layers)]
405
+ ),
406
+ )
407
+
408
+ state_dict = None
409
+ if features is not None:
410
+ fname = f"{conf.weights}_{self.version.replace('.', '-')}.pth"
411
+ state_dict = torch.hub.load_state_dict_from_url(
412
+ self.url.format(self.version, features), model_dir="./LightGlue/ckpts", file_name=f"{features}_lightglue.pth"
413
+ )
414
+ self.load_state_dict(state_dict, strict=False)
415
+ elif conf.weights is not None:
416
+ path = Path(__file__).parent
417
+ path = path / "weights/{}.pth".format(self.conf.weights)
418
+ state_dict = torch.load(str(path), map_location="cpu")
419
+
420
+ if state_dict:
421
+ # rename old state dict entries
422
+ for i in range(self.conf.n_layers):
423
+ pattern = f"self_attn.{i}", f"transformers.{i}.self_attn"
424
+ state_dict = {k.replace(*pattern): v for k, v in state_dict.items()}
425
+ pattern = f"cross_attn.{i}", f"transformers.{i}.cross_attn"
426
+ state_dict = {k.replace(*pattern): v for k, v in state_dict.items()}
427
+ self.load_state_dict(state_dict, strict=False)
428
+
429
+ # static lengths LightGlue is compiled for (only used with torch.compile)
430
+ self.static_lengths = None
431
+
432
+ def compile(
433
+ self, mode="reduce-overhead", static_lengths=[256, 512, 768, 1024, 1280, 1536]
434
+ ):
435
+ if self.conf.width_confidence != -1:
436
+ warnings.warn(
437
+ "Point pruning is partially disabled for compiled forward.",
438
+ stacklevel=2,
439
+ )
440
+
441
+ torch._inductor.cudagraph_mark_step_begin()
442
+ for i in range(self.conf.n_layers):
443
+ self.transformers[i].masked_forward = torch.compile(
444
+ self.transformers[i].masked_forward, mode=mode, fullgraph=True
445
+ )
446
+
447
+ self.static_lengths = static_lengths
448
+
449
+ def forward(self, data: dict) -> dict:
450
+ """
451
+ Match keypoints and descriptors between two images
452
+
453
+ Input (dict):
454
+ image0: dict
455
+ keypoints: [B x M x 2]
456
+ descriptors: [B x M x D]
457
+ image: [B x C x H x W] or image_size: [B x 2]
458
+ image1: dict
459
+ keypoints: [B x N x 2]
460
+ descriptors: [B x N x D]
461
+ image: [B x C x H x W] or image_size: [B x 2]
462
+ Output (dict):
463
+ matches0: [B x M]
464
+ matching_scores0: [B x M]
465
+ matches1: [B x N]
466
+ matching_scores1: [B x N]
467
+ matches: List[[Si x 2]]
468
+ scores: List[[Si]]
469
+ stop: int
470
+ prune0: [B x M]
471
+ prune1: [B x N]
472
+ """
473
+ with torch.autocast(enabled=self.conf.mp, device_type="cuda"):
474
+ return self._forward(data)
475
+
476
+ def _forward(self, data: dict) -> dict:
477
+ for key in self.required_data_keys:
478
+ assert key in data, f"Missing key {key} in data"
479
+ data0, data1 = data["image0"], data["image1"]
480
+ kpts0, kpts1 = data0["keypoints"], data1["keypoints"]
481
+ b, m, _ = kpts0.shape
482
+ b, n, _ = kpts1.shape
483
+ device = kpts0.device
484
+ size0, size1 = data0.get("image_size"), data1.get("image_size")
485
+ kpts0 = normalize_keypoints(kpts0, size0).clone()
486
+ kpts1 = normalize_keypoints(kpts1, size1).clone()
487
+
488
+ if self.conf.add_scale_ori:
489
+ kpts0 = torch.cat(
490
+ [kpts0] + [data0[k].unsqueeze(-1) for k in ("scales", "oris")], -1
491
+ )
492
+ kpts1 = torch.cat(
493
+ [kpts1] + [data1[k].unsqueeze(-1) for k in ("scales", "oris")], -1
494
+ )
495
+ desc0 = data0["descriptors"].detach().contiguous()
496
+ desc1 = data1["descriptors"].detach().contiguous()
497
+
498
+ assert desc0.shape[-1] == self.conf.input_dim
499
+ assert desc1.shape[-1] == self.conf.input_dim
500
+
501
+ if torch.is_autocast_enabled():
502
+ desc0 = desc0.half()
503
+ desc1 = desc1.half()
504
+
505
+ mask0, mask1 = None, None
506
+ c = max(m, n)
507
+ do_compile = self.static_lengths and c <= max(self.static_lengths)
508
+ if do_compile:
509
+ kn = min([k for k in self.static_lengths if k >= c])
510
+ desc0, mask0 = pad_to_length(desc0, kn)
511
+ desc1, mask1 = pad_to_length(desc1, kn)
512
+ kpts0, _ = pad_to_length(kpts0, kn)
513
+ kpts1, _ = pad_to_length(kpts1, kn)
514
+ desc0 = self.input_proj(desc0)
515
+ desc1 = self.input_proj(desc1)
516
+ # cache positional embeddings
517
+ encoding0 = self.posenc(kpts0)
518
+ encoding1 = self.posenc(kpts1)
519
+
520
+ # GNN + final_proj + assignment
521
+ do_early_stop = self.conf.depth_confidence > 0
522
+ do_point_pruning = self.conf.width_confidence > 0 and not do_compile
523
+ pruning_th = self.pruning_min_kpts(device)
524
+ if do_point_pruning:
525
+ ind0 = torch.arange(0, m, device=device)[None]
526
+ ind1 = torch.arange(0, n, device=device)[None]
527
+ # We store the index of the layer at which pruning is detected.
528
+ prune0 = torch.ones_like(ind0)
529
+ prune1 = torch.ones_like(ind1)
530
+ token0, token1 = None, None
531
+ for i in range(self.conf.n_layers):
532
+ if desc0.shape[1] == 0 or desc1.shape[1] == 0: # no keypoints
533
+ break
534
+ desc0, desc1 = self.transformers[i](
535
+ desc0, desc1, encoding0, encoding1, mask0=mask0, mask1=mask1
536
+ )
537
+ if i == self.conf.n_layers - 1:
538
+ continue # no early stopping or adaptive width at last layer
539
+
540
+ if do_early_stop:
541
+ token0, token1 = self.token_confidence[i](desc0, desc1)
542
+ if self.check_if_stop(token0[..., :m], token1[..., :n], i, m + n):
543
+ break
544
+ if do_point_pruning and desc0.shape[-2] > pruning_th:
545
+ scores0 = self.log_assignment[i].get_matchability(desc0)
546
+ prunemask0 = self.get_pruning_mask(token0, scores0, i)
547
+ keep0 = torch.where(prunemask0)[1]
548
+ ind0 = ind0.index_select(1, keep0)
549
+ desc0 = desc0.index_select(1, keep0)
550
+ encoding0 = encoding0.index_select(-2, keep0)
551
+ prune0[:, ind0] += 1
552
+ if do_point_pruning and desc1.shape[-2] > pruning_th:
553
+ scores1 = self.log_assignment[i].get_matchability(desc1)
554
+ prunemask1 = self.get_pruning_mask(token1, scores1, i)
555
+ keep1 = torch.where(prunemask1)[1]
556
+ ind1 = ind1.index_select(1, keep1)
557
+ desc1 = desc1.index_select(1, keep1)
558
+ encoding1 = encoding1.index_select(-2, keep1)
559
+ prune1[:, ind1] += 1
560
+
561
+ if desc0.shape[1] == 0 or desc1.shape[1] == 0: # no keypoints
562
+ m0 = desc0.new_full((b, m), -1, dtype=torch.long)
563
+ m1 = desc1.new_full((b, n), -1, dtype=torch.long)
564
+ mscores0 = desc0.new_zeros((b, m))
565
+ mscores1 = desc1.new_zeros((b, n))
566
+ matches = desc0.new_empty((b, 0, 2), dtype=torch.long)
567
+ mscores = desc0.new_empty((b, 0))
568
+ if not do_point_pruning:
569
+ prune0 = torch.ones_like(mscores0) * self.conf.n_layers
570
+ prune1 = torch.ones_like(mscores1) * self.conf.n_layers
571
+ return {
572
+ "matches0": m0,
573
+ "matches1": m1,
574
+ "matching_scores0": mscores0,
575
+ "matching_scores1": mscores1,
576
+ "stop": i + 1,
577
+ "matches": matches,
578
+ "scores": mscores,
579
+ "prune0": prune0,
580
+ "prune1": prune1,
581
+ }
582
+
583
+ desc0, desc1 = desc0[..., :m, :], desc1[..., :n, :] # remove padding
584
+ scores, _ = self.log_assignment[i](desc0, desc1)
585
+ m0, m1, mscores0, mscores1 = filter_matches(scores, self.conf.filter_threshold)
586
+ matches, mscores = [], []
587
+ for k in range(b):
588
+ valid = m0[k] > -1
589
+ m_indices_0 = torch.where(valid)[0]
590
+ m_indices_1 = m0[k][valid]
591
+ if do_point_pruning:
592
+ m_indices_0 = ind0[k, m_indices_0]
593
+ m_indices_1 = ind1[k, m_indices_1]
594
+ matches.append(torch.stack([m_indices_0, m_indices_1], -1))
595
+ mscores.append(mscores0[k][valid])
596
+
597
+ # TODO: Remove when hloc switches to the compact format.
598
+ if do_point_pruning:
599
+ m0_ = torch.full((b, m), -1, device=m0.device, dtype=m0.dtype)
600
+ m1_ = torch.full((b, n), -1, device=m1.device, dtype=m1.dtype)
601
+ m0_[:, ind0] = torch.where(m0 == -1, -1, ind1.gather(1, m0.clamp(min=0)))
602
+ m1_[:, ind1] = torch.where(m1 == -1, -1, ind0.gather(1, m1.clamp(min=0)))
603
+ mscores0_ = torch.zeros((b, m), device=mscores0.device)
604
+ mscores1_ = torch.zeros((b, n), device=mscores1.device)
605
+ mscores0_[:, ind0] = mscores0
606
+ mscores1_[:, ind1] = mscores1
607
+ m0, m1, mscores0, mscores1 = m0_, m1_, mscores0_, mscores1_
608
+ else:
609
+ prune0 = torch.ones_like(mscores0) * self.conf.n_layers
610
+ prune1 = torch.ones_like(mscores1) * self.conf.n_layers
611
+
612
+ return {
613
+ "matches0": m0,
614
+ "matches1": m1,
615
+ "matching_scores0": mscores0,
616
+ "matching_scores1": mscores1,
617
+ "stop": i + 1,
618
+ "matches": matches,
619
+ "scores": mscores,
620
+ "prune0": prune0,
621
+ "prune1": prune1,
622
+ }
623
+
624
+ def confidence_threshold(self, layer_index: int) -> float:
625
+ """scaled confidence threshold"""
626
+ threshold = 0.8 + 0.1 * np.exp(-4.0 * layer_index / self.conf.n_layers)
627
+ return np.clip(threshold, 0, 1)
628
+
629
+ def get_pruning_mask(
630
+ self, confidences: torch.Tensor, scores: torch.Tensor, layer_index: int
631
+ ) -> torch.Tensor:
632
+ """mask points which should be removed"""
633
+ keep = scores > (1 - self.conf.width_confidence)
634
+ if confidences is not None: # Low-confidence points are never pruned.
635
+ keep |= confidences <= self.confidence_thresholds[layer_index]
636
+ return keep
637
+
638
+ def check_if_stop(
639
+ self,
640
+ confidences0: torch.Tensor,
641
+ confidences1: torch.Tensor,
642
+ layer_index: int,
643
+ num_points: int,
644
+ ) -> torch.Tensor:
645
+ """evaluate stopping condition"""
646
+ confidences = torch.cat([confidences0, confidences1], -1)
647
+ threshold = self.confidence_thresholds[layer_index]
648
+ ratio_confident = 1.0 - (confidences < threshold).float().sum() / num_points
649
+ return ratio_confident > self.conf.depth_confidence
650
+
651
+ def pruning_min_kpts(self, device: torch.device):
652
+ if self.conf.flash and FLASH_AVAILABLE and device.type == "cuda":
653
+ return self.pruning_keypoint_thresholds["flash"]
654
+ else:
655
+ return self.pruning_keypoint_thresholds[device.type]
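An end-to-end matching sketch for the LightGlue module above, mirroring the two-image pipeline described in the forward() docstring. It assumes the LightGlue/ directory is on the Python path; the keypoint budget and device choice are illustrative, and rbd (defined in utils.py below) strips the batch dimension:

import torch
from lightglue import LightGlue, SuperPoint
from lightglue.utils import load_image, rbd

device = "cuda" if torch.cuda.is_available() else "cpu"
extractor = SuperPoint(max_num_keypoints=2048).eval().to(device)
matcher = LightGlue(features="superpoint").eval().to(device)

image0 = load_image("LightGlue/assets/sacre_coeur1.jpg").to(device)
image1 = load_image("LightGlue/assets/sacre_coeur2.jpg").to(device)

feats0 = extractor.extract(image0)
feats1 = extractor.extract(image1)
matches01 = matcher({"image0": feats0, "image1": feats1})
feats0, feats1, matches01 = [rbd(x) for x in (feats0, feats1, matches01)]

matches = matches01["matches"]                  # [K, 2] index pairs
points0 = feats0["keypoints"][matches[..., 0]]  # [K, 2] pixel coords in image0
points1 = feats1["keypoints"][matches[..., 1]]  # [K, 2] pixel coords in image1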
LightGlue/lightglue/sift.py ADDED
@@ -0,0 +1,216 @@
1
+ import warnings
2
+
3
+ import cv2
4
+ import numpy as np
5
+ import torch
6
+ from kornia.color import rgb_to_grayscale
7
+ from packaging import version
8
+
9
+ try:
10
+ import pycolmap
11
+ except ImportError:
12
+ pycolmap = None
13
+
14
+ from .utils import Extractor
15
+
16
+
17
+ def filter_dog_point(points, scales, angles, image_shape, nms_radius, scores=None):
18
+ h, w = image_shape
19
+ ij = np.round(points - 0.5).astype(int).T[::-1]
20
+
21
+ # Remove duplicate points (identical coordinates).
22
+ # Pick highest scale or score
23
+ s = scales if scores is None else scores
24
+ buffer = np.zeros((h, w))
25
+ np.maximum.at(buffer, tuple(ij), s)
26
+ keep = np.where(buffer[tuple(ij)] == s)[0]
27
+
28
+ # Pick lowest angle (arbitrary).
29
+ ij = ij[:, keep]
30
+ buffer[:] = np.inf
31
+ o_abs = np.abs(angles[keep])
32
+ np.minimum.at(buffer, tuple(ij), o_abs)
33
+ mask = buffer[tuple(ij)] == o_abs
34
+ ij = ij[:, mask]
35
+ keep = keep[mask]
36
+
37
+ if nms_radius > 0:
38
+ # Apply NMS on the remaining points
39
+ buffer[:] = 0
40
+ buffer[tuple(ij)] = s[keep] # scores or scale
41
+
42
+ local_max = torch.nn.functional.max_pool2d(
43
+ torch.from_numpy(buffer).unsqueeze(0),
44
+ kernel_size=nms_radius * 2 + 1,
45
+ stride=1,
46
+ padding=nms_radius,
47
+ ).squeeze(0)
48
+ is_local_max = buffer == local_max.numpy()
49
+ keep = keep[is_local_max[tuple(ij)]]
50
+ return keep
51
+
52
+
53
+ def sift_to_rootsift(x: torch.Tensor, eps=1e-6) -> torch.Tensor:
54
+ x = torch.nn.functional.normalize(x, p=1, dim=-1, eps=eps)
55
+ x.clip_(min=eps).sqrt_()
56
+ return torch.nn.functional.normalize(x, p=2, dim=-1, eps=eps)
57
+
58
+
59
+ def run_opencv_sift(features: cv2.Feature2D, image: np.ndarray) -> np.ndarray:
60
+ """
61
+ Detect keypoints using OpenCV Detector.
62
+ Optionally, perform description.
63
+ Args:
64
+ features: OpenCV based keypoints detector and descriptor
65
+ image: Grayscale image of uint8 data type
66
+ Returns:
67
+ keypoints: 1D array of detected cv2.KeyPoint
68
+ scores: 1D array of responses
69
+ descriptors: 1D array of descriptors
70
+ """
71
+ detections, descriptors = features.detectAndCompute(image, None)
72
+ points = np.array([k.pt for k in detections], dtype=np.float32)
73
+ scores = np.array([k.response for k in detections], dtype=np.float32)
74
+ scales = np.array([k.size for k in detections], dtype=np.float32)
75
+ angles = np.deg2rad(np.array([k.angle for k in detections], dtype=np.float32))
76
+ return points, scores, scales, angles, descriptors
77
+
78
+
79
+ class SIFT(Extractor):
80
+ default_conf = {
81
+ "rootsift": True,
82
+ "nms_radius": 0, # None to disable filtering entirely.
83
+ "max_num_keypoints": 4096,
84
+ "backend": "opencv", # in {opencv, pycolmap, pycolmap_cpu, pycolmap_cuda}
85
+ "detection_threshold": 0.0066667, # from COLMAP
86
+ "edge_threshold": 10,
87
+ "first_octave": -1, # only used by pycolmap, the default of COLMAP
88
+ "num_octaves": 4,
89
+ }
90
+
91
+ preprocess_conf = {
92
+ "resize": 1024,
93
+ }
94
+
95
+ required_data_keys = ["image"]
96
+
97
+ def __init__(self, **conf):
98
+ super().__init__(**conf) # Update with default configuration.
99
+ backend = self.conf.backend
100
+ if backend.startswith("pycolmap"):
101
+ if pycolmap is None:
102
+ raise ImportError(
103
+ "Cannot find module pycolmap: install it with pip"
104
+ "or use backend=opencv."
105
+ )
106
+ options = {
107
+ "peak_threshold": self.conf.detection_threshold,
108
+ "edge_threshold": self.conf.edge_threshold,
109
+ "first_octave": self.conf.first_octave,
110
+ "num_octaves": self.conf.num_octaves,
111
+ "normalization": pycolmap.Normalization.L2, # L1_ROOT is buggy.
112
+ }
113
+ device = (
114
+ "auto" if backend == "pycolmap" else backend.replace("pycolmap_", "")
115
+ )
116
+ if (
117
+ backend == "pycolmap_cpu" or not pycolmap.has_cuda
118
+ ) and pycolmap.__version__ < "0.5.0":
119
+ warnings.warn(
120
+ "The pycolmap CPU SIFT is buggy in version < 0.5.0, "
121
+ "consider upgrading pycolmap or use the CUDA version.",
122
+ stacklevel=1,
123
+ )
124
+ else:
125
+ options["max_num_features"] = self.conf.max_num_keypoints
126
+ self.sift = pycolmap.Sift(options=options, device=device)
127
+ elif backend == "opencv":
128
+ self.sift = cv2.SIFT_create(
129
+ contrastThreshold=self.conf.detection_threshold,
130
+ nfeatures=self.conf.max_num_keypoints,
131
+ edgeThreshold=self.conf.edge_threshold,
132
+ nOctaveLayers=self.conf.num_octaves,
133
+ )
134
+ else:
135
+ backends = {"opencv", "pycolmap", "pycolmap_cpu", "pycolmap_cuda"}
136
+ raise ValueError(
137
+ f"Unknown backend: {backend} not in " f"{{{','.join(backends)}}}."
138
+ )
139
+
140
+ def extract_single_image(self, image: torch.Tensor):
141
+ image_np = image.cpu().numpy().squeeze(0)
142
+
143
+ if self.conf.backend.startswith("pycolmap"):
144
+ if version.parse(pycolmap.__version__) >= version.parse("0.5.0"):
145
+ detections, descriptors = self.sift.extract(image_np)
146
+ scores = None # Scores are not exposed by COLMAP anymore.
147
+ else:
148
+ detections, scores, descriptors = self.sift.extract(image_np)
149
+ keypoints = detections[:, :2] # Keep only (x, y).
150
+ scales, angles = detections[:, -2:].T
151
+ if scores is not None and (
152
+ self.conf.backend == "pycolmap_cpu" or not pycolmap.has_cuda
153
+ ):
154
+ # Set the scores as a combination of abs. response and scale.
155
+ scores = np.abs(scores) * scales
156
+ elif self.conf.backend == "opencv":
157
+ # TODO: Check if opencv keypoints are already in corner convention
158
+ keypoints, scores, scales, angles, descriptors = run_opencv_sift(
159
+ self.sift, (image_np * 255.0).astype(np.uint8)
160
+ )
161
+ pred = {
162
+ "keypoints": keypoints,
163
+ "scales": scales,
164
+ "oris": angles,
165
+ "descriptors": descriptors,
166
+ }
167
+ if scores is not None:
168
+ pred["keypoint_scores"] = scores
169
+
170
+ # sometimes pycolmap returns points outside the image. We remove them
171
+ if self.conf.backend.startswith("pycolmap"):
172
+ is_inside = (
173
+ pred["keypoints"] + 0.5 < np.array([image_np.shape[-2:][::-1]])
174
+ ).all(-1)
175
+ pred = {k: v[is_inside] for k, v in pred.items()}
176
+
177
+ if self.conf.nms_radius is not None:
178
+ keep = filter_dog_point(
179
+ pred["keypoints"],
180
+ pred["scales"],
181
+ pred["oris"],
182
+ image_np.shape,
183
+ self.conf.nms_radius,
184
+ scores=pred.get("keypoint_scores"),
185
+ )
186
+ pred = {k: v[keep] for k, v in pred.items()}
187
+
188
+ pred = {k: torch.from_numpy(v) for k, v in pred.items()}
189
+ if scores is not None:
190
+ # Keep the k keypoints with highest score
191
+ num_points = self.conf.max_num_keypoints
192
+ if num_points is not None and len(pred["keypoints"]) > num_points:
193
+ indices = torch.topk(pred["keypoint_scores"], num_points).indices
194
+ pred = {k: v[indices] for k, v in pred.items()}
195
+
196
+ return pred
197
+
198
+ def forward(self, data: dict) -> dict:
199
+ image = data["image"]
200
+ if image.shape[1] == 3:
201
+ image = rgb_to_grayscale(image)
202
+ device = image.device
203
+ image = image.cpu()
204
+ pred = []
205
+ for k in range(len(image)):
206
+ img = image[k]
207
+ if "image_size" in data.keys():
208
+ # avoid extracting points in padded areas
209
+ w, h = data["image_size"][k]
210
+ img = img[:, :h, :w]
211
+ p = self.extract_single_image(img)
212
+ pred.append(p)
213
+ pred = {k: torch.stack([p[k] for p in pred], 0).to(device) for k in pred[0]}
214
+ if self.conf.rootsift:
215
+ pred["descriptors"] = sift_to_rootsift(pred["descriptors"])
216
+ return pred
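sift_to_rootsift above implements the standard RootSIFT transform: L1-normalize, take an element-wise square root, then L2-normalize. A tiny numerical sketch with random stand-in descriptors:

import torch
import torch.nn.functional as F

desc = torch.rand(4, 128)                                     # stand-in SIFT descriptors
root = F.normalize(desc, p=1, dim=-1).clamp(min=1e-6).sqrt()  # L1-normalize, then sqrt
root = F.normalize(root, p=2, dim=-1)                         # final L2 normalization
# RootSIFT descriptors are unit length, so their dot product equals the
# Hellinger kernel of the L1-normalized inputs.
assert torch.allclose(root.norm(dim=-1), torch.ones(4), atol=1e-5)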
LightGlue/lightglue/superpoint.py ADDED
@@ -0,0 +1,227 @@
1
+ # %BANNER_BEGIN%
2
+ # ---------------------------------------------------------------------
3
+ # %COPYRIGHT_BEGIN%
4
+ #
5
+ # Magic Leap, Inc. ("COMPANY") CONFIDENTIAL
6
+ #
7
+ # Unpublished Copyright (c) 2020
8
+ # Magic Leap, Inc., All Rights Reserved.
9
+ #
10
+ # NOTICE: All information contained herein is, and remains the property
11
+ # of COMPANY. The intellectual and technical concepts contained herein
12
+ # are proprietary to COMPANY and may be covered by U.S. and Foreign
13
+ # Patents, patents in process, and are protected by trade secret or
14
+ # copyright law. Dissemination of this information or reproduction of
15
+ # this material is strictly forbidden unless prior written permission is
16
+ # obtained from COMPANY. Access to the source code contained herein is
17
+ # hereby forbidden to anyone except current COMPANY employees, managers
18
+ # or contractors who have executed Confidentiality and Non-disclosure
19
+ # agreements explicitly covering such access.
20
+ #
21
+ # The copyright notice above does not evidence any actual or intended
22
+ # publication or disclosure of this source code, which includes
23
+ # information that is confidential and/or proprietary, and is a trade
24
+ # secret, of COMPANY. ANY REPRODUCTION, MODIFICATION, DISTRIBUTION,
25
+ # PUBLIC PERFORMANCE, OR PUBLIC DISPLAY OF OR THROUGH USE OF THIS
26
+ # SOURCE CODE WITHOUT THE EXPRESS WRITTEN CONSENT OF COMPANY IS
27
+ # STRICTLY PROHIBITED, AND IN VIOLATION OF APPLICABLE LAWS AND
28
+ # INTERNATIONAL TREATIES. THE RECEIPT OR POSSESSION OF THIS SOURCE
29
+ # CODE AND/OR RELATED INFORMATION DOES NOT CONVEY OR IMPLY ANY RIGHTS
30
+ # TO REPRODUCE, DISCLOSE OR DISTRIBUTE ITS CONTENTS, OR TO MANUFACTURE,
31
+ # USE, OR SELL ANYTHING THAT IT MAY DESCRIBE, IN WHOLE OR IN PART.
32
+ #
33
+ # %COPYRIGHT_END%
34
+ # ----------------------------------------------------------------------
35
+ # %AUTHORS_BEGIN%
36
+ #
37
+ # Originating Authors: Paul-Edouard Sarlin
38
+ #
39
+ # %AUTHORS_END%
40
+ # --------------------------------------------------------------------*/
41
+ # %BANNER_END%
42
+
43
+ # Adapted by Remi Pautrat, Philipp Lindenberger
44
+
45
+ import torch
46
+ from kornia.color import rgb_to_grayscale
47
+ from torch import nn
48
+
49
+ from .utils import Extractor
50
+
51
+
52
+ def simple_nms(scores, nms_radius: int):
53
+ """Fast Non-maximum suppression to remove nearby points"""
54
+ assert nms_radius >= 0
55
+
56
+ def max_pool(x):
57
+ return torch.nn.functional.max_pool2d(
58
+ x, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius
59
+ )
60
+
61
+ zeros = torch.zeros_like(scores)
62
+ max_mask = scores == max_pool(scores)
63
+ for _ in range(2):
64
+ supp_mask = max_pool(max_mask.float()) > 0
65
+ supp_scores = torch.where(supp_mask, zeros, scores)
66
+ new_max_mask = supp_scores == max_pool(supp_scores)
67
+ max_mask = max_mask | (new_max_mask & (~supp_mask))
68
+ return torch.where(max_mask, scores, zeros)
69
+
70
+
71
+ def top_k_keypoints(keypoints, scores, k):
72
+ if k >= len(keypoints):
73
+ return keypoints, scores
74
+ scores, indices = torch.topk(scores, k, dim=0, sorted=True)
75
+ return keypoints[indices], scores
76
+
77
+
78
+ def sample_descriptors(keypoints, descriptors, s: int = 8):
79
+ """Interpolate descriptors at keypoint locations"""
80
+ b, c, h, w = descriptors.shape
81
+ keypoints = keypoints - s / 2 + 0.5
82
+ keypoints /= torch.tensor(
83
+ [(w * s - s / 2 - 0.5), (h * s - s / 2 - 0.5)],
84
+ ).to(
85
+ keypoints
86
+ )[None]
87
+ keypoints = keypoints * 2 - 1 # normalize to (-1, 1)
88
+ args = {"align_corners": True} if torch.__version__ >= "1.3" else {}
89
+ descriptors = torch.nn.functional.grid_sample(
90
+ descriptors, keypoints.view(b, 1, -1, 2), mode="bilinear", **args
91
+ )
92
+ descriptors = torch.nn.functional.normalize(
93
+ descriptors.reshape(b, c, -1), p=2, dim=1
94
+ )
95
+ return descriptors
96
+
97
+
98
+ class SuperPoint(Extractor):
99
+ """SuperPoint Convolutional Detector and Descriptor
100
+
101
+ SuperPoint: Self-Supervised Interest Point Detection and
102
+ Description. Daniel DeTone, Tomasz Malisiewicz, and Andrew
103
+ Rabinovich. In CVPRW, 2019. https://arxiv.org/abs/1712.07629
104
+
105
+ """
106
+
107
+ default_conf = {
108
+ "descriptor_dim": 256,
109
+ "nms_radius": 4,
110
+ "max_num_keypoints": None,
111
+ "detection_threshold": 0.0005,
112
+ "remove_borders": 4,
113
+ }
114
+
115
+ preprocess_conf = {
116
+ "resize": 1024,
117
+ }
118
+
119
+ required_data_keys = ["image"]
120
+
121
+ def __init__(self, **conf):
122
+ super().__init__(**conf) # Update with default configuration.
123
+ self.relu = nn.ReLU(inplace=True)
124
+ self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
125
+ c1, c2, c3, c4, c5 = 64, 64, 128, 128, 256
126
+
127
+ self.conv1a = nn.Conv2d(1, c1, kernel_size=3, stride=1, padding=1)
128
+ self.conv1b = nn.Conv2d(c1, c1, kernel_size=3, stride=1, padding=1)
129
+ self.conv2a = nn.Conv2d(c1, c2, kernel_size=3, stride=1, padding=1)
130
+ self.conv2b = nn.Conv2d(c2, c2, kernel_size=3, stride=1, padding=1)
131
+ self.conv3a = nn.Conv2d(c2, c3, kernel_size=3, stride=1, padding=1)
132
+ self.conv3b = nn.Conv2d(c3, c3, kernel_size=3, stride=1, padding=1)
133
+ self.conv4a = nn.Conv2d(c3, c4, kernel_size=3, stride=1, padding=1)
134
+ self.conv4b = nn.Conv2d(c4, c4, kernel_size=3, stride=1, padding=1)
135
+
136
+ self.convPa = nn.Conv2d(c4, c5, kernel_size=3, stride=1, padding=1)
137
+ self.convPb = nn.Conv2d(c5, 65, kernel_size=1, stride=1, padding=0)
138
+
139
+ self.convDa = nn.Conv2d(c4, c5, kernel_size=3, stride=1, padding=1)
140
+ self.convDb = nn.Conv2d(
141
+ c5, self.conf.descriptor_dim, kernel_size=1, stride=1, padding=0
142
+ )
143
+
144
+ url = "https://github.com/cvg/LightGlue/releases/download/v0.1_arxiv/superpoint_v1.pth" # noqa
145
+ self.load_state_dict(torch.hub.load_state_dict_from_url(url, model_dir="./LightGlue/ckpts/", file_name="superpoint_v1.pth"))
146
+
147
+ if self.conf.max_num_keypoints is not None and self.conf.max_num_keypoints <= 0:
148
+ raise ValueError("max_num_keypoints must be positive or None")
149
+
150
+ def forward(self, data: dict) -> dict:
151
+ """Compute keypoints, scores, descriptors for image"""
152
+ for key in self.required_data_keys:
153
+ assert key in data, f"Missing key {key} in data"
154
+ image = data["image"]
155
+ if image.shape[1] == 3:
156
+ image = rgb_to_grayscale(image)
157
+
158
+ # Shared Encoder
159
+ x = self.relu(self.conv1a(image))
160
+ x = self.relu(self.conv1b(x))
161
+ x = self.pool(x)
162
+ x = self.relu(self.conv2a(x))
163
+ x = self.relu(self.conv2b(x))
164
+ x = self.pool(x)
165
+ x = self.relu(self.conv3a(x))
166
+ x = self.relu(self.conv3b(x))
167
+ x = self.pool(x)
168
+ x = self.relu(self.conv4a(x))
169
+ x = self.relu(self.conv4b(x))
170
+
171
+ # Compute the dense keypoint scores
172
+ cPa = self.relu(self.convPa(x))
173
+ scores = self.convPb(cPa)
174
+ scores = torch.nn.functional.softmax(scores, 1)[:, :-1]
175
+ b, _, h, w = scores.shape
176
+ scores = scores.permute(0, 2, 3, 1).reshape(b, h, w, 8, 8)
177
+ scores = scores.permute(0, 1, 3, 2, 4).reshape(b, h * 8, w * 8)
178
+ scores = simple_nms(scores, self.conf.nms_radius)
179
+
180
+ # Discard keypoints near the image borders
181
+ if self.conf.remove_borders:
182
+ pad = self.conf.remove_borders
183
+ scores[:, :pad] = -1
184
+ scores[:, :, :pad] = -1
185
+ scores[:, -pad:] = -1
186
+ scores[:, :, -pad:] = -1
187
+
188
+ # Extract keypoints
189
+ best_kp = torch.where(scores > self.conf.detection_threshold)
190
+ scores = scores[best_kp]
191
+
192
+ # Separate into batches
193
+ keypoints = [
194
+ torch.stack(best_kp[1:3], dim=-1)[best_kp[0] == i] for i in range(b)
195
+ ]
196
+ scores = [scores[best_kp[0] == i] for i in range(b)]
197
+
198
+ # Keep the k keypoints with highest score
199
+ if self.conf.max_num_keypoints is not None:
200
+ keypoints, scores = list(
201
+ zip(
202
+ *[
203
+ top_k_keypoints(k, s, self.conf.max_num_keypoints)
204
+ for k, s in zip(keypoints, scores)
205
+ ]
206
+ )
207
+ )
208
+
209
+ # Convert (h, w) to (x, y)
210
+ keypoints = [torch.flip(k, [1]).float() for k in keypoints]
211
+
212
+ # Compute the dense descriptors
213
+ cDa = self.relu(self.convDa(x))
214
+ descriptors = self.convDb(cDa)
215
+ descriptors = torch.nn.functional.normalize(descriptors, p=2, dim=1)
216
+
217
+ # Extract descriptors
218
+ descriptors = [
219
+ sample_descriptors(k[None], d[None], 8)[0]
220
+ for k, d in zip(keypoints, descriptors)
221
+ ]
222
+
223
+ return {
224
+ "keypoints": torch.stack(keypoints, 0),
225
+ "keypoint_scores": torch.stack(scores, 0),
226
+ "descriptors": torch.stack(descriptors, 0).transpose(-1, -2).contiguous(),
227
+ }
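The score decoding in SuperPoint.forward above predicts 65 channels per 8x8 cell (64 spatial bins plus a "no keypoint" dustbin), drops the dustbin after the softmax, and folds the 64 remaining bins back to full resolution. A shape-only sketch of that reshuffle on a toy 240x320 image:

import torch

b, h, w = 1, 240 // 8, 320 // 8                 # number of 8x8 cells
logits = torch.rand(b, 65, h, w)                # stand-in for the convPb output
scores = torch.softmax(logits, dim=1)[:, :-1]   # drop the dustbin channel
scores = scores.permute(0, 2, 3, 1).reshape(b, h, w, 8, 8)
scores = scores.permute(0, 1, 3, 2, 4).reshape(b, h * 8, w * 8)
assert scores.shape == (1, 240, 320)            # back to full image resolution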
LightGlue/lightglue/utils.py ADDED
@@ -0,0 +1,165 @@
1
+ import collections.abc as collections
2
+ from pathlib import Path
3
+ from types import SimpleNamespace
4
+ from typing import Callable, List, Optional, Tuple, Union
5
+
6
+ import cv2
7
+ import kornia
8
+ import numpy as np
9
+ import torch
10
+
11
+
12
+ class ImagePreprocessor:
13
+ default_conf = {
14
+ "resize": None, # target edge length, None for no resizing
15
+ "side": "long",
16
+ "interpolation": "bilinear",
17
+ "align_corners": None,
18
+ "antialias": True,
19
+ }
20
+
21
+ def __init__(self, **conf) -> None:
22
+ super().__init__()
23
+ self.conf = {**self.default_conf, **conf}
24
+ self.conf = SimpleNamespace(**self.conf)
25
+
26
+ def __call__(self, img: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
27
+ """Resize and preprocess an image, return image and resize scale"""
28
+ h, w = img.shape[-2:]
29
+ if self.conf.resize is not None:
30
+ img = kornia.geometry.transform.resize(
31
+ img,
32
+ self.conf.resize,
33
+ side=self.conf.side,
34
+ antialias=self.conf.antialias,
35
+ align_corners=self.conf.align_corners,
36
+ )
37
+ scale = torch.Tensor([img.shape[-1] / w, img.shape[-2] / h]).to(img)
38
+ return img, scale
39
+
40
+
41
+ def map_tensor(input_, func: Callable):
42
+ string_classes = (str, bytes)
43
+ if isinstance(input_, string_classes):
44
+ return input_
45
+ elif isinstance(input_, collections.Mapping):
46
+ return {k: map_tensor(sample, func) for k, sample in input_.items()}
47
+ elif isinstance(input_, collections.Sequence):
48
+ return [map_tensor(sample, func) for sample in input_]
49
+ elif isinstance(input_, torch.Tensor):
50
+ return func(input_)
51
+ else:
52
+ return input_
53
+
54
+
55
+ def batch_to_device(batch: dict, device: str = "cpu", non_blocking: bool = True):
56
+ """Move batch (dict) to device"""
57
+
58
+ def _func(tensor):
59
+ return tensor.to(device=device, non_blocking=non_blocking).detach()
60
+
61
+ return map_tensor(batch, _func)
62
+
63
+
64
+ def rbd(data: dict) -> dict:
65
+ """Remove batch dimension from elements in data"""
66
+ return {
67
+ k: v[0] if isinstance(v, (torch.Tensor, np.ndarray, list)) else v
68
+ for k, v in data.items()
69
+ }
70
+
71
+
72
+ def read_image(path: Path, grayscale: bool = False) -> np.ndarray:
73
+ """Read an image from path as RGB or grayscale"""
74
+ if not Path(path).exists():
75
+ raise FileNotFoundError(f"No image at path {path}.")
76
+ mode = cv2.IMREAD_GRAYSCALE if grayscale else cv2.IMREAD_COLOR
77
+ image = cv2.imread(str(path), mode)
78
+ if image is None:
79
+ raise IOError(f"Could not read image at {path}.")
80
+ if not grayscale:
81
+ image = image[..., ::-1]
82
+ return image
83
+
84
+
85
+ def numpy_image_to_torch(image: np.ndarray) -> torch.Tensor:
86
+ """Normalize the image tensor and reorder the dimensions."""
87
+ if image.ndim == 3:
88
+ image = image.transpose((2, 0, 1)) # HxWxC to CxHxW
89
+ elif image.ndim == 2:
90
+ image = image[None] # add channel axis
91
+ else:
92
+ raise ValueError(f"Not an image: {image.shape}")
93
+ return torch.tensor(image / 255.0, dtype=torch.float)
94
+
95
+
96
+ def resize_image(
97
+ image: np.ndarray,
98
+ size: Union[List[int], int],
99
+ fn: str = "max",
100
+ interp: Optional[str] = "area",
101
+ ) -> np.ndarray:
102
+ """Resize an image to a fixed size, or according to max or min edge."""
103
+ h, w = image.shape[:2]
104
+
105
+ fn = {"max": max, "min": min}[fn]
106
+ if isinstance(size, int):
107
+ scale = size / fn(h, w)
108
+ h_new, w_new = int(round(h * scale)), int(round(w * scale))
109
+ scale = (w_new / w, h_new / h)
110
+ elif isinstance(size, (tuple, list)):
111
+ h_new, w_new = size
112
+ scale = (w_new / w, h_new / h)
113
+ else:
114
+ raise ValueError(f"Incorrect new size: {size}")
115
+ mode = {
116
+ "linear": cv2.INTER_LINEAR,
117
+ "cubic": cv2.INTER_CUBIC,
118
+ "nearest": cv2.INTER_NEAREST,
119
+ "area": cv2.INTER_AREA,
120
+ }[interp]
121
+ return cv2.resize(image, (w_new, h_new), interpolation=mode), scale
122
+
123
+
124
+ def load_image(path: Path, resize: int = None, **kwargs) -> torch.Tensor:
125
+ image = read_image(path)
126
+ if resize is not None:
127
+ image, _ = resize_image(image, resize, **kwargs)
128
+ return numpy_image_to_torch(image)
129
+
130
+
131
+ class Extractor(torch.nn.Module):
132
+ def __init__(self, **conf):
133
+ super().__init__()
134
+ self.conf = SimpleNamespace(**{**self.default_conf, **conf})
135
+
136
+ @torch.no_grad()
137
+ def extract(self, img: torch.Tensor, **conf) -> dict:
138
+ """Perform extraction with online resizing"""
139
+ if img.dim() == 3:
140
+ img = img[None] # add batch dim
141
+ assert img.dim() == 4 and img.shape[0] == 1
142
+ shape = img.shape[-2:][::-1]
143
+ img, scales = ImagePreprocessor(**{**self.preprocess_conf, **conf})(img)
144
+ feats = self.forward({"image": img})
145
+ feats["image_size"] = torch.tensor(shape)[None].to(img).float()
146
+ feats["keypoints"] = (feats["keypoints"] + 0.5) / scales[None] - 0.5
147
+ return feats
148
+
149
+
150
+ def match_pair(
151
+ extractor,
152
+ matcher,
153
+ image0: torch.Tensor,
154
+ image1: torch.Tensor,
155
+ device: str = "cpu",
156
+ **preprocess,
157
+ ):
158
+ """Match a pair of images (image0, image1) with an extractor and matcher"""
159
+ feats0 = extractor.extract(image0, **preprocess)
160
+ feats1 = extractor.extract(image1, **preprocess)
161
+ matches01 = matcher({"image0": feats0, "image1": feats1})
162
+ data = [feats0, feats1, matches01]
163
+ # remove batch dim and move to target device
164
+ feats0, feats1, matches01 = [batch_to_device(rbd(x), device) for x in data]
165
+ return feats0, feats1, matches01
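The helpers above form a small end-to-end API: `load_image` reads and normalizes an image, `Extractor.extract` handles online resizing before the feature network, and `match_pair` glues extraction and matching together. Below is a minimal usage sketch; it assumes the `SuperPoint` and `LightGlue` classes exported by this package and uses placeholder image paths.

```python
# Minimal sketch (assumptions: SuperPoint/LightGlue exported by this package, placeholder image paths).
import torch
from lightglue import LightGlue, SuperPoint
from lightglue.utils import load_image, match_pair

device = "cuda" if torch.cuda.is_available() else "cpu"
extractor = SuperPoint(max_num_keypoints=2048).eval().to(device)
matcher = LightGlue(features="superpoint").eval().to(device)

image0 = load_image("image0.jpg").to(device)  # placeholder path
image1 = load_image("image1.jpg").to(device)  # placeholder path

# match_pair extracts, matches, strips the batch dimension, and moves results to `device`.
feats0, feats1, matches01 = match_pair(extractor, matcher, image0, image1, device=device)
matches = matches01["matches"]                  # (K, 2) indices into the two keypoint sets
points0 = feats0["keypoints"][matches[..., 0]]  # (K, 2) matched keypoints in image0
points1 = feats1["keypoints"][matches[..., 1]]  # (K, 2) matched keypoints in image1
```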
LightGlue/lightglue/viz2d.py ADDED
@@ -0,0 +1,185 @@
1
+ """
2
+ 2D visualization primitives based on Matplotlib.
3
+ 1) Plot images with `plot_images`.
4
+ 2) Call `plot_keypoints` or `plot_matches` any number of times.
5
+ 3) Optionally: save a .png or .pdf plot (nice in papers!) with `save_plot`.
6
+ """
7
+
8
+ import matplotlib
9
+ import matplotlib.patheffects as path_effects
10
+ import matplotlib.pyplot as plt
11
+ import numpy as np
12
+ import torch
13
+
14
+
15
+ def cm_RdGn(x):
16
+ """Custom colormap: red (0) -> yellow (0.5) -> green (1)."""
17
+ x = np.clip(x, 0, 1)[..., None] * 2
18
+ c = x * np.array([[0, 1.0, 0]]) + (2 - x) * np.array([[1.0, 0, 0]])
19
+ return np.clip(c, 0, 1)
20
+
21
+
22
+ def cm_BlRdGn(x_):
23
+ """Custom colormap: blue (-1) -> red (0.0) -> green (1)."""
24
+ x = np.clip(x_, 0, 1)[..., None] * 2
25
+ c = x * np.array([[0, 1.0, 0, 1.0]]) + (2 - x) * np.array([[1.0, 0, 0, 1.0]])
26
+
27
+ xn = -np.clip(x_, -1, 0)[..., None] * 2
28
+ cn = xn * np.array([[0, 0.1, 1, 1.0]]) + (2 - xn) * np.array([[1.0, 0, 0, 1.0]])
29
+ out = np.clip(np.where(x_[..., None] < 0, cn, c), 0, 1)
30
+ return out
31
+
32
+
33
+ def cm_prune(x_):
34
+ """Custom colormap to visualize pruning"""
35
+ if isinstance(x_, torch.Tensor):
36
+ x_ = x_.cpu().numpy()
37
+ max_i = max(x_)
38
+ norm_x = np.where(x_ == max_i, -1, (x_ - 1) / 9)
39
+ return cm_BlRdGn(norm_x)
40
+
41
+
42
+ def plot_images(imgs, titles=None, cmaps="gray", dpi=100, pad=0.5, adaptive=True):
43
+ """Plot a set of images horizontally.
44
+ Args:
45
+ imgs: list of NumPy RGB (H, W, 3) or PyTorch RGB (3, H, W) or mono (H, W).
46
+ titles: a list of strings, as titles for each image.
47
+ cmaps: colormaps for monochrome images.
48
+ adaptive: whether the figure size should fit the image aspect ratios.
49
+ """
50
+ # conversion to (H, W, 3) for torch.Tensor
51
+ imgs = [
52
+ img.permute(1, 2, 0).cpu().numpy()
53
+ if (isinstance(img, torch.Tensor) and img.dim() == 3)
54
+ else img
55
+ for img in imgs
56
+ ]
57
+
58
+ n = len(imgs)
59
+ if not isinstance(cmaps, (list, tuple)):
60
+ cmaps = [cmaps] * n
61
+
62
+ if adaptive:
63
+ ratios = [i.shape[1] / i.shape[0] for i in imgs] # W / H
64
+ else:
65
+ ratios = [4 / 3] * n
66
+ figsize = [sum(ratios) * 4.5, 4.5]
67
+ fig, ax = plt.subplots(
68
+ 1, n, figsize=figsize, dpi=dpi, gridspec_kw={"width_ratios": ratios}
69
+ )
70
+ if n == 1:
71
+ ax = [ax]
72
+ for i in range(n):
73
+ ax[i].imshow(imgs[i], cmap=plt.get_cmap(cmaps[i]))
74
+ ax[i].get_yaxis().set_ticks([])
75
+ ax[i].get_xaxis().set_ticks([])
76
+ ax[i].set_axis_off()
77
+ for spine in ax[i].spines.values(): # remove frame
78
+ spine.set_visible(False)
79
+ if titles:
80
+ ax[i].set_title(titles[i])
81
+ fig.tight_layout(pad=pad)
82
+ return fig, ax
83
+
84
+
85
+ def plot_keypoints(kpts, colors="lime", ps=4, axes=None, a=1.0):
86
+ """Plot keypoints for existing images.
87
+ Args:
88
+ kpts: list of ndarrays of size (N, 2).
89
+ colors: string, or list of lists of tuples (one for each set of keypoints).
90
+ ps: size of the keypoints as float.
91
+ """
92
+ if not isinstance(colors, list):
93
+ colors = [colors] * len(kpts)
94
+ if not isinstance(a, list):
95
+ a = [a] * len(kpts)
96
+ if axes is None:
97
+ axes = plt.gcf().axes
98
+ for ax, k, c, alpha in zip(axes, kpts, colors, a):
99
+ if isinstance(k, torch.Tensor):
100
+ k = k.cpu().numpy()
101
+ ax.scatter(k[:, 0], k[:, 1], c=c, s=ps, linewidths=0, alpha=alpha)
102
+
103
+
104
+ def plot_matches(kpts0, kpts1, color=None, lw=1.5, ps=4, a=1.0, labels=None, axes=None):
105
+ """Plot matches for a pair of existing images.
106
+ Args:
107
+ kpts0, kpts1: corresponding keypoints of size (N, 2).
108
+ color: color of each match, string or RGB tuple. Random if not given.
109
+ lw: width of the lines.
110
+ ps: size of the end points (no endpoint if ps=0)
111
+ indices: indices of the images to draw the matches on.
112
+ a: alpha opacity of the match lines.
113
+ """
114
+ fig = plt.gcf()
115
+ if axes is None:
116
+ ax = fig.axes
117
+ ax0, ax1 = ax[0], ax[1]
118
+ else:
119
+ ax0, ax1 = axes
120
+ if isinstance(kpts0, torch.Tensor):
121
+ kpts0 = kpts0.cpu().numpy()
122
+ if isinstance(kpts1, torch.Tensor):
123
+ kpts1 = kpts1.cpu().numpy()
124
+ assert len(kpts0) == len(kpts1)
125
+ if color is None:
126
+ color = matplotlib.cm.hsv(np.random.rand(len(kpts0))).tolist()
127
+ elif len(color) > 0 and not isinstance(color[0], (tuple, list)):
128
+ color = [color] * len(kpts0)
129
+
130
+ if lw > 0:
131
+ for i in range(len(kpts0)):
132
+ line = matplotlib.patches.ConnectionPatch(
133
+ xyA=(kpts0[i, 0], kpts0[i, 1]),
134
+ xyB=(kpts1[i, 0], kpts1[i, 1]),
135
+ coordsA=ax0.transData,
136
+ coordsB=ax1.transData,
137
+ axesA=ax0,
138
+ axesB=ax1,
139
+ zorder=1,
140
+ color=color[i],
141
+ linewidth=lw,
142
+ clip_on=True,
143
+ alpha=a,
144
+ label=None if labels is None else labels[i],
145
+ picker=5.0,
146
+ )
147
+ line.set_annotation_clip(True)
148
+ fig.add_artist(line)
149
+
150
+ # freeze the axes to prevent the transform to change
151
+ ax0.autoscale(enable=False)
152
+ ax1.autoscale(enable=False)
153
+
154
+ if ps > 0:
155
+ ax0.scatter(kpts0[:, 0], kpts0[:, 1], c=color, s=ps)
156
+ ax1.scatter(kpts1[:, 0], kpts1[:, 1], c=color, s=ps)
157
+
158
+
159
+ def add_text(
160
+ idx,
161
+ text,
162
+ pos=(0.01, 0.99),
163
+ fs=15,
164
+ color="w",
165
+ lcolor="k",
166
+ lwidth=2,
167
+ ha="left",
168
+ va="top",
169
+ ):
170
+ ax = plt.gcf().axes[idx]
171
+ t = ax.text(
172
+ *pos, text, fontsize=fs, ha=ha, va=va, color=color, transform=ax.transAxes
173
+ )
174
+ if lcolor is not None:
175
+ t.set_path_effects(
176
+ [
177
+ path_effects.Stroke(linewidth=lwidth, foreground=lcolor),
178
+ path_effects.Normal(),
179
+ ]
180
+ )
181
+
182
+
183
+ def save_plot(path, **kw):
184
+ """Save the current figure without any white margin."""
185
+ plt.savefig(path, bbox_inches="tight", pad_inches=0, **kw)
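Following the three-step workflow stated in the module docstring, here is a hedged usage sketch; it assumes the `image0`/`image1` tensors and matched keypoints `points0`/`points1` from the previous example.

```python
# Sketch only: image0/image1 are (3, H, W) RGB tensors, points0/points1 are (K, 2) matched keypoints.
from lightglue import viz2d

fig, axes = viz2d.plot_images([image0, image1])             # 1) draw the image pair
viz2d.plot_matches(points0, points1, color="lime", lw=0.2)  # 2) overlay the matches
viz2d.add_text(0, f"{len(points0)} matches")                # annotate the left image
viz2d.save_plot("matches.png")                              # 3) export without white margins
```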
LightGlue/pyproject.toml ADDED
@@ -0,0 +1,30 @@
1
+ [project]
2
+ name = "lightglue"
3
+ description = "LightGlue: Local Feature Matching at Light Speed"
4
+ version = "0.0"
5
+ authors = [
6
+ {name = "Philipp Lindenberger"},
7
+ {name = "Paul-Edouard Sarlin"},
8
+ ]
9
+ readme = "README.md"
10
+ requires-python = ">=3.6"
11
+ license = {file = "LICENSE"}
12
+ classifiers = [
13
+ "Programming Language :: Python :: 3",
14
+ "License :: OSI Approved :: Apache Software License",
15
+ "Operating System :: OS Independent",
16
+ ]
17
+ urls = {Repository = "https://github.com/cvg/LightGlue/"}
18
+ dynamic = ["dependencies"]
19
+
20
+ [project.optional-dependencies]
21
+ dev = ["black==23.12.1", "flake8", "isort"]
22
+
23
+ [tool.setuptools]
24
+ packages = ["lightglue"]
25
+
26
+ [tool.setuptools.dynamic]
27
+ dependencies = {file = ["requirements.txt"]}
28
+
29
+ [tool.isort]
30
+ profile = "black"
LightGlue/requirements.txt ADDED
@@ -0,0 +1,6 @@
1
+ # torch>=1.9.1
2
+ # torchvision>=0.3
3
+ # numpy
4
+ # opencv-python
5
+ # matplotlib
6
+ # kornia>=0.6.11
ORIGINAL_README.md ADDED
@@ -0,0 +1,115 @@
1
+ # AniDoc: Animation Creation Made Easier
2
+ <a href="https://yihao-meng.github.io/AniDoc_demo/"><img src="https://img.shields.io/static/v1?label=Project&message=Website&color=blue"></a>
3
+ <a href="https://arxiv.org/pdf/2412.14173"><img src="https://img.shields.io/badge/arXiv-2404.12.14173-b31b1b.svg"></a>
4
+
5
+
6
+
7
+ https://github.com/user-attachments/assets/99e1e52a-f0e1-49f5-b81f-e787857901e4
8
+
9
+
10
+
11
+
12
+ > <a href="https://yihao-meng.github.io/AniDoc_demo">**AniDoc: Animation Creation Made Easier**</a>
13
+ >
14
+
15
+ [Yihao Meng](https://yihao-meng.github.io/)<sup>1,2</sup>, [Hao Ouyang](https://ken-ouyang.github.io/)<sup>2</sup>, [Hanlin Wang](https://openreview.net/profile?id=~Hanlin_Wang2)<sup>3,2</sup>, [Qiuyu Wang](https://github.com/qiuyu96)<sup>2</sup>, [Wen Wang](https://github.com/encounter1997)<sup>4,2</sup>, [Ka Leong Cheng](https://felixcheng97.github.io/)<sup>1,2</sup> , [Zhiheng Liu](https://johanan528.github.io/)<sup>5</sup>, [Yujun Shen](https://shenyujun.github.io/)<sup>2</sup>, [Huamin Qu](http://www.huamin.org/index.htm/)<sup>†,2</sup><br>
16
+ <sup>1</sup>HKUST <sup>2</sup>Ant Group <sup>3</sup>NJU <sup>4</sup>ZJU <sup>5</sup>HKU <sup>†</sup>corresponding author
17
+
18
+ > AniDoc colorizes a sequence of sketches based on a character design reference with high fidelity, even when the sketches significantly differ in pose and scale.
19
+ </p>
20
+
21
+ **Strongly recommend seeing our [demo page](https://yihao-meng.github.io/AniDoc_demo).**
22
+
23
+
24
+ ## Showcases:
25
+ <p style="text-align: center;">
26
+ <img src="figure/showcases/image1.gif" alt="GIF" />
27
+ </p>
28
+ <p style="text-align: center;">
29
+ <img src="figure/showcases/image2.gif" alt="GIF" />
30
+ </p>
31
+ <p style="text-align: center;">
32
+ <img src="figure/showcases/image3.gif" alt="GIF" />
33
+ </p>
34
+ <p style="text-align: center;">
35
+ <img src="figure/showcases/image4.gif" alt="GIF" />
36
+ </p>
37
+
38
+ ## Flexible Usage:
39
+ ### Same Reference with Varying Sketches
40
+ <div style="display: flex; flex-direction: column; align-items: center; gap: 20px;">
41
+ <img src="figure/showcases/image29.gif" alt="GIF Animation">
42
+ <img src="figure/showcases/image30.gif" alt="GIF Animation">
43
+ <img src="figure/showcases/image31.gif" alt="GIF Animation" style="margin-bottom: 40px;">
44
+ <div style="text-align:center; margin-top: -50px; margin-bottom: 70px;font-size: 18px; letter-spacing: 0.2px;">
45
+ <em>Satoru Gojo from Jujutsu Kaisen</em>
46
+ </div>
47
+ </div>
48
+
49
+ ### Same Sketch with Different References
50
+
51
+ <div style="display: flex; flex-direction: column; align-items: center; gap: 20px;">
52
+ <img src="figure/showcases/image33.gif" alt="GIF Animation" >
53
+
54
+ <img src="figure/showcases/image34.gif" alt="GIF Animation" >
55
+ <img src="figure/showcases/image35.gif" alt="GIF Animation" style="margin-bottom: 40px;">
56
+ <div style="text-align:center; margin-top: -50px; margin-bottom: 70px;font-size: 18px; letter-spacing: 0.2px;">
57
+ <em>Anya Forger from Spy x Family</em>
58
+ </div>
59
+ </div>
60
+
61
+ ## TODO List
62
+
63
+ - [x] Release the paper and demo page. Visit [https://yihao-meng.github.io/AniDoc_demo/](https://yihao-meng.github.io/AniDoc_demo/)
64
+ - [x] Release the inference code.
65
+ - [ ] Build Gradio Demo
66
+ - [ ] Release the training code.
67
+ - [ ] Release the sparse sketch setting interpolation code.
68
+
69
+
70
+ ## Requirements:
71
+ The training is conducted on 8 A100 GPUs (80GB VRAM); the inference is tested on an RTX 5000 (32GB VRAM) and requires about 14GB of VRAM in our tests.
72
+ ## Setup
73
+ ```
74
+ git clone https://github.com/yihao-meng/AniDoc.git
75
+ cd AniDoc
76
+ ```
77
+
78
+ ## Environment
79
+ All the tests are conducted in Linux. We suggest running our code in Linux. To set up our environment in Linux, please run:
80
+ ```
81
+ conda create -n anidoc python=3.8 -y
82
+ conda activate anidoc
83
+
84
+ bash install.sh
85
+ ```
86
+ ## Checkpoints
87
+ 1. please download the pre-trained stable video diffusion (SVD) checkpoints from [here](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid/tree/main), and put the whole folder under `pretrained_weight`, it should look like `./pretrained_weights/stable-video-diffusion-img2vid-xt`
88
+ 2. please download the checkpoint for our Unet and ControlNet from [here](https://huggingface.co/Yhmeng1106/anidoc/tree/main), and put the whole folder as `./pretrained_weights/anidoc`.
89
+ 3. please download the co_tracker checkpoint from [here](https://huggingface.co/facebook/cotracker/blob/main/cotracker2.pth) and put it as `./pretrained_weights/cotracker2.pth`.
90
+
91
+
92
+
93
+
94
+ ## Generate Your Animation!
95
+ To colorize the target lineart sequence with a specific character design, you can run the following command:
96
+ ```
97
+ bash scripts_infer/anidoc_inference.sh
98
+ ```
99
+
100
+
101
+ We provide some test cases in the `data_test` folder. You can also try our model with your own data: change the lineart sequence and the corresponding character design in the script `anidoc_inference.sh`, where `--control_image` refers to the lineart sequence and `--ref_image` refers to the character design.
102
+
103
+
104
+
105
+ ## Citation:
106
+ Don't forget to cite this source if it proves useful in your research!
107
+ ```bibtex
108
+ @article{meng2024anidoc,
109
+ title={AniDoc: Animation Creation Made Easier},
110
+ author={Yihao Meng and Hao Ouyang and Hanlin Wang and Qiuyu Wang and Wen Wang and Ka Leong Cheng and Zhiheng Liu and Yujun Shen and Huamin Qu},
111
+ journal={arXiv preprint arXiv:2412.14173},
112
+ year={2024}
113
+ }
114
+
115
+ ```
cotracker/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
cotracker/build/lib/datasets/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
cotracker/build/lib/datasets/dataclass_utils.py ADDED
@@ -0,0 +1,166 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ import json
9
+ import dataclasses
10
+ import numpy as np
11
+ from dataclasses import Field, MISSING
12
+ from typing import IO, TypeVar, Type, get_args, get_origin, Union, Any, Tuple
13
+
14
+ _X = TypeVar("_X")
15
+
16
+
17
+ def load_dataclass(f: IO, cls: Type[_X], binary: bool = False) -> _X:
18
+ """
19
+ Loads to a @dataclass or collection hierarchy including dataclasses
20
+ from a json recursively.
21
+ Call it like load_dataclass(f, typing.List[FrameAnnotationAnnotation]).
22
+ raises KeyError if json has keys not mapping to the dataclass fields.
23
+
24
+ Args:
25
+ f: Either a path to a file, or a file opened for writing.
26
+ cls: The class of the loaded dataclass.
27
+ binary: Set to True if `f` is a file handle, else False.
28
+ """
29
+ if binary:
30
+ asdict = json.loads(f.read().decode("utf8"))
31
+ else:
32
+ asdict = json.load(f)
33
+
34
+ # in the list case, run a faster "vectorized" version
35
+ cls = get_args(cls)[0]
36
+ res = list(_dataclass_list_from_dict_list(asdict, cls))
37
+
38
+ return res
39
+
40
+
41
+ def _resolve_optional(type_: Any) -> Tuple[bool, Any]:
42
+ """Check whether `type_` is equivalent to `typing.Optional[T]` for some T."""
43
+ if get_origin(type_) is Union:
44
+ args = get_args(type_)
45
+ if len(args) == 2 and args[1] == type(None): # noqa E721
46
+ return True, args[0]
47
+ if type_ is Any:
48
+ return True, Any
49
+
50
+ return False, type_
51
+
52
+
53
+ def _unwrap_type(tp):
54
+ # strips Optional wrapper, if any
55
+ if get_origin(tp) is Union:
56
+ args = get_args(tp)
57
+ if len(args) == 2 and any(a is type(None) for a in args): # noqa: E721
58
+ # this is typing.Optional
59
+ return args[0] if args[1] is type(None) else args[1] # noqa: E721
60
+ return tp
61
+
62
+
63
+ def _get_dataclass_field_default(field: Field) -> Any:
64
+ if field.default_factory is not MISSING:
65
+ # pyre-fixme[29]: `Union[dataclasses._MISSING_TYPE,
66
+ # dataclasses._DefaultFactory[typing.Any]]` is not a function.
67
+ return field.default_factory()
68
+ elif field.default is not MISSING:
69
+ return field.default
70
+ else:
71
+ return None
72
+
73
+
74
+ def _dataclass_list_from_dict_list(dlist, typeannot):
75
+ """
76
+ Vectorised version of `_dataclass_from_dict`.
77
+ The output should be equivalent to
78
+ `[_dataclass_from_dict(d, typeannot) for d in dlist]`.
79
+
80
+ Args:
81
+ dlist: list of objects to convert.
82
+ typeannot: type of each of those objects.
83
+ Returns:
84
+ iterator or list over converted objects of the same length as `dlist`.
85
+
86
+ Raises:
87
+ ValueError: it assumes the objects have None's in consistent places across
88
+ objects, otherwise it would ignore some values. This generally holds for
89
+ auto-generated annotations, but otherwise use `_dataclass_from_dict`.
90
+ """
91
+
92
+ cls = get_origin(typeannot) or typeannot
93
+
94
+ if typeannot is Any:
95
+ return dlist
96
+ if all(obj is None for obj in dlist): # 1st recursion base: all None nodes
97
+ return dlist
98
+ if any(obj is None for obj in dlist):
99
+ # filter out Nones and recurse on the resulting list
100
+ idx_notnone = [(i, obj) for i, obj in enumerate(dlist) if obj is not None]
101
+ idx, notnone = zip(*idx_notnone)
102
+ converted = _dataclass_list_from_dict_list(notnone, typeannot)
103
+ res = [None] * len(dlist)
104
+ for i, obj in zip(idx, converted):
105
+ res[i] = obj
106
+ return res
107
+
108
+ is_optional, contained_type = _resolve_optional(typeannot)
109
+ if is_optional:
110
+ return _dataclass_list_from_dict_list(dlist, contained_type)
111
+
112
+ # otherwise, we dispatch by the type of the provided annotation to convert to
113
+ if issubclass(cls, tuple) and hasattr(cls, "_fields"): # namedtuple
114
+ # For namedtuple, call the function recursively on the lists of corresponding keys
115
+ types = cls.__annotations__.values()
116
+ dlist_T = zip(*dlist)
117
+ res_T = [
118
+ _dataclass_list_from_dict_list(key_list, tp) for key_list, tp in zip(dlist_T, types)
119
+ ]
120
+ return [cls(*converted_as_tuple) for converted_as_tuple in zip(*res_T)]
121
+ elif issubclass(cls, (list, tuple)):
122
+ # For list/tuple, call the function recursively on the lists of corresponding positions
123
+ types = get_args(typeannot)
124
+ if len(types) == 1: # probably List; replicate for all items
125
+ types = types * len(dlist[0])
126
+ dlist_T = zip(*dlist)
127
+ res_T = (
128
+ _dataclass_list_from_dict_list(pos_list, tp) for pos_list, tp in zip(dlist_T, types)
129
+ )
130
+ if issubclass(cls, tuple):
131
+ return list(zip(*res_T))
132
+ else:
133
+ return [cls(converted_as_tuple) for converted_as_tuple in zip(*res_T)]
134
+ elif issubclass(cls, dict):
135
+ # For the dictionary, call the function recursively on concatenated keys and vertices
136
+ key_t, val_t = get_args(typeannot)
137
+ all_keys_res = _dataclass_list_from_dict_list(
138
+ [k for obj in dlist for k in obj.keys()], key_t
139
+ )
140
+ all_vals_res = _dataclass_list_from_dict_list(
141
+ [k for obj in dlist for k in obj.values()], val_t
142
+ )
143
+ indices = np.cumsum([len(obj) for obj in dlist])
144
+ assert indices[-1] == len(all_keys_res)
145
+
146
+ keys = np.split(list(all_keys_res), indices[:-1])
147
+ all_vals_res_iter = iter(all_vals_res)
148
+ return [cls(zip(k, all_vals_res_iter)) for k in keys]
149
+ elif not dataclasses.is_dataclass(typeannot):
150
+ return dlist
151
+
152
+ # dataclass node: 2nd recursion base; call the function recursively on the lists
153
+ # of the corresponding fields
154
+ assert dataclasses.is_dataclass(cls)
155
+ fieldtypes = {
156
+ f.name: (_unwrap_type(f.type), _get_dataclass_field_default(f))
157
+ for f in dataclasses.fields(typeannot)
158
+ }
159
+
160
+ # NOTE the default object is shared here
161
+ key_lists = (
162
+ _dataclass_list_from_dict_list([obj.get(k, default) for obj in dlist], type_)
163
+ for k, (type_, default) in fieldtypes.items()
164
+ )
165
+ transposed = zip(*key_lists)
166
+ return [cls(*vals_as_tuple) for vals_as_tuple in transposed]
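A short, hedged sketch of how `load_dataclass` is intended to be called. The `Record` dataclass and the `records.json` file are hypothetical, introduced only to illustrate the `List[...]` annotation the loader expects.

```python
# Hypothetical example: parse a JSON list of objects into a list of dataclasses.
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Record:
    name: str
    score: float
    note: Optional[str] = None

# records.json (hypothetical) could contain:
# [{"name": "a", "score": 1.0}, {"name": "b", "score": 2.0, "note": "x"}]
with open("records.json", "r") as f:
    records = load_dataclass(f, List[Record])
# -> [Record(name="a", score=1.0, note=None), Record(name="b", score=2.0, note="x")]
```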
cotracker/build/lib/datasets/dr_dataset.py ADDED
@@ -0,0 +1,161 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ import os
9
+ import gzip
10
+ import torch
11
+ import numpy as np
12
+ import torch.utils.data as data
13
+ from collections import defaultdict
14
+ from dataclasses import dataclass
15
+ from typing import List, Optional, Any, Dict, Tuple
16
+
17
+ from cotracker.datasets.utils import CoTrackerData
18
+ from cotracker.datasets.dataclass_utils import load_dataclass
19
+
20
+
21
+ @dataclass
22
+ class ImageAnnotation:
23
+ # path to jpg file, relative w.r.t. dataset_root
24
+ path: str
25
+ # H x W
26
+ size: Tuple[int, int]
27
+
28
+
29
+ @dataclass
30
+ class DynamicReplicaFrameAnnotation:
31
+ """A dataclass used to load annotations from json."""
32
+
33
+ # can be used to join with `SequenceAnnotation`
34
+ sequence_name: str
35
+ # 0-based, continuous frame number within sequence
36
+ frame_number: int
37
+ # timestamp in seconds from the video start
38
+ frame_timestamp: float
39
+
40
+ image: ImageAnnotation
41
+ meta: Optional[Dict[str, Any]] = None
42
+
43
+ camera_name: Optional[str] = None
44
+ trajectories: Optional[str] = None
45
+
46
+
47
+ class DynamicReplicaDataset(data.Dataset):
48
+ def __init__(
49
+ self,
50
+ root,
51
+ split="valid",
52
+ traj_per_sample=256,
53
+ crop_size=None,
54
+ sample_len=-1,
55
+ only_first_n_samples=-1,
56
+ rgbd_input=False,
57
+ ):
58
+ super(DynamicReplicaDataset, self).__init__()
59
+ self.root = root
60
+ self.sample_len = sample_len
61
+ self.split = split
62
+ self.traj_per_sample = traj_per_sample
63
+ self.rgbd_input = rgbd_input
64
+ self.crop_size = crop_size
65
+ frame_annotations_file = f"frame_annotations_{split}.jgz"
66
+ self.sample_list = []
67
+ with gzip.open(
68
+ os.path.join(root, split, frame_annotations_file), "rt", encoding="utf8"
69
+ ) as zipfile:
70
+ frame_annots_list = load_dataclass(zipfile, List[DynamicReplicaFrameAnnotation])
71
+ seq_annot = defaultdict(list)
72
+ for frame_annot in frame_annots_list:
73
+ if frame_annot.camera_name == "left":
74
+ seq_annot[frame_annot.sequence_name].append(frame_annot)
75
+
76
+ for seq_name in seq_annot.keys():
77
+ seq_len = len(seq_annot[seq_name])
78
+
79
+ step = self.sample_len if self.sample_len > 0 else seq_len
80
+ counter = 0
81
+
82
+ for ref_idx in range(0, seq_len, step):
83
+ sample = seq_annot[seq_name][ref_idx : ref_idx + step]
84
+ self.sample_list.append(sample)
85
+ counter += 1
86
+ if only_first_n_samples > 0 and counter >= only_first_n_samples:
87
+ break
88
+
89
+ def __len__(self):
90
+ return len(self.sample_list)
91
+
92
+ def crop(self, rgbs, trajs):
93
+ T, N, _ = trajs.shape
94
+
95
+ S = len(rgbs)
96
+ H, W = rgbs[0].shape[:2]
97
+ assert S == T
98
+
99
+ H_new = H
100
+ W_new = W
101
+
102
+ # simple random crop
103
+ y0 = 0 if self.crop_size[0] >= H_new else (H_new - self.crop_size[0]) // 2
104
+ x0 = 0 if self.crop_size[1] >= W_new else (W_new - self.crop_size[1]) // 2
105
+ rgbs = [rgb[y0 : y0 + self.crop_size[0], x0 : x0 + self.crop_size[1]] for rgb in rgbs]
106
+
107
+ trajs[:, :, 0] -= x0
108
+ trajs[:, :, 1] -= y0
109
+
110
+ return rgbs, trajs
111
+
112
+ def __getitem__(self, index):
113
+ sample = self.sample_list[index]
114
+ T = len(sample)
115
+ rgbs, visibilities, traj_2d = [], [], []
116
+
117
+ H, W = sample[0].image.size
118
+ image_size = (H, W)
119
+
120
+ for i in range(T):
121
+ traj_path = os.path.join(self.root, self.split, sample[i].trajectories["path"])
122
+ traj = torch.load(traj_path)
123
+
124
+ visibilities.append(traj["verts_inds_vis"].numpy())
125
+
126
+ rgbs.append(traj["img"].numpy())
127
+ traj_2d.append(traj["traj_2d"].numpy()[..., :2])
128
+
129
+ traj_2d = np.stack(traj_2d)
130
+ visibility = np.stack(visibilities)
131
+ T, N, D = traj_2d.shape
132
+ # subsample trajectories for augmentations
133
+ visible_inds_sampled = torch.randperm(N)[: self.traj_per_sample]
134
+
135
+ traj_2d = traj_2d[:, visible_inds_sampled]
136
+ visibility = visibility[:, visible_inds_sampled]
137
+
138
+ if self.crop_size is not None:
139
+ rgbs, traj_2d = self.crop(rgbs, traj_2d)
140
+ H, W, _ = rgbs[0].shape
141
+ image_size = self.crop_size
142
+
143
+ visibility[traj_2d[:, :, 0] > image_size[1] - 1] = False
144
+ visibility[traj_2d[:, :, 0] < 0] = False
145
+ visibility[traj_2d[:, :, 1] > image_size[0] - 1] = False
146
+ visibility[traj_2d[:, :, 1] < 0] = False
147
+
148
+ # filter out points that are visible for fewer than 10 frames
149
+ visible_inds_resampled = visibility.sum(0) > 10
150
+ traj_2d = torch.from_numpy(traj_2d[:, visible_inds_resampled])
151
+ visibility = torch.from_numpy(visibility[:, visible_inds_resampled])
152
+
153
+ rgbs = np.stack(rgbs, 0)
154
+ video = torch.from_numpy(rgbs).reshape(T, H, W, 3).permute(0, 3, 1, 2).float()
155
+ return CoTrackerData(
156
+ video=video,
157
+ trajectory=traj_2d,
158
+ visibility=visibility,
159
+ valid=torch.ones(T, N),
160
+ seq_name=sample[0].sequence_name,
161
+ )
cotracker/build/lib/datasets/kubric_movif_dataset.py ADDED
@@ -0,0 +1,441 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+ import torch
9
+ import cv2
10
+
11
+ import imageio
12
+ import numpy as np
13
+
14
+ from cotracker.datasets.utils import CoTrackerData
15
+ from torchvision.transforms import ColorJitter, GaussianBlur
16
+ from PIL import Image
17
+
18
+
19
+ class CoTrackerDataset(torch.utils.data.Dataset):
20
+ def __init__(
21
+ self,
22
+ data_root,
23
+ crop_size=(384, 512),
24
+ seq_len=24,
25
+ traj_per_sample=768,
26
+ sample_vis_1st_frame=False,
27
+ use_augs=False,
28
+ ):
29
+ super(CoTrackerDataset, self).__init__()
30
+ np.random.seed(0)
31
+ torch.manual_seed(0)
32
+ self.data_root = data_root
33
+ self.seq_len = seq_len
34
+ self.traj_per_sample = traj_per_sample
35
+ self.sample_vis_1st_frame = sample_vis_1st_frame
36
+ self.use_augs = use_augs
37
+ self.crop_size = crop_size
38
+
39
+ # photometric augmentation
40
+ self.photo_aug = ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.25 / 3.14)
41
+ self.blur_aug = GaussianBlur(11, sigma=(0.1, 2.0))
42
+
43
+ self.blur_aug_prob = 0.25
44
+ self.color_aug_prob = 0.25
45
+
46
+ # occlusion augmentation
47
+ self.eraser_aug_prob = 0.5
48
+ self.eraser_bounds = [2, 100]
49
+ self.eraser_max = 10
50
+
51
+ # occlusion augmentation
52
+ self.replace_aug_prob = 0.5
53
+ self.replace_bounds = [2, 100]
54
+ self.replace_max = 10
55
+
56
+ # spatial augmentations
57
+ self.pad_bounds = [0, 100]
58
+ self.crop_size = crop_size
59
+ self.resize_lim = [0.25, 2.0] # sample resizes from here
60
+ self.resize_delta = 0.2
61
+ self.max_crop_offset = 50
62
+
63
+ self.do_flip = True
64
+ self.h_flip_prob = 0.5
65
+ self.v_flip_prob = 0.5
66
+
67
+ def getitem_helper(self, index):
68
+ raise NotImplementedError
69
+
70
+ def __getitem__(self, index):
71
+ gotit = False
72
+
73
+ sample, gotit = self.getitem_helper(index)
74
+ if not gotit:
75
+ print("warning: sampling failed")
76
+ # fake sample, so we can still collate
77
+ sample = CoTrackerData(
78
+ video=torch.zeros((self.seq_len, 3, self.crop_size[0], self.crop_size[1])),
79
+ trajectory=torch.zeros((self.seq_len, self.traj_per_sample, 2)),
80
+ visibility=torch.zeros((self.seq_len, self.traj_per_sample)),
81
+ valid=torch.zeros((self.seq_len, self.traj_per_sample)),
82
+ )
83
+
84
+ return sample, gotit
85
+
86
+ def add_photometric_augs(self, rgbs, trajs, visibles, eraser=True, replace=True):
87
+ T, N, _ = trajs.shape
88
+
89
+ S = len(rgbs)
90
+ H, W = rgbs[0].shape[:2]
91
+ assert S == T
92
+
93
+ if eraser:
94
+ ############ eraser transform (per image after the first) ############
95
+ rgbs = [rgb.astype(np.float32) for rgb in rgbs]
96
+ for i in range(1, S):
97
+ if np.random.rand() < self.eraser_aug_prob:
98
+ for _ in range(
99
+ np.random.randint(1, self.eraser_max + 1)
100
+ ): # number of times to occlude
101
+ xc = np.random.randint(0, W)
102
+ yc = np.random.randint(0, H)
103
+ dx = np.random.randint(self.eraser_bounds[0], self.eraser_bounds[1])
104
+ dy = np.random.randint(self.eraser_bounds[0], self.eraser_bounds[1])
105
+ x0 = np.clip(xc - dx / 2, 0, W - 1).round().astype(np.int32)
106
+ x1 = np.clip(xc + dx / 2, 0, W - 1).round().astype(np.int32)
107
+ y0 = np.clip(yc - dy / 2, 0, H - 1).round().astype(np.int32)
108
+ y1 = np.clip(yc + dy / 2, 0, H - 1).round().astype(np.int32)
109
+
110
+ mean_color = np.mean(rgbs[i][y0:y1, x0:x1, :].reshape(-1, 3), axis=0)
111
+ rgbs[i][y0:y1, x0:x1, :] = mean_color
112
+
113
+ occ_inds = np.logical_and(
114
+ np.logical_and(trajs[i, :, 0] >= x0, trajs[i, :, 0] < x1),
115
+ np.logical_and(trajs[i, :, 1] >= y0, trajs[i, :, 1] < y1),
116
+ )
117
+ visibles[i, occ_inds] = 0
118
+ rgbs = [rgb.astype(np.uint8) for rgb in rgbs]
119
+
120
+ if replace:
121
+ rgbs_alt = [
122
+ np.array(self.photo_aug(Image.fromarray(rgb)), dtype=np.uint8) for rgb in rgbs
123
+ ]
124
+ rgbs_alt = [
125
+ np.array(self.photo_aug(Image.fromarray(rgb)), dtype=np.uint8) for rgb in rgbs_alt
126
+ ]
127
+
128
+ ############ replace transform (per image after the first) ############
129
+ rgbs = [rgb.astype(np.float32) for rgb in rgbs]
130
+ rgbs_alt = [rgb.astype(np.float32) for rgb in rgbs_alt]
131
+ for i in range(1, S):
132
+ if np.random.rand() < self.replace_aug_prob:
133
+ for _ in range(
134
+ np.random.randint(1, self.replace_max + 1)
135
+ ): # number of times to occlude
136
+ xc = np.random.randint(0, W)
137
+ yc = np.random.randint(0, H)
138
+ dx = np.random.randint(self.replace_bounds[0], self.replace_bounds[1])
139
+ dy = np.random.randint(self.replace_bounds[0], self.replace_bounds[1])
140
+ x0 = np.clip(xc - dx / 2, 0, W - 1).round().astype(np.int32)
141
+ x1 = np.clip(xc + dx / 2, 0, W - 1).round().astype(np.int32)
142
+ y0 = np.clip(yc - dy / 2, 0, H - 1).round().astype(np.int32)
143
+ y1 = np.clip(yc + dy / 2, 0, H - 1).round().astype(np.int32)
144
+
145
+ wid = x1 - x0
146
+ hei = y1 - y0
147
+ y00 = np.random.randint(0, H - hei)
148
+ x00 = np.random.randint(0, W - wid)
149
+ fr = np.random.randint(0, S)
150
+ rep = rgbs_alt[fr][y00 : y00 + hei, x00 : x00 + wid, :]
151
+ rgbs[i][y0:y1, x0:x1, :] = rep
152
+
153
+ occ_inds = np.logical_and(
154
+ np.logical_and(trajs[i, :, 0] >= x0, trajs[i, :, 0] < x1),
155
+ np.logical_and(trajs[i, :, 1] >= y0, trajs[i, :, 1] < y1),
156
+ )
157
+ visibles[i, occ_inds] = 0
158
+ rgbs = [rgb.astype(np.uint8) for rgb in rgbs]
159
+
160
+ ############ photometric augmentation ############
161
+ if np.random.rand() < self.color_aug_prob:
162
+ # random per-frame amount of aug
163
+ rgbs = [np.array(self.photo_aug(Image.fromarray(rgb)), dtype=np.uint8) for rgb in rgbs]
164
+
165
+ if np.random.rand() < self.blur_aug_prob:
166
+ # random per-frame amount of blur
167
+ rgbs = [np.array(self.blur_aug(Image.fromarray(rgb)), dtype=np.uint8) for rgb in rgbs]
168
+
169
+ return rgbs, trajs, visibles
170
+
171
+ def add_spatial_augs(self, rgbs, trajs, visibles):
172
+ T, N, __ = trajs.shape
173
+
174
+ S = len(rgbs)
175
+ H, W = rgbs[0].shape[:2]
176
+ assert S == T
177
+
178
+ rgbs = [rgb.astype(np.float32) for rgb in rgbs]
179
+
180
+ ############ spatial transform ############
181
+
182
+ # padding
183
+ pad_x0 = np.random.randint(self.pad_bounds[0], self.pad_bounds[1])
184
+ pad_x1 = np.random.randint(self.pad_bounds[0], self.pad_bounds[1])
185
+ pad_y0 = np.random.randint(self.pad_bounds[0], self.pad_bounds[1])
186
+ pad_y1 = np.random.randint(self.pad_bounds[0], self.pad_bounds[1])
187
+
188
+ rgbs = [np.pad(rgb, ((pad_y0, pad_y1), (pad_x0, pad_x1), (0, 0))) for rgb in rgbs]
189
+ trajs[:, :, 0] += pad_x0
190
+ trajs[:, :, 1] += pad_y0
191
+ H, W = rgbs[0].shape[:2]
192
+
193
+ # scaling + stretching
194
+ scale = np.random.uniform(self.resize_lim[0], self.resize_lim[1])
195
+ scale_x = scale
196
+ scale_y = scale
197
+ H_new = H
198
+ W_new = W
199
+
200
+ scale_delta_x = 0.0
201
+ scale_delta_y = 0.0
202
+
203
+ rgbs_scaled = []
204
+ for s in range(S):
205
+ if s == 1:
206
+ scale_delta_x = np.random.uniform(-self.resize_delta, self.resize_delta)
207
+ scale_delta_y = np.random.uniform(-self.resize_delta, self.resize_delta)
208
+ elif s > 1:
209
+ scale_delta_x = (
210
+ scale_delta_x * 0.8
211
+ + np.random.uniform(-self.resize_delta, self.resize_delta) * 0.2
212
+ )
213
+ scale_delta_y = (
214
+ scale_delta_y * 0.8
215
+ + np.random.uniform(-self.resize_delta, self.resize_delta) * 0.2
216
+ )
217
+ scale_x = scale_x + scale_delta_x
218
+ scale_y = scale_y + scale_delta_y
219
+
220
+ # bring h/w closer
221
+ scale_xy = (scale_x + scale_y) * 0.5
222
+ scale_x = scale_x * 0.5 + scale_xy * 0.5
223
+ scale_y = scale_y * 0.5 + scale_xy * 0.5
224
+
225
+ # don't get too crazy
226
+ scale_x = np.clip(scale_x, 0.2, 2.0)
227
+ scale_y = np.clip(scale_y, 0.2, 2.0)
228
+
229
+ H_new = int(H * scale_y)
230
+ W_new = int(W * scale_x)
231
+
232
+ # make it at least slightly bigger than the crop area,
233
+ # so that the random cropping can add diversity
234
+ H_new = np.clip(H_new, self.crop_size[0] + 10, None)
235
+ W_new = np.clip(W_new, self.crop_size[1] + 10, None)
236
+ # recompute scale in case we clipped
237
+ scale_x = (W_new - 1) / float(W - 1)
238
+ scale_y = (H_new - 1) / float(H - 1)
239
+ rgbs_scaled.append(cv2.resize(rgbs[s], (W_new, H_new), interpolation=cv2.INTER_LINEAR))
240
+ trajs[s, :, 0] *= scale_x
241
+ trajs[s, :, 1] *= scale_y
242
+ rgbs = rgbs_scaled
243
+
244
+ ok_inds = visibles[0, :] > 0
245
+ vis_trajs = trajs[:, ok_inds] # S,?,2
246
+
247
+ if vis_trajs.shape[1] > 0:
248
+ mid_x = np.mean(vis_trajs[0, :, 0])
249
+ mid_y = np.mean(vis_trajs[0, :, 1])
250
+ else:
251
+ mid_y = self.crop_size[0]
252
+ mid_x = self.crop_size[1]
253
+
254
+ x0 = int(mid_x - self.crop_size[1] // 2)
255
+ y0 = int(mid_y - self.crop_size[0] // 2)
256
+
257
+ offset_x = 0
258
+ offset_y = 0
259
+
260
+ for s in range(S):
261
+ # on each frame, shift a bit more
262
+ if s == 1:
263
+ offset_x = np.random.randint(-self.max_crop_offset, self.max_crop_offset)
264
+ offset_y = np.random.randint(-self.max_crop_offset, self.max_crop_offset)
265
+ elif s > 1:
266
+ offset_x = int(
267
+ offset_x * 0.8
268
+ + np.random.randint(-self.max_crop_offset, self.max_crop_offset + 1) * 0.2
269
+ )
270
+ offset_y = int(
271
+ offset_y * 0.8
272
+ + np.random.randint(-self.max_crop_offset, self.max_crop_offset + 1) * 0.2
273
+ )
274
+ x0 = x0 + offset_x
275
+ y0 = y0 + offset_y
276
+
277
+ H_new, W_new = rgbs[s].shape[:2]
278
+ if H_new == self.crop_size[0]:
279
+ y0 = 0
280
+ else:
281
+ y0 = min(max(0, y0), H_new - self.crop_size[0] - 1)
282
+
283
+ if W_new == self.crop_size[1]:
284
+ x0 = 0
285
+ else:
286
+ x0 = min(max(0, x0), W_new - self.crop_size[1] - 1)
287
+
288
+ rgbs[s] = rgbs[s][y0 : y0 + self.crop_size[0], x0 : x0 + self.crop_size[1]]
289
+ trajs[s, :, 0] -= x0
290
+ trajs[s, :, 1] -= y0
291
+
292
+ H_new = self.crop_size[0]
293
+ W_new = self.crop_size[1]
294
+
295
+ # flip
296
+ h_flipped = False
297
+ v_flipped = False
298
+ if self.do_flip:
299
+ # h flip
300
+ if np.random.rand() < self.h_flip_prob:
301
+ h_flipped = True
302
+ rgbs = [rgb[:, ::-1] for rgb in rgbs]
303
+ # v flip
304
+ if np.random.rand() < self.v_flip_prob:
305
+ v_flipped = True
306
+ rgbs = [rgb[::-1] for rgb in rgbs]
307
+ if h_flipped:
308
+ trajs[:, :, 0] = W_new - trajs[:, :, 0]
309
+ if v_flipped:
310
+ trajs[:, :, 1] = H_new - trajs[:, :, 1]
311
+
312
+ return rgbs, trajs
313
+
314
+ def crop(self, rgbs, trajs):
315
+ T, N, _ = trajs.shape
316
+
317
+ S = len(rgbs)
318
+ H, W = rgbs[0].shape[:2]
319
+ assert S == T
320
+
321
+ ############ spatial transform ############
322
+
323
+ H_new = H
324
+ W_new = W
325
+
326
+ # simple random crop
327
+ y0 = 0 if self.crop_size[0] >= H_new else np.random.randint(0, H_new - self.crop_size[0])
328
+ x0 = 0 if self.crop_size[1] >= W_new else np.random.randint(0, W_new - self.crop_size[1])
329
+ rgbs = [rgb[y0 : y0 + self.crop_size[0], x0 : x0 + self.crop_size[1]] for rgb in rgbs]
330
+
331
+ trajs[:, :, 0] -= x0
332
+ trajs[:, :, 1] -= y0
333
+
334
+ return rgbs, trajs
335
+
336
+
337
+ class KubricMovifDataset(CoTrackerDataset):
338
+ def __init__(
339
+ self,
340
+ data_root,
341
+ crop_size=(384, 512),
342
+ seq_len=24,
343
+ traj_per_sample=768,
344
+ sample_vis_1st_frame=False,
345
+ use_augs=False,
346
+ ):
347
+ super(KubricMovifDataset, self).__init__(
348
+ data_root=data_root,
349
+ crop_size=crop_size,
350
+ seq_len=seq_len,
351
+ traj_per_sample=traj_per_sample,
352
+ sample_vis_1st_frame=sample_vis_1st_frame,
353
+ use_augs=use_augs,
354
+ )
355
+
356
+ self.pad_bounds = [0, 25]
357
+ self.resize_lim = [0.75, 1.25] # sample resizes from here
358
+ self.resize_delta = 0.05
359
+ self.max_crop_offset = 15
360
+ self.seq_names = [
361
+ fname
362
+ for fname in os.listdir(data_root)
363
+ if os.path.isdir(os.path.join(data_root, fname))
364
+ ]
365
+ print("found %d unique videos in %s" % (len(self.seq_names), self.data_root))
366
+
367
+ def getitem_helper(self, index):
368
+ gotit = True
369
+ seq_name = self.seq_names[index]
370
+
371
+ npy_path = os.path.join(self.data_root, seq_name, seq_name + ".npy")
372
+ rgb_path = os.path.join(self.data_root, seq_name, "frames")
373
+
374
+ img_paths = sorted(os.listdir(rgb_path))
375
+ rgbs = []
376
+ for i, img_path in enumerate(img_paths):
377
+ rgbs.append(imageio.v2.imread(os.path.join(rgb_path, img_path)))
378
+
379
+ rgbs = np.stack(rgbs)
380
+ annot_dict = np.load(npy_path, allow_pickle=True).item()
381
+ traj_2d = annot_dict["coords"]
382
+ visibility = annot_dict["visibility"]
383
+
384
+ # random crop
385
+ assert self.seq_len <= len(rgbs)
386
+ if self.seq_len < len(rgbs):
387
+ start_ind = np.random.choice(len(rgbs) - self.seq_len, 1)[0]
388
+
389
+ rgbs = rgbs[start_ind : start_ind + self.seq_len]
390
+ traj_2d = traj_2d[:, start_ind : start_ind + self.seq_len]
391
+ visibility = visibility[:, start_ind : start_ind + self.seq_len]
392
+
393
+ traj_2d = np.transpose(traj_2d, (1, 0, 2))
394
+ visibility = np.transpose(np.logical_not(visibility), (1, 0))
395
+ if self.use_augs:
396
+ rgbs, traj_2d, visibility = self.add_photometric_augs(rgbs, traj_2d, visibility)
397
+ rgbs, traj_2d = self.add_spatial_augs(rgbs, traj_2d, visibility)
398
+ else:
399
+ rgbs, traj_2d = self.crop(rgbs, traj_2d)
400
+
401
+ visibility[traj_2d[:, :, 0] > self.crop_size[1] - 1] = False
402
+ visibility[traj_2d[:, :, 0] < 0] = False
403
+ visibility[traj_2d[:, :, 1] > self.crop_size[0] - 1] = False
404
+ visibility[traj_2d[:, :, 1] < 0] = False
405
+
406
+ visibility = torch.from_numpy(visibility)
407
+ traj_2d = torch.from_numpy(traj_2d)
408
+
409
+ visibile_pts_first_frame_inds = (visibility[0]).nonzero(as_tuple=False)[:, 0]
410
+
411
+ if self.sample_vis_1st_frame:
412
+ visibile_pts_inds = visibile_pts_first_frame_inds
413
+ else:
414
+ visibile_pts_mid_frame_inds = (visibility[self.seq_len // 2]).nonzero(as_tuple=False)[
415
+ :, 0
416
+ ]
417
+ visibile_pts_inds = torch.cat(
418
+ (visibile_pts_first_frame_inds, visibile_pts_mid_frame_inds), dim=0
419
+ )
420
+ point_inds = torch.randperm(len(visibile_pts_inds))[: self.traj_per_sample]
421
+ if len(point_inds) < self.traj_per_sample:
422
+ gotit = False
423
+
424
+ visible_inds_sampled = visibile_pts_inds[point_inds]
425
+
426
+ trajs = traj_2d[:, visible_inds_sampled].float()
427
+ visibles = visibility[:, visible_inds_sampled]
428
+ valids = torch.ones((self.seq_len, self.traj_per_sample))
429
+
430
+ rgbs = torch.from_numpy(np.stack(rgbs)).permute(0, 3, 1, 2).float()
431
+ sample = CoTrackerData(
432
+ video=rgbs,
433
+ trajectory=trajs,
434
+ visibility=visibles,
435
+ valid=valids,
436
+ seq_name=seq_name,
437
+ )
438
+ return sample, gotit
439
+
440
+ def __len__(self):
441
+ return len(self.seq_names)
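A hedged sketch of instantiating the dataset above; the `data_root` path is a placeholder, and the keyword values simply restate the constructor defaults.

```python
# Sketch only: data_root is a placeholder pointing at a local Kubric MOVi-f dump.
dataset = KubricMovifDataset(
    data_root="datasets/kubric_movif",  # hypothetical path
    crop_size=(384, 512),
    seq_len=24,
    traj_per_sample=768,
    use_augs=True,
)
sample, gotit = dataset[0]  # CoTrackerData plus a flag telling whether sampling succeeded
print(sample.video.shape)   # (seq_len, 3, crop_size[0], crop_size[1]) when sampling succeeds
```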
cotracker/build/lib/datasets/tap_vid_datasets.py ADDED
@@ -0,0 +1,209 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+ import io
9
+ import glob
10
+ import torch
11
+ import pickle
12
+ import numpy as np
13
+ import mediapy as media
14
+
15
+ from PIL import Image
16
+ from typing import Mapping, Tuple, Union
17
+
18
+ from cotracker.datasets.utils import CoTrackerData
19
+
20
+ DatasetElement = Mapping[str, Mapping[str, Union[np.ndarray, str]]]
21
+
22
+
23
+ def resize_video(video: np.ndarray, output_size: Tuple[int, int]) -> np.ndarray:
24
+ """Resize a video to output_size."""
25
+ # If you have a GPU, consider replacing this with a GPU-enabled resize op,
26
+ # such as a jitted jax.image.resize. It will make things faster.
27
+ return media.resize_video(video, output_size)
28
+
29
+
30
+ def sample_queries_first(
31
+ target_occluded: np.ndarray,
32
+ target_points: np.ndarray,
33
+ frames: np.ndarray,
34
+ ) -> Mapping[str, np.ndarray]:
35
+ """Package a set of frames and tracks for use in TAPNet evaluations.
36
+ Given a set of frames and tracks with no query points, use the first
37
+ visible point in each track as the query.
38
+ Args:
39
+ target_occluded: Boolean occlusion flag, of shape [n_tracks, n_frames],
40
+ where True indicates occluded.
41
+ target_points: Position, of shape [n_tracks, n_frames, 2], where each point
42
+ is [x,y] scaled between 0 and 1.
43
+ frames: Video tensor, of shape [n_frames, height, width, 3]. Scaled between
44
+ -1 and 1.
45
+ Returns:
46
+ A dict with the keys:
47
+ video: Video tensor of shape [1, n_frames, height, width, 3]
48
+ query_points: Query points of shape [1, n_queries, 3] where
49
+ each point is [t, y, x] scaled to the range [-1, 1]
50
+ target_points: Target points of shape [1, n_queries, n_frames, 2] where
51
+ each point is [x, y] scaled to the range [-1, 1]
52
+ """
53
+ valid = np.sum(~target_occluded, axis=1) > 0
54
+ target_points = target_points[valid, :]
55
+ target_occluded = target_occluded[valid, :]
56
+
57
+ query_points = []
58
+ for i in range(target_points.shape[0]):
59
+ index = np.where(target_occluded[i] == 0)[0][0]
60
+ x, y = target_points[i, index, 0], target_points[i, index, 1]
61
+ query_points.append(np.array([index, y, x])) # [t, y, x]
62
+ query_points = np.stack(query_points, axis=0)
63
+
64
+ return {
65
+ "video": frames[np.newaxis, ...],
66
+ "query_points": query_points[np.newaxis, ...],
67
+ "target_points": target_points[np.newaxis, ...],
68
+ "occluded": target_occluded[np.newaxis, ...],
69
+ }
70
+
71
+
72
+ def sample_queries_strided(
73
+ target_occluded: np.ndarray,
74
+ target_points: np.ndarray,
75
+ frames: np.ndarray,
76
+ query_stride: int = 5,
77
+ ) -> Mapping[str, np.ndarray]:
78
+ """Package a set of frames and tracks for use in TAPNet evaluations.
79
+
80
+ Given a set of frames and tracks with no query points, sample queries
81
+ strided every query_stride frames, ignoring points that are not visible
82
+ at the selected frames.
83
+
84
+ Args:
85
+ target_occluded: Boolean occlusion flag, of shape [n_tracks, n_frames],
86
+ where True indicates occluded.
87
+ target_points: Position, of shape [n_tracks, n_frames, 2], where each point
88
+ is [x,y] scaled between 0 and 1.
89
+ frames: Video tensor, of shape [n_frames, height, width, 3]. Scaled between
90
+ -1 and 1.
91
+ query_stride: When sampling query points, search for un-occluded points
92
+ every query_stride frames and convert each one into a query.
93
+
94
+ Returns:
95
+ A dict with the keys:
96
+ video: Video tensor of shape [1, n_frames, height, width, 3]. The video
97
+ has floats scaled to the range [-1, 1].
98
+ query_points: Query points of shape [1, n_queries, 3] where
99
+ each point is [t, y, x] scaled to the range [-1, 1].
100
+ target_points: Target points of shape [1, n_queries, n_frames, 2] where
101
+ each point is [x, y] scaled to the range [-1, 1].
102
+ trackgroup: Index of the original track that each query point was
103
+ sampled from. This is useful for visualization.
104
+ """
105
+ tracks = []
106
+ occs = []
107
+ queries = []
108
+ trackgroups = []
109
+ total = 0
110
+ trackgroup = np.arange(target_occluded.shape[0])
111
+ for i in range(0, target_occluded.shape[1], query_stride):
112
+ mask = target_occluded[:, i] == 0
113
+ query = np.stack(
114
+ [
115
+ i * np.ones(target_occluded.shape[0:1]),
116
+ target_points[:, i, 1],
117
+ target_points[:, i, 0],
118
+ ],
119
+ axis=-1,
120
+ )
121
+ queries.append(query[mask])
122
+ tracks.append(target_points[mask])
123
+ occs.append(target_occluded[mask])
124
+ trackgroups.append(trackgroup[mask])
125
+ total += np.array(np.sum(target_occluded[:, i] == 0))
126
+
127
+ return {
128
+ "video": frames[np.newaxis, ...],
129
+ "query_points": np.concatenate(queries, axis=0)[np.newaxis, ...],
130
+ "target_points": np.concatenate(tracks, axis=0)[np.newaxis, ...],
131
+ "occluded": np.concatenate(occs, axis=0)[np.newaxis, ...],
132
+ "trackgroup": np.concatenate(trackgroups, axis=0)[np.newaxis, ...],
133
+ }
134
+
135
+
136
+ class TapVidDataset(torch.utils.data.Dataset):
137
+ def __init__(
138
+ self,
139
+ data_root,
140
+ dataset_type="davis",
141
+ resize_to_256=True,
142
+ queried_first=True,
143
+ ):
144
+ self.dataset_type = dataset_type
145
+ self.resize_to_256 = resize_to_256
146
+ self.queried_first = queried_first
147
+ if self.dataset_type == "kinetics":
148
+ all_paths = glob.glob(os.path.join(data_root, "*_of_0010.pkl"))
149
+ points_dataset = []
150
+ for pickle_path in all_paths:
151
+ with open(pickle_path, "rb") as f:
152
+ data = pickle.load(f)
153
+ points_dataset = points_dataset + data
154
+ self.points_dataset = points_dataset
155
+ else:
156
+ with open(data_root, "rb") as f:
157
+ self.points_dataset = pickle.load(f)
158
+ if self.dataset_type == "davis":
159
+ self.video_names = list(self.points_dataset.keys())
160
+ print("found %d unique videos in %s" % (len(self.points_dataset), data_root))
161
+
162
+ def __getitem__(self, index):
163
+ if self.dataset_type == "davis":
164
+ video_name = self.video_names[index]
165
+ else:
166
+ video_name = index
167
+ video = self.points_dataset[video_name]
168
+ frames = video["video"]
169
+
170
+ if isinstance(frames[0], bytes):
171
+ # TAP-Vid frames are stored as JPEG bytes rather than `np.ndarray`s.
172
+ def decode(frame):
173
+ byteio = io.BytesIO(frame)
174
+ img = Image.open(byteio)
175
+ return np.array(img)
176
+
177
+ frames = np.array([decode(frame) for frame in frames])
178
+
179
+ target_points = self.points_dataset[video_name]["points"]
180
+ if self.resize_to_256:
181
+ frames = resize_video(frames, [256, 256])
182
+ target_points *= np.array([255, 255]) # 1 should be mapped to 256-1
183
+ else:
184
+ target_points *= np.array([frames.shape[2] - 1, frames.shape[1] - 1])
185
+
186
+ target_occ = self.points_dataset[video_name]["occluded"]
187
+ if self.queried_first:
188
+ converted = sample_queries_first(target_occ, target_points, frames)
189
+ else:
190
+ converted = sample_queries_strided(target_occ, target_points, frames)
191
+ assert converted["target_points"].shape[1] == converted["query_points"].shape[1]
192
+
193
+ trajs = torch.from_numpy(converted["target_points"])[0].permute(1, 0, 2).float() # T, N, D
194
+
195
+ rgbs = torch.from_numpy(frames).permute(0, 3, 1, 2).float()
196
+ visibles = torch.logical_not(torch.from_numpy(converted["occluded"]))[0].permute(
197
+ 1, 0
198
+ ) # T, N
199
+ query_points = torch.from_numpy(converted["query_points"])[0] # T, N
200
+ return CoTrackerData(
201
+ rgbs,
202
+ trajs,
203
+ visibles,
204
+ seq_name=str(video_name),
205
+ query_points=query_points,
206
+ )
207
+
208
+ def __len__(self):
209
+ return len(self.points_dataset)
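To make the packaging performed by `sample_queries_first` concrete, here is a hedged sketch with tiny synthetic arrays (all values are made up; only the shapes matter):

```python
# Synthetic shapes only: 4 tracks over 8 frames, nothing occluded.
import numpy as np

target_occluded = np.zeros((4, 8), dtype=bool)         # [n_tracks, n_frames]
target_points = np.random.rand(4, 8, 2)                # [n_tracks, n_frames, 2], (x, y)
frames = np.zeros((8, 256, 256, 3), dtype=np.float32)  # [n_frames, H, W, 3], scaled to [-1, 1]

packaged = sample_queries_first(target_occluded, target_points, frames)
print(packaged["video"].shape)          # (1, 8, 256, 256, 3)
print(packaged["query_points"].shape)   # (1, 4, 3), each row is [t, y, x] at the first visible frame
print(packaged["target_points"].shape)  # (1, 4, 8, 2)
print(packaged["occluded"].shape)       # (1, 4, 8)
```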
cotracker/build/lib/datasets/utils.py ADDED
@@ -0,0 +1,106 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ import torch
9
+ import dataclasses
10
+ import torch.nn.functional as F
11
+ from dataclasses import dataclass
12
+ from typing import Any, Optional
13
+
14
+
15
+ @dataclass(eq=False)
16
+ class CoTrackerData:
17
+ """
18
+ Dataclass for storing video tracks data.
19
+ """
20
+
21
+ video: torch.Tensor # B, S, C, H, W
22
+ trajectory: torch.Tensor # B, S, N, 2
23
+ visibility: torch.Tensor # B, S, N
24
+ # optional data
25
+ valid: Optional[torch.Tensor] = None # B, S, N
26
+ segmentation: Optional[torch.Tensor] = None # B, S, 1, H, W
27
+ seq_name: Optional[str] = None
28
+ query_points: Optional[torch.Tensor] = None # TapVID evaluation format
29
+
30
+
31
+ def collate_fn(batch):
32
+ """
33
+ Collate function for video tracks data.
34
+ """
35
+ video = torch.stack([b.video for b in batch], dim=0)
36
+ trajectory = torch.stack([b.trajectory for b in batch], dim=0)
37
+ visibility = torch.stack([b.visibility for b in batch], dim=0)
38
+ query_points = segmentation = None
39
+ if batch[0].query_points is not None:
40
+ query_points = torch.stack([b.query_points for b in batch], dim=0)
41
+ if batch[0].segmentation is not None:
42
+ segmentation = torch.stack([b.segmentation for b in batch], dim=0)
43
+ seq_name = [b.seq_name for b in batch]
44
+
45
+ return CoTrackerData(
46
+ video=video,
47
+ trajectory=trajectory,
48
+ visibility=visibility,
49
+ segmentation=segmentation,
50
+ seq_name=seq_name,
51
+ query_points=query_points,
52
+ )
53
+
54
+
55
+ def collate_fn_train(batch):
56
+ """
57
+ Collate function for video tracks data during training.
58
+ """
59
+ gotit = [gotit for _, gotit in batch]
60
+ video = torch.stack([b.video for b, _ in batch], dim=0)
61
+ trajectory = torch.stack([b.trajectory for b, _ in batch], dim=0)
62
+ visibility = torch.stack([b.visibility for b, _ in batch], dim=0)
63
+ valid = torch.stack([b.valid for b, _ in batch], dim=0)
64
+ seq_name = [b.seq_name for b, _ in batch]
65
+ return (
66
+ CoTrackerData(
67
+ video=video,
68
+ trajectory=trajectory,
69
+ visibility=visibility,
70
+ valid=valid,
71
+ seq_name=seq_name,
72
+ ),
73
+ gotit,
74
+ )
75
+
76
+
77
+ def try_to_cuda(t: Any) -> Any:
78
+ """
79
+ Try to move the input variable `t` to a cuda device.
80
+
81
+ Args:
82
+ t: Input.
83
+
84
+ Returns:
85
+ t_cuda: `t` moved to a cuda device, if supported.
86
+ """
87
+ try:
88
+ t = t.float().cuda()
89
+ except AttributeError:
90
+ pass
91
+ return t
92
+
93
+
94
+ def dataclass_to_cuda_(obj):
95
+ """
96
+ Move all contents of a dataclass to cuda inplace if supported.
97
+
98
+ Args:
99
+ batch: Input dataclass.
100
+
101
+ Returns:
102
+ batch_cuda: `batch` moved to a cuda device, if supported.
103
+ """
104
+ for f in dataclasses.fields(obj):
105
+ setattr(obj, f.name, try_to_cuda(getattr(obj, f.name)))
106
+ return obj
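A hedged sketch of wiring `collate_fn` into a PyTorch `DataLoader`; `dataset` stands for any dataset yielding `CoTrackerData` samples (for example the TAP-Vid dataset defined earlier in this diff), and the device move assumes a CUDA GPU is available.

```python
# Sketch only: `dataset` is assumed to yield CoTrackerData samples of compatible shapes.
import torch

loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=1,           # TAP-Vid videos differ in length, so keep the batch small
    shuffle=False,
    collate_fn=collate_fn,  # stacks video / trajectory / visibility across the batch
)

for batch in loader:
    batch = dataclass_to_cuda_(batch)  # assumes a CUDA device; non-tensor fields are left untouched
    print(batch.video.shape)           # (B, S, C, H, W)
    break
```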
cotracker/build/lib/evaluation/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
cotracker/build/lib/evaluation/core/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
cotracker/build/lib/evaluation/core/eval_utils.py ADDED
@@ -0,0 +1,138 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import numpy as np
8
+
9
+ from typing import Iterable, Mapping, Tuple, Union
10
+
11
+
12
+ def compute_tapvid_metrics(
13
+ query_points: np.ndarray,
14
+ gt_occluded: np.ndarray,
15
+ gt_tracks: np.ndarray,
16
+ pred_occluded: np.ndarray,
17
+ pred_tracks: np.ndarray,
18
+ query_mode: str,
19
+ ) -> Mapping[str, np.ndarray]:
20
+ """Computes TAP-Vid metrics (Jaccard, Pts. Within Thresh, Occ. Acc.)
21
+ See the TAP-Vid paper for details on the metric computation. All inputs are
22
+ given in raster coordinates. The first three arguments should be the direct
23
+ outputs of the reader: the 'query_points', 'occluded', and 'target_points'.
24
+ The paper metrics assume these are scaled relative to 256x256 images.
25
+ pred_occluded and pred_tracks are your algorithm's predictions.
26
+ This function takes a batch of inputs, and computes metrics separately for
27
+ each video. The metrics for the full benchmark are a simple mean of the
28
+ metrics across the full set of videos. These numbers are between 0 and 1,
29
+ but the paper multiplies them by 100 to ease reading.
30
+ Args:
31
+ query_points: The query points, an in the format [t, y, x]. Its size is
32
+ [b, n, 3], where b is the batch size and n is the number of queries
33
+ gt_occluded: A boolean array of shape [b, n, t], where t is the number
34
+ of frames. True indicates that the point is occluded.
35
+ gt_tracks: The target points, of shape [b, n, t, 2]. Each point is
36
+ in the format [x, y]
37
+ pred_occluded: A boolean array of predicted occlusions, in the same
38
+ format as gt_occluded.
39
+ pred_tracks: An array of track predictions from your algorithm, in the
40
+ same format as gt_tracks.
41
+ query_mode: Either 'first' or 'strided', depending on how queries are
42
+ sampled. If 'first', we assume the prior knowledge that all points
43
+ before the query point are occluded, and these are removed from the
44
+ evaluation.
45
+ Returns:
46
+ A dict with the following keys:
47
+ occlusion_accuracy: Accuracy at predicting occlusion.
48
+ pts_within_{x} for x in [1, 2, 4, 8, 16]: Fraction of points
49
+ predicted to be within the given pixel threshold, ignoring occlusion
50
+ prediction.
51
+ jaccard_{x} for x in [1, 2, 4, 8, 16]: Jaccard metric for the given
52
+ threshold
53
+ average_pts_within_thresh: average across pts_within_{x}
54
+ average_jaccard: average across jaccard_{x}
55
+ """
56
+
57
+ metrics = {}
58
+ # Fixed bug is described in:
59
+ # https://github.com/facebookresearch/co-tracker/issues/20
60
+ eye = np.eye(gt_tracks.shape[2], dtype=np.int32)
61
+
62
+ if query_mode == "first":
63
+ # evaluate frames after the query frame
64
+ query_frame_to_eval_frames = np.cumsum(eye, axis=1) - eye
65
+ elif query_mode == "strided":
66
+ # evaluate all frames except the query frame
67
+ query_frame_to_eval_frames = 1 - eye
68
+ else:
69
+ raise ValueError("Unknown query mode " + query_mode)
70
+
71
+ query_frame = query_points[..., 0]
72
+ query_frame = np.round(query_frame).astype(np.int32)
73
+ evaluation_points = query_frame_to_eval_frames[query_frame] > 0
74
+
75
+ # Occlusion accuracy is simply how often the predicted occlusion equals the
76
+ # ground truth.
77
+ occ_acc = np.sum(
78
+ np.equal(pred_occluded, gt_occluded) & evaluation_points,
79
+ axis=(1, 2),
80
+ ) / np.sum(evaluation_points)
81
+ metrics["occlusion_accuracy"] = occ_acc
82
+
83
+ # Next, convert the predictions and ground truth positions into pixel
84
+ # coordinates.
85
+ visible = np.logical_not(gt_occluded)
86
+ pred_visible = np.logical_not(pred_occluded)
87
+ all_frac_within = []
88
+ all_jaccard = []
89
+ for thresh in [1, 2, 4, 8, 16]:
90
+ # True positives are points that are within the threshold and where both
91
+ # the prediction and the ground truth are listed as visible.
92
+ within_dist = np.sum(
93
+ np.square(pred_tracks - gt_tracks),
94
+ axis=-1,
95
+ ) < np.square(thresh)
96
+ is_correct = np.logical_and(within_dist, visible)
97
+
98
+ # Compute the frac_within_threshold, which is the fraction of points
99
+ # within the threshold among points that are visible in the ground truth,
100
+ # ignoring whether they're predicted to be visible.
101
+ count_correct = np.sum(
102
+ is_correct & evaluation_points,
103
+ axis=(1, 2),
104
+ )
105
+ count_visible_points = np.sum(visible & evaluation_points, axis=(1, 2))
106
+ frac_correct = count_correct / count_visible_points
107
+ metrics["pts_within_" + str(thresh)] = frac_correct
108
+ all_frac_within.append(frac_correct)
109
+
110
+ true_positives = np.sum(
111
+ is_correct & pred_visible & evaluation_points, axis=(1, 2)
112
+ )
113
+
114
+ # The denominator of the jaccard metric is the true positives plus
115
+ # false positives plus false negatives. However, note that true positives
116
+ # plus false negatives is simply the number of points in the ground truth
117
+ # which is easier to compute than trying to compute all three quantities.
118
+ # Thus we just add the number of points in the ground truth to the number
119
+ # of false positives.
120
+ #
121
+ # False positives are simply points that are predicted to be visible,
122
+ # but the ground truth is not visible or too far from the prediction.
123
+ gt_positives = np.sum(visible & evaluation_points, axis=(1, 2))
124
+ false_positives = (~visible) & pred_visible
125
+ false_positives = false_positives | ((~within_dist) & pred_visible)
126
+ false_positives = np.sum(false_positives & evaluation_points, axis=(1, 2))
127
+ jaccard = true_positives / (gt_positives + false_positives)
128
+ metrics["jaccard_" + str(thresh)] = jaccard
129
+ all_jaccard.append(jaccard)
130
+ metrics["average_jaccard"] = np.mean(
131
+ np.stack(all_jaccard, axis=1),
132
+ axis=1,
133
+ )
134
+ metrics["average_pts_within_thresh"] = np.mean(
135
+ np.stack(all_frac_within, axis=1),
136
+ axis=1,
137
+ )
138
+ return metrics
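A short, self-contained sketch (not part of the diff) of the array shapes `compute_tapvid_metrics` expects; the random predictions below are placeholders, with all queries placed on frame 0.

```python
import numpy as np

b, n, t = 1, 5, 24                                      # batch, queries, frames
query_points = np.zeros((b, n, 3), dtype=np.float32)    # [t, y, x] per query, frame 0 for all
gt_tracks = np.random.uniform(0, 256, (b, n, t, 2))     # [x, y] per frame, scaled to 256x256
gt_occluded = np.zeros((b, n, t), dtype=bool)           # all points visible
pred_tracks = gt_tracks + np.random.randn(b, n, t, 2)   # noisy "predictions"
pred_occluded = np.zeros((b, n, t), dtype=bool)

metrics = compute_tapvid_metrics(
    query_points, gt_occluded, gt_tracks, pred_occluded, pred_tracks, query_mode="first"
)
print(metrics["average_jaccard"], metrics["average_pts_within_thresh"])
```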
cotracker/build/lib/evaluation/core/evaluator.py ADDED
@@ -0,0 +1,253 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from collections import defaultdict
8
+ import os
9
+ from typing import Optional
10
+ import torch
11
+ from tqdm import tqdm
12
+ import numpy as np
13
+
14
+ from torch.utils.tensorboard import SummaryWriter
15
+ from cotracker.datasets.utils import dataclass_to_cuda_
16
+ from cotracker.utils.visualizer import Visualizer
17
+ from cotracker.models.core.model_utils import reduce_masked_mean
18
+ from cotracker.evaluation.core.eval_utils import compute_tapvid_metrics
19
+
20
+ import logging
21
+
22
+
23
+ class Evaluator:
24
+ """
25
+ A class defining the CoTracker evaluator.
26
+ """
27
+
28
+ def __init__(self, exp_dir) -> None:
29
+ # Visualization
30
+ self.exp_dir = exp_dir
31
+ os.makedirs(exp_dir, exist_ok=True)
32
+ self.visualization_filepaths = defaultdict(lambda: defaultdict(list))
33
+ self.visualize_dir = os.path.join(exp_dir, "visualisations")
34
+
35
+ def compute_metrics(self, metrics, sample, pred_trajectory, dataset_name):
36
+ if isinstance(pred_trajectory, tuple):
37
+ pred_trajectory, pred_visibility = pred_trajectory
38
+ else:
39
+ pred_visibility = None
40
+ if "tapvid" in dataset_name:
41
+ B, T, N, D = sample.trajectory.shape
42
+ traj = sample.trajectory.clone()
43
+ thr = 0.9
44
+
45
+ if pred_visibility is None:
46
+ logging.warning("visibility is NONE")
47
+ pred_visibility = torch.zeros_like(sample.visibility)
48
+
49
+ if not pred_visibility.dtype == torch.bool:
50
+ pred_visibility = pred_visibility > thr
51
+
52
+ query_points = sample.query_points.clone().cpu().numpy()
53
+
54
+ pred_visibility = pred_visibility[:, :, :N]
55
+ pred_trajectory = pred_trajectory[:, :, :N]
56
+
57
+ gt_tracks = traj.permute(0, 2, 1, 3).cpu().numpy()
58
+ gt_occluded = (
59
+ torch.logical_not(sample.visibility.clone().permute(0, 2, 1)).cpu().numpy()
60
+ )
61
+
62
+ pred_occluded = (
63
+ torch.logical_not(pred_visibility.clone().permute(0, 2, 1)).cpu().numpy()
64
+ )
65
+ pred_tracks = pred_trajectory.permute(0, 2, 1, 3).cpu().numpy()
66
+
67
+ out_metrics = compute_tapvid_metrics(
68
+ query_points,
69
+ gt_occluded,
70
+ gt_tracks,
71
+ pred_occluded,
72
+ pred_tracks,
73
+ query_mode="strided" if "strided" in dataset_name else "first",
74
+ )
75
+
76
+ metrics[sample.seq_name[0]] = out_metrics
77
+ for metric_name in out_metrics.keys():
78
+ if "avg" not in metrics:
79
+ metrics["avg"] = {}
80
+ metrics["avg"][metric_name] = np.mean(
81
+ [v[metric_name] for k, v in metrics.items() if k != "avg"]
82
+ )
83
+
84
+ logging.info(f"Metrics: {out_metrics}")
85
+ logging.info(f"avg: {metrics['avg']}")
86
+ print("metrics", out_metrics)
87
+ print("avg", metrics["avg"])
88
+ elif dataset_name == "dynamic_replica" or dataset_name == "pointodyssey":
89
+ *_, N, _ = sample.trajectory.shape
90
+ B, T, N = sample.visibility.shape
91
+ H, W = sample.video.shape[-2:]
92
+ device = sample.video.device
93
+
94
+ out_metrics = {}
95
+
96
+ d_vis_sum = d_occ_sum = d_sum_all = 0.0
97
+ thrs = [1, 2, 4, 8, 16]
98
+ sx_ = (W - 1) / 255.0
99
+ sy_ = (H - 1) / 255.0
100
+ sc_py = np.array([sx_, sy_]).reshape([1, 1, 2])
101
+ sc_pt = torch.from_numpy(sc_py).float().to(device)
102
+ __, first_visible_inds = torch.max(sample.visibility, dim=1)
103
+
104
+ frame_ids_tensor = torch.arange(T, device=device)[None, :, None].repeat(B, 1, N)
105
+ start_tracking_mask = frame_ids_tensor > (first_visible_inds.unsqueeze(1))
106
+
107
+ for thr in thrs:
108
+ d_ = (
109
+ torch.norm(
110
+ pred_trajectory[..., :2] / sc_pt - sample.trajectory[..., :2] / sc_pt,
111
+ dim=-1,
112
+ )
113
+ < thr
114
+ ).float() # B,S-1,N
115
+ d_occ = (
116
+ reduce_masked_mean(d_, (1 - sample.visibility) * start_tracking_mask).item()
117
+ * 100.0
118
+ )
119
+ d_occ_sum += d_occ
120
+ out_metrics[f"accuracy_occ_{thr}"] = d_occ
121
+
122
+ d_vis = (
123
+ reduce_masked_mean(d_, sample.visibility * start_tracking_mask).item() * 100.0
124
+ )
125
+ d_vis_sum += d_vis
126
+ out_metrics[f"accuracy_vis_{thr}"] = d_vis
127
+
128
+ d_all = reduce_masked_mean(d_, start_tracking_mask).item() * 100.0
129
+ d_sum_all += d_all
130
+ out_metrics[f"accuracy_{thr}"] = d_all
131
+
132
+ d_occ_avg = d_occ_sum / len(thrs)
133
+ d_vis_avg = d_vis_sum / len(thrs)
134
+ d_all_avg = d_sum_all / len(thrs)
135
+
136
+ sur_thr = 50
137
+ dists = torch.norm(
138
+ pred_trajectory[..., :2] / sc_pt - sample.trajectory[..., :2] / sc_pt,
139
+ dim=-1,
140
+ ) # B,S,N
141
+ dist_ok = 1 - (dists > sur_thr).float() * sample.visibility # B,S,N
142
+ survival = torch.cumprod(dist_ok, dim=1) # B,S,N
143
+ out_metrics["survival"] = torch.mean(survival).item() * 100.0
144
+
145
+ out_metrics["accuracy_occ"] = d_occ_avg
146
+ out_metrics["accuracy_vis"] = d_vis_avg
147
+ out_metrics["accuracy"] = d_all_avg
148
+
149
+ metrics[sample.seq_name[0]] = out_metrics
150
+ for metric_name in out_metrics.keys():
151
+ if "avg" not in metrics:
152
+ metrics["avg"] = {}
153
+ metrics["avg"][metric_name] = float(
154
+ np.mean([v[metric_name] for k, v in metrics.items() if k != "avg"])
155
+ )
156
+
157
+ logging.info(f"Metrics: {out_metrics}")
158
+ logging.info(f"avg: {metrics['avg']}")
159
+ print("metrics", out_metrics)
160
+ print("avg", metrics["avg"])
161
+
162
+ @torch.no_grad()
163
+ def evaluate_sequence(
164
+ self,
165
+ model,
166
+ test_dataloader: torch.utils.data.DataLoader,
167
+ dataset_name: str,
168
+ train_mode=False,
169
+ visualize_every: int = 1,
170
+ writer: Optional[SummaryWriter] = None,
171
+ step: Optional[int] = 0,
172
+ ):
173
+ metrics = {}
174
+
175
+ vis = Visualizer(
176
+ save_dir=self.exp_dir,
177
+ fps=7,
178
+ )
179
+
180
+ for ind, sample in enumerate(tqdm(test_dataloader)):
181
+ if isinstance(sample, tuple):
182
+ sample, gotit = sample
183
+ if not all(gotit):
184
+ print("batch is None")
185
+ continue
186
+ if torch.cuda.is_available():
187
+ dataclass_to_cuda_(sample)
188
+ device = torch.device("cuda")
189
+ else:
190
+ device = torch.device("cpu")
191
+
192
+ if (
193
+ not train_mode
194
+ and hasattr(model, "sequence_len")
195
+ and (sample.visibility[:, : model.sequence_len].sum() == 0)
196
+ ):
197
+ print(f"skipping batch {ind}")
198
+ continue
199
+
200
+ if "tapvid" in dataset_name:
201
+ queries = sample.query_points.clone().float()
202
+
203
+ queries = torch.stack(
204
+ [
205
+ queries[:, :, 0],
206
+ queries[:, :, 2],
207
+ queries[:, :, 1],
208
+ ],
209
+ dim=2,
210
+ ).to(device)
211
+ else:
212
+ queries = torch.cat(
213
+ [
214
+ torch.zeros_like(sample.trajectory[:, 0, :, :1]),
215
+ sample.trajectory[:, 0],
216
+ ],
217
+ dim=2,
218
+ ).to(device)
219
+
220
+ pred_tracks = model(sample.video, queries)
221
+ if "strided" in dataset_name:
222
+ inv_video = sample.video.flip(1).clone()
223
+ inv_queries = queries.clone()
224
+ inv_queries[:, :, 0] = inv_video.shape[1] - inv_queries[:, :, 0] - 1
225
+
226
+ pred_trj, pred_vsb = pred_tracks
227
+ inv_pred_trj, inv_pred_vsb = model(inv_video, inv_queries)
228
+
229
+ inv_pred_trj = inv_pred_trj.flip(1)
230
+ inv_pred_vsb = inv_pred_vsb.flip(1)
231
+
232
+ mask = pred_trj == 0
233
+
234
+ pred_trj[mask] = inv_pred_trj[mask]
235
+ pred_vsb[mask[:, :, :, 0]] = inv_pred_vsb[mask[:, :, :, 0]]
236
+
237
+ pred_tracks = pred_trj, pred_vsb
238
+
239
+ if dataset_name == "badja" or dataset_name == "fastcapture":
240
+ seq_name = sample.seq_name[0]
241
+ else:
242
+ seq_name = str(ind)
243
+ if ind % visualize_every == 0:
244
+ vis.visualize(
245
+ sample.video,
246
+ pred_tracks[0] if isinstance(pred_tracks, tuple) else pred_tracks,
247
+ filename=dataset_name + "_" + seq_name,
248
+ writer=writer,
249
+ step=step,
250
+ )
251
+
252
+ self.compute_metrics(metrics, sample, pred_tracks, dataset_name)
253
+ return metrics
cotracker/build/lib/evaluation/evaluate.py ADDED
@@ -0,0 +1,169 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import json
+ import os
+ from dataclasses import dataclass, field
+
+ import hydra
+ import numpy as np
+
+ import torch
+ from omegaconf import OmegaConf
+
+ from cotracker.datasets.tap_vid_datasets import TapVidDataset
+ from cotracker.datasets.dr_dataset import DynamicReplicaDataset
+ from cotracker.datasets.utils import collate_fn
+
+ from cotracker.models.evaluation_predictor import EvaluationPredictor
+
+ from cotracker.evaluation.core.evaluator import Evaluator
+ from cotracker.models.build_cotracker import (
+     build_cotracker,
+ )
+
+
+ @dataclass(eq=False)
+ class DefaultConfig:
+     # Directory where all outputs of the experiment will be saved.
+     exp_dir: str = "./outputs"
+
+     # Name of the dataset to be used for the evaluation.
+     dataset_name: str = "tapvid_davis_first"
+     # The root directory of the dataset.
+     dataset_root: str = "./"
+
+     # Path to the pre-trained model checkpoint to be used for the evaluation.
+     # The default value is the path to a specific CoTracker model checkpoint.
+     checkpoint: str = "./checkpoints/cotracker2.pth"
+
+     # EvaluationPredictor parameters
+     # The size (N) of the support grid used in the predictor.
+     # The total number of points is (N*N).
+     grid_size: int = 5
+     # The size (N) of the local support grid.
+     local_grid_size: int = 8
+     # A flag indicating whether to evaluate one ground truth point at a time.
+     single_point: bool = True
+     # The number of iterative updates for each sliding window.
+     n_iters: int = 6
+
+     seed: int = 0
+     gpu_idx: int = 0
+
+     # Override hydra's working directory to current working dir,
+     # also disable storing the .hydra logs:
+     hydra: dict = field(
+         default_factory=lambda: {
+             "run": {"dir": "."},
+             "output_subdir": None,
+         }
+     )
+
+
+ def run_eval(cfg: DefaultConfig):
+     """
+     The function evaluates CoTracker on a specified benchmark dataset based on a provided configuration.
+
+     Args:
+         cfg (DefaultConfig): An instance of DefaultConfig class which includes:
+             - exp_dir (str): The directory path for the experiment.
+             - dataset_name (str): The name of the dataset to be used.
+             - dataset_root (str): The root directory of the dataset.
+             - checkpoint (str): The path to the CoTracker model's checkpoint.
+             - single_point (bool): A flag indicating whether to evaluate one ground truth point at a time.
+             - n_iters (int): The number of iterative updates for each sliding window.
+             - seed (int): The seed for setting the random state for reproducibility.
+             - gpu_idx (int): The index of the GPU to be used.
+     """
+     # Creating the experiment directory if it doesn't exist
+     os.makedirs(cfg.exp_dir, exist_ok=True)
+
+     # Saving the experiment configuration to a .yaml file in the experiment directory
+     cfg_file = os.path.join(cfg.exp_dir, "expconfig.yaml")
+     with open(cfg_file, "w") as f:
+         OmegaConf.save(config=cfg, f=f)
+
+     evaluator = Evaluator(cfg.exp_dir)
+     cotracker_model = build_cotracker(cfg.checkpoint)
+
+     # Creating the EvaluationPredictor object
+     predictor = EvaluationPredictor(
+         cotracker_model,
+         grid_size=cfg.grid_size,
+         local_grid_size=cfg.local_grid_size,
+         single_point=cfg.single_point,
+         n_iters=cfg.n_iters,
+     )
+     if torch.cuda.is_available():
+         predictor.model = predictor.model.cuda()
+
+     # Setting the random seeds
+     torch.manual_seed(cfg.seed)
+     np.random.seed(cfg.seed)
+
+     # Constructing the specified dataset
+     curr_collate_fn = collate_fn
+     if "tapvid" in cfg.dataset_name:
+         dataset_type = cfg.dataset_name.split("_")[1]
+         if dataset_type == "davis":
+             data_root = os.path.join(cfg.dataset_root, "tapvid_davis", "tapvid_davis.pkl")
+         elif dataset_type == "kinetics":
+             data_root = os.path.join(
+                 cfg.dataset_root, "/kinetics/kinetics-dataset/k700-2020/tapvid_kinetics"
+             )
+         test_dataset = TapVidDataset(
+             dataset_type=dataset_type,
+             data_root=data_root,
+             queried_first="strided" not in cfg.dataset_name,
+         )
+     elif cfg.dataset_name == "dynamic_replica":
+         test_dataset = DynamicReplicaDataset(sample_len=300, only_first_n_samples=1)
+
+     # Creating the DataLoader object
+     test_dataloader = torch.utils.data.DataLoader(
+         test_dataset,
+         batch_size=1,
+         shuffle=False,
+         num_workers=14,
+         collate_fn=curr_collate_fn,
+     )
+
+     # Timing and conducting the evaluation
+     import time
+
+     start = time.time()
+     evaluate_result = evaluator.evaluate_sequence(
+         predictor,
+         test_dataloader,
+         dataset_name=cfg.dataset_name,
+     )
+     end = time.time()
+     print(end - start)
+
+     # Saving the evaluation results to a .json file
+     evaluate_result = evaluate_result["avg"]
+     print("evaluate_result", evaluate_result)
+     result_file = os.path.join(cfg.exp_dir, "result_eval_.json")
+     evaluate_result["time"] = end - start
+     print(f"Dumping eval results to {result_file}.")
+     with open(result_file, "w") as f:
+         json.dump(evaluate_result, f)
+
+
+ cs = hydra.core.config_store.ConfigStore.instance()
+ cs.store(name="default_config_eval", node=DefaultConfig)
+
+
+ @hydra.main(config_path="./configs/", config_name="default_config_eval")
+ def evaluate(cfg: DefaultConfig) -> None:
+     os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+     os.environ["CUDA_VISIBLE_DEVICES"] = str(cfg.gpu_idx)
+     run_eval(cfg)
+
+
+ if __name__ == "__main__":
+     evaluate()
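A hedged sketch (not part of the diff) of invoking `run_eval` directly from Python, bypassing the Hydra command-line entry point; the dataset and checkpoint paths are placeholders.

```python
from omegaconf import OmegaConf

cfg = OmegaConf.structured(DefaultConfig)          # typed config built from the dataclass above
cfg.dataset_name = "tapvid_davis_first"
cfg.dataset_root = "./datasets"                    # expects ./datasets/tapvid_davis/tapvid_davis.pkl
cfg.checkpoint = "./checkpoints/cotracker2.pth"
cfg.exp_dir = "./outputs/tapvid_davis_first"
run_eval(cfg)                                      # writes expconfig.yaml and result_eval_.json to exp_dir
```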
cotracker/build/lib/models/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
cotracker/build/lib/models/build_cotracker.py ADDED
@@ -0,0 +1,33 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import torch
+
+ from cotracker.models.core.cotracker.cotracker import CoTracker2
+
+
+ def build_cotracker(
+     checkpoint: str,
+ ):
+     if checkpoint is None:
+         return build_cotracker()
+     model_name = checkpoint.split("/")[-1].split(".")[0]
+     if model_name == "cotracker":
+         return build_cotracker(checkpoint=checkpoint)
+     else:
+         raise ValueError(f"Unknown model name {model_name}")
+
+
+ def build_cotracker(checkpoint=None):
+     cotracker = CoTracker2(stride=4, window_len=8, add_space_attn=True)
+
+     if checkpoint is not None:
+         with open(checkpoint, "rb") as f:
+             state_dict = torch.load(f, map_location="cpu")
+             if "model" in state_dict:
+                 state_dict = state_dict["model"]
+         cotracker.load_state_dict(state_dict)
+     return cotracker
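A brief sketch (not part of the diff) of restoring the tracker with the helper above; the checkpoint path is a placeholder.

```python
model = build_cotracker("./checkpoints/cotracker2.pth")  # CoTracker2 with weights loaded
model.eval()
# Expected inputs: video [B, T, 3, H, W] in [0, 255], queries [B, N, 3] as (frame, x, y);
# the forward pass returns (coords [B, T, N, 2], visibility [B, T, N], train_data).
```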
cotracker/build/lib/models/core/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
cotracker/build/lib/models/core/cotracker/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
cotracker/build/lib/models/core/cotracker/blocks.py ADDED
@@ -0,0 +1,367 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from functools import partial
11
+ from typing import Callable
12
+ import collections
13
+ from torch import Tensor
14
+ from itertools import repeat
15
+
16
+ from cotracker.models.core.model_utils import bilinear_sampler
17
+
18
+
19
+ # From PyTorch internals
20
+ def _ntuple(n):
21
+ def parse(x):
22
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
23
+ return tuple(x)
24
+ return tuple(repeat(x, n))
25
+
26
+ return parse
27
+
28
+
29
+ def exists(val):
30
+ return val is not None
31
+
32
+
33
+ def default(val, d):
34
+ return val if exists(val) else d
35
+
36
+
37
+ to_2tuple = _ntuple(2)
38
+
39
+
40
+ class Mlp(nn.Module):
41
+ """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
42
+
43
+ def __init__(
44
+ self,
45
+ in_features,
46
+ hidden_features=None,
47
+ out_features=None,
48
+ act_layer=nn.GELU,
49
+ norm_layer=None,
50
+ bias=True,
51
+ drop=0.0,
52
+ use_conv=False,
53
+ ):
54
+ super().__init__()
55
+ out_features = out_features or in_features
56
+ hidden_features = hidden_features or in_features
57
+ bias = to_2tuple(bias)
58
+ drop_probs = to_2tuple(drop)
59
+ linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
60
+
61
+ self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0])
62
+ self.act = act_layer()
63
+ self.drop1 = nn.Dropout(drop_probs[0])
64
+ self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity()
65
+ self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1])
66
+ self.drop2 = nn.Dropout(drop_probs[1])
67
+
68
+ def forward(self, x):
69
+ x = self.fc1(x)
70
+ x = self.act(x)
71
+ x = self.drop1(x)
72
+ x = self.fc2(x)
73
+ x = self.drop2(x)
74
+ return x
75
+
76
+
77
+ class ResidualBlock(nn.Module):
78
+ def __init__(self, in_planes, planes, norm_fn="group", stride=1):
79
+ super(ResidualBlock, self).__init__()
80
+
81
+ self.conv1 = nn.Conv2d(
82
+ in_planes,
83
+ planes,
84
+ kernel_size=3,
85
+ padding=1,
86
+ stride=stride,
87
+ padding_mode="zeros",
88
+ )
89
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, padding_mode="zeros")
90
+ self.relu = nn.ReLU(inplace=True)
91
+
92
+ num_groups = planes // 8
93
+
94
+ if norm_fn == "group":
95
+ self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
96
+ self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
97
+ if not stride == 1:
98
+ self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
99
+
100
+ elif norm_fn == "batch":
101
+ self.norm1 = nn.BatchNorm2d(planes)
102
+ self.norm2 = nn.BatchNorm2d(planes)
103
+ if not stride == 1:
104
+ self.norm3 = nn.BatchNorm2d(planes)
105
+
106
+ elif norm_fn == "instance":
107
+ self.norm1 = nn.InstanceNorm2d(planes)
108
+ self.norm2 = nn.InstanceNorm2d(planes)
109
+ if not stride == 1:
110
+ self.norm3 = nn.InstanceNorm2d(planes)
111
+
112
+ elif norm_fn == "none":
113
+ self.norm1 = nn.Sequential()
114
+ self.norm2 = nn.Sequential()
115
+ if not stride == 1:
116
+ self.norm3 = nn.Sequential()
117
+
118
+ if stride == 1:
119
+ self.downsample = None
120
+
121
+ else:
122
+ self.downsample = nn.Sequential(
123
+ nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3
124
+ )
125
+
126
+ def forward(self, x):
127
+ y = x
128
+ y = self.relu(self.norm1(self.conv1(y)))
129
+ y = self.relu(self.norm2(self.conv2(y)))
130
+
131
+ if self.downsample is not None:
132
+ x = self.downsample(x)
133
+
134
+ return self.relu(x + y)
135
+
136
+
137
+ class BasicEncoder(nn.Module):
138
+ def __init__(self, input_dim=3, output_dim=128, stride=4):
139
+ super(BasicEncoder, self).__init__()
140
+ self.stride = stride
141
+ self.norm_fn = "instance"
142
+ self.in_planes = output_dim // 2
143
+
144
+ self.norm1 = nn.InstanceNorm2d(self.in_planes)
145
+ self.norm2 = nn.InstanceNorm2d(output_dim * 2)
146
+
147
+ self.conv1 = nn.Conv2d(
148
+ input_dim,
149
+ self.in_planes,
150
+ kernel_size=7,
151
+ stride=2,
152
+ padding=3,
153
+ padding_mode="zeros",
154
+ )
155
+ self.relu1 = nn.ReLU(inplace=True)
156
+ self.layer1 = self._make_layer(output_dim // 2, stride=1)
157
+ self.layer2 = self._make_layer(output_dim // 4 * 3, stride=2)
158
+ self.layer3 = self._make_layer(output_dim, stride=2)
159
+ self.layer4 = self._make_layer(output_dim, stride=2)
160
+
161
+ self.conv2 = nn.Conv2d(
162
+ output_dim * 3 + output_dim // 4,
163
+ output_dim * 2,
164
+ kernel_size=3,
165
+ padding=1,
166
+ padding_mode="zeros",
167
+ )
168
+ self.relu2 = nn.ReLU(inplace=True)
169
+ self.conv3 = nn.Conv2d(output_dim * 2, output_dim, kernel_size=1)
170
+ for m in self.modules():
171
+ if isinstance(m, nn.Conv2d):
172
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
173
+ elif isinstance(m, (nn.InstanceNorm2d)):
174
+ if m.weight is not None:
175
+ nn.init.constant_(m.weight, 1)
176
+ if m.bias is not None:
177
+ nn.init.constant_(m.bias, 0)
178
+
179
+ def _make_layer(self, dim, stride=1):
180
+ layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
181
+ layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
182
+ layers = (layer1, layer2)
183
+
184
+ self.in_planes = dim
185
+ return nn.Sequential(*layers)
186
+
187
+ def forward(self, x):
188
+ _, _, H, W = x.shape
189
+
190
+ x = self.conv1(x)
191
+ x = self.norm1(x)
192
+ x = self.relu1(x)
193
+
194
+ a = self.layer1(x)
195
+ b = self.layer2(a)
196
+ c = self.layer3(b)
197
+ d = self.layer4(c)
198
+
199
+ def _bilinear_intepolate(x):
200
+ return F.interpolate(
201
+ x,
202
+ (H // self.stride, W // self.stride),
203
+ mode="bilinear",
204
+ align_corners=True,
205
+ )
206
+
207
+ a = _bilinear_intepolate(a)
208
+ b = _bilinear_intepolate(b)
209
+ c = _bilinear_intepolate(c)
210
+ d = _bilinear_intepolate(d)
211
+
212
+ x = self.conv2(torch.cat([a, b, c, d], dim=1))
213
+ x = self.norm2(x)
214
+ x = self.relu2(x)
215
+ x = self.conv3(x)
216
+ return x
217
+
218
+
219
+ class CorrBlock:
220
+ def __init__(
221
+ self,
222
+ fmaps,
223
+ num_levels=4,
224
+ radius=4,
225
+ multiple_track_feats=False,
226
+ padding_mode="zeros",
227
+ ):
228
+ B, S, C, H, W = fmaps.shape
229
+ self.S, self.C, self.H, self.W = S, C, H, W
230
+ self.padding_mode = padding_mode
231
+ self.num_levels = num_levels
232
+ self.radius = radius
233
+ self.fmaps_pyramid = []
234
+ self.multiple_track_feats = multiple_track_feats
235
+
236
+ self.fmaps_pyramid.append(fmaps)
237
+ for i in range(self.num_levels - 1):
238
+ fmaps_ = fmaps.reshape(B * S, C, H, W)
239
+ fmaps_ = F.avg_pool2d(fmaps_, 2, stride=2)
240
+ _, _, H, W = fmaps_.shape
241
+ fmaps = fmaps_.reshape(B, S, C, H, W)
242
+ self.fmaps_pyramid.append(fmaps)
243
+
244
+ def sample(self, coords):
245
+ r = self.radius
246
+ B, S, N, D = coords.shape
247
+ assert D == 2
248
+
249
+ H, W = self.H, self.W
250
+ out_pyramid = []
251
+ for i in range(self.num_levels):
252
+ corrs = self.corrs_pyramid[i] # B, S, N, H, W
253
+ *_, H, W = corrs.shape
254
+
255
+ dx = torch.linspace(-r, r, 2 * r + 1)
256
+ dy = torch.linspace(-r, r, 2 * r + 1)
257
+ delta = torch.stack(torch.meshgrid(dy, dx, indexing="ij"), axis=-1).to(coords.device)
258
+
259
+ centroid_lvl = coords.reshape(B * S * N, 1, 1, 2) / 2**i
260
+ delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2)
261
+ coords_lvl = centroid_lvl + delta_lvl
262
+
263
+ corrs = bilinear_sampler(
264
+ corrs.reshape(B * S * N, 1, H, W),
265
+ coords_lvl,
266
+ padding_mode=self.padding_mode,
267
+ )
268
+ corrs = corrs.view(B, S, N, -1)
269
+ out_pyramid.append(corrs)
270
+
271
+ out = torch.cat(out_pyramid, dim=-1) # B, S, N, LRR*2
272
+ out = out.permute(0, 2, 1, 3).contiguous().view(B * N, S, -1).float()
273
+ return out
274
+
275
+ def corr(self, targets):
276
+ B, S, N, C = targets.shape
277
+ if self.multiple_track_feats:
278
+ targets_split = targets.split(C // self.num_levels, dim=-1)
279
+ B, S, N, C = targets_split[0].shape
280
+
281
+ assert C == self.C
282
+ assert S == self.S
283
+
284
+ fmap1 = targets
285
+
286
+ self.corrs_pyramid = []
287
+ for i, fmaps in enumerate(self.fmaps_pyramid):
288
+ *_, H, W = fmaps.shape
289
+ fmap2s = fmaps.view(B, S, C, H * W) # B S C H W -> B S C (H W)
290
+ if self.multiple_track_feats:
291
+ fmap1 = targets_split[i]
292
+ corrs = torch.matmul(fmap1, fmap2s)
293
+ corrs = corrs.view(B, S, N, H, W) # B S N (H W) -> B S N H W
294
+ corrs = corrs / torch.sqrt(torch.tensor(C).float())
295
+ self.corrs_pyramid.append(corrs)
296
+
297
+
298
+ class Attention(nn.Module):
299
+ def __init__(self, query_dim, context_dim=None, num_heads=8, dim_head=48, qkv_bias=False):
300
+ super().__init__()
301
+ inner_dim = dim_head * num_heads
302
+ context_dim = default(context_dim, query_dim)
303
+ self.scale = dim_head**-0.5
304
+ self.heads = num_heads
305
+
306
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=qkv_bias)
307
+ self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias=qkv_bias)
308
+ self.to_out = nn.Linear(inner_dim, query_dim)
309
+
310
+ def forward(self, x, context=None, attn_bias=None):
311
+ B, N1, C = x.shape
312
+ h = self.heads
313
+
314
+ q = self.to_q(x).reshape(B, N1, h, C // h).permute(0, 2, 1, 3)
315
+ context = default(context, x)
316
+ k, v = self.to_kv(context).chunk(2, dim=-1)
317
+
318
+ N2 = context.shape[1]
319
+ k = k.reshape(B, N2, h, C // h).permute(0, 2, 1, 3)
320
+ v = v.reshape(B, N2, h, C // h).permute(0, 2, 1, 3)
321
+
322
+ sim = (q @ k.transpose(-2, -1)) * self.scale
323
+
324
+ if attn_bias is not None:
325
+ sim = sim + attn_bias
326
+ attn = sim.softmax(dim=-1)
327
+
328
+ x = (attn @ v).transpose(1, 2).reshape(B, N1, C)
329
+ return self.to_out(x)
330
+
331
+
332
+ class AttnBlock(nn.Module):
333
+ def __init__(
334
+ self,
335
+ hidden_size,
336
+ num_heads,
337
+ attn_class: Callable[..., nn.Module] = Attention,
338
+ mlp_ratio=4.0,
339
+ **block_kwargs
340
+ ):
341
+ super().__init__()
342
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
343
+ self.attn = attn_class(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
344
+
345
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
346
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
347
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
348
+ self.mlp = Mlp(
349
+ in_features=hidden_size,
350
+ hidden_features=mlp_hidden_dim,
351
+ act_layer=approx_gelu,
352
+ drop=0,
353
+ )
354
+
355
+ def forward(self, x, mask=None):
356
+ attn_bias = mask
357
+ if mask is not None:
358
+ mask = (
359
+ (mask[:, None] * mask[:, :, None])
360
+ .unsqueeze(1)
361
+ .expand(-1, self.attn.num_heads, -1, -1)
362
+ )
363
+ max_neg_value = -torch.finfo(x.dtype).max
364
+ attn_bias = (~mask) * max_neg_value
365
+ x = x + self.attn(self.norm1(x), attn_bias=attn_bias)
366
+ x = x + self.mlp(self.norm2(x))
367
+ return x
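A small shape-check sketch (not part of the diff) for the `Attention` block defined in blocks.py; the token count and batch size are arbitrary placeholders.

```python
import torch

attn = Attention(query_dim=384, num_heads=8, dim_head=48)  # inner_dim = 8 * 48 = 384
x = torch.randn(2, 16, 384)                                # (B, N, C) tokens
print(attn(x).shape)                                       # torch.Size([2, 16, 384])
```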
cotracker/build/lib/models/core/cotracker/cotracker.py ADDED
@@ -0,0 +1,503 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+ from cotracker.models.core.model_utils import sample_features4d, sample_features5d
12
+ from cotracker.models.core.embeddings import (
13
+ get_2d_embedding,
14
+ get_1d_sincos_pos_embed_from_grid,
15
+ get_2d_sincos_pos_embed,
16
+ )
17
+
18
+ from cotracker.models.core.cotracker.blocks import (
19
+ Mlp,
20
+ BasicEncoder,
21
+ AttnBlock,
22
+ CorrBlock,
23
+ Attention,
24
+ )
25
+
26
+ torch.manual_seed(0)
27
+
28
+
29
+ class CoTracker2(nn.Module):
30
+ def __init__(
31
+ self,
32
+ window_len=8,
33
+ stride=4,
34
+ add_space_attn=True,
35
+ num_virtual_tracks=64,
36
+ model_resolution=(384, 512),
37
+ ):
38
+ super(CoTracker2, self).__init__()
39
+ self.window_len = window_len
40
+ self.stride = stride
41
+ self.hidden_dim = 256
42
+ self.latent_dim = 128
43
+ self.add_space_attn = add_space_attn
44
+ self.fnet = BasicEncoder(output_dim=self.latent_dim)
45
+ self.num_virtual_tracks = num_virtual_tracks
46
+ self.model_resolution = model_resolution
47
+ self.input_dim = 456
48
+ self.updateformer = EfficientUpdateFormer(
49
+ space_depth=6,
50
+ time_depth=6,
51
+ input_dim=self.input_dim,
52
+ hidden_size=384,
53
+ output_dim=self.latent_dim + 2,
54
+ mlp_ratio=4.0,
55
+ add_space_attn=add_space_attn,
56
+ num_virtual_tracks=num_virtual_tracks,
57
+ )
58
+
59
+ time_grid = torch.linspace(0, window_len - 1, window_len).reshape(1, window_len, 1)
60
+
61
+ self.register_buffer(
62
+ "time_emb", get_1d_sincos_pos_embed_from_grid(self.input_dim, time_grid[0])
63
+ )
64
+
65
+ self.register_buffer(
66
+ "pos_emb",
67
+ get_2d_sincos_pos_embed(
68
+ embed_dim=self.input_dim,
69
+ grid_size=(
70
+ model_resolution[0] // stride,
71
+ model_resolution[1] // stride,
72
+ ),
73
+ ),
74
+ )
75
+ self.norm = nn.GroupNorm(1, self.latent_dim)
76
+ self.track_feat_updater = nn.Sequential(
77
+ nn.Linear(self.latent_dim, self.latent_dim),
78
+ nn.GELU(),
79
+ )
80
+ self.vis_predictor = nn.Sequential(
81
+ nn.Linear(self.latent_dim, 1),
82
+ )
83
+
84
+ def forward_window(
85
+ self,
86
+ fmaps,
87
+ coords,
88
+ track_feat=None,
89
+ vis=None,
90
+ track_mask=None,
91
+ attention_mask=None,
92
+ iters=4,
93
+ ):
94
+ # B = batch size
95
+ # S = number of frames in the window)
96
+ # N = number of tracks
97
+ # C = channels of a point feature vector
98
+ # E = positional embedding size
99
+ # LRR = local receptive field radius
100
+ # D = dimension of the transformer input tokens
101
+
102
+ # track_feat = B S N C
103
+ # vis = B S N 1
104
+ # track_mask = B S N 1
105
+ # attention_mask = B S N
106
+
107
+ B, S_init, N, __ = track_mask.shape
108
+ B, S, *_ = fmaps.shape
109
+
110
+ track_mask = F.pad(track_mask, (0, 0, 0, 0, 0, S - S_init), "constant")
111
+ track_mask_vis = (
112
+ torch.cat([track_mask, vis], dim=-1).permute(0, 2, 1, 3).reshape(B * N, S, 2)
113
+ )
114
+
115
+ corr_block = CorrBlock(
116
+ fmaps,
117
+ num_levels=4,
118
+ radius=3,
119
+ padding_mode="border",
120
+ )
121
+
122
+ sampled_pos_emb = (
123
+ sample_features4d(self.pos_emb.repeat(B, 1, 1, 1), coords[:, 0])
124
+ .reshape(B * N, self.input_dim)
125
+ .unsqueeze(1)
126
+ ) # B E N -> (B N) 1 E
127
+
128
+ coord_preds = []
129
+ for __ in range(iters):
130
+ coords = coords.detach() # B S N 2
131
+ corr_block.corr(track_feat)
132
+
133
+ # Sample correlation features around each point
134
+ fcorrs = corr_block.sample(coords) # (B N) S LRR
135
+
136
+ # Get the flow embeddings
137
+ flows = (coords - coords[:, 0:1]).permute(0, 2, 1, 3).reshape(B * N, S, 2)
138
+ flow_emb = get_2d_embedding(flows, 64, cat_coords=True) # N S E
139
+
140
+ track_feat_ = track_feat.permute(0, 2, 1, 3).reshape(B * N, S, self.latent_dim)
141
+
142
+ transformer_input = torch.cat([flow_emb, fcorrs, track_feat_, track_mask_vis], dim=2)
143
+ x = transformer_input + sampled_pos_emb + self.time_emb
144
+ x = x.view(B, N, S, -1) # (B N) S D -> B N S D
145
+
146
+ delta = self.updateformer(
147
+ x,
148
+ attention_mask.reshape(B * S, N), # B S N -> (B S) N
149
+ )
150
+
151
+ delta_coords = delta[..., :2].permute(0, 2, 1, 3)
152
+ coords = coords + delta_coords
153
+ coord_preds.append(coords * self.stride)
154
+
155
+ delta_feats_ = delta[..., 2:].reshape(B * N * S, self.latent_dim)
156
+ track_feat_ = track_feat.permute(0, 2, 1, 3).reshape(B * N * S, self.latent_dim)
157
+ track_feat_ = self.track_feat_updater(self.norm(delta_feats_)) + track_feat_
158
+ track_feat = track_feat_.reshape(B, N, S, self.latent_dim).permute(
159
+ 0, 2, 1, 3
160
+ ) # (B N S) C -> B S N C
161
+
162
+ vis_pred = self.vis_predictor(track_feat).reshape(B, S, N)
163
+ return coord_preds, vis_pred
164
+
165
+ def get_track_feat(self, fmaps, queried_frames, queried_coords):
166
+ sample_frames = queried_frames[:, None, :, None]
167
+ sample_coords = torch.cat(
168
+ [
169
+ sample_frames,
170
+ queried_coords[:, None],
171
+ ],
172
+ dim=-1,
173
+ )
174
+ sample_track_feats = sample_features5d(fmaps, sample_coords)
175
+ return sample_track_feats
176
+
177
+ def init_video_online_processing(self):
178
+ self.online_ind = 0
179
+ self.online_track_feat = None
180
+ self.online_coords_predicted = None
181
+ self.online_vis_predicted = None
182
+
183
+ def forward(self, video, queries, iters=4, is_train=False, is_online=False):
184
+ """Predict tracks
185
+
186
+ Args:
187
+ video (FloatTensor[B, T, 3]): input videos.
188
+ queries (FloatTensor[B, N, 3]): point queries.
189
+ iters (int, optional): number of updates. Defaults to 4.
190
+ is_train (bool, optional): enables training mode. Defaults to False.
191
+ is_online (bool, optional): enables online mode. Defaults to False. Before enabling, call model.init_video_online_processing().
192
+
193
+ Returns:
194
+ - coords_predicted (FloatTensor[B, T, N, 2]):
195
+ - vis_predicted (FloatTensor[B, T, N]):
196
+ - train_data: `None` if `is_train` is false, otherwise:
197
+ - all_vis_predictions (List[FloatTensor[B, S, N, 1]]):
198
+ - all_coords_predictions (List[FloatTensor[B, S, N, 2]]):
199
+ - mask (BoolTensor[B, T, N]):
200
+ """
201
+ B, T, C, H, W = video.shape
202
+ B, N, __ = queries.shape
203
+ S = self.window_len
204
+ device = queries.device
205
+
206
+ # B = batch size
207
+ # S = number of frames in the window of the padded video
208
+ # S_trimmed = actual number of frames in the window
209
+ # N = number of tracks
210
+ # C = color channels (3 for RGB)
211
+ # E = positional embedding size
212
+ # LRR = local receptive field radius
213
+ # D = dimension of the transformer input tokens
214
+
215
+ # video = B T C H W
216
+ # queries = B N 3
217
+ # coords_init = B S N 2
218
+ # vis_init = B S N 1
219
+
220
+ assert S >= 2 # A tracker needs at least two frames to track something
221
+ if is_online:
222
+ assert T <= S, "Online mode: video chunk must be <= window size."
223
+ assert self.online_ind is not None, "Call model.init_video_online_processing() first."
224
+ assert not is_train, "Training not supported in online mode."
225
+ step = S // 2 # How much the sliding window moves at every step
226
+ video = 2 * (video / 255.0) - 1.0
227
+
228
+ # The first channel is the frame number
229
+ # The rest are the coordinates of points we want to track
230
+ queried_frames = queries[:, :, 0].long()
231
+
232
+ queried_coords = queries[..., 1:]
233
+ queried_coords = queried_coords / self.stride
234
+
235
+ # We store our predictions here
236
+ coords_predicted = torch.zeros((B, T, N, 2), device=device)
237
+ vis_predicted = torch.zeros((B, T, N), device=device)
238
+ if is_online:
239
+ if self.online_coords_predicted is None:
240
+ # Init online predictions with zeros
241
+ self.online_coords_predicted = coords_predicted
242
+ self.online_vis_predicted = vis_predicted
243
+ else:
244
+ # Pad online predictions with zeros for the current window
245
+ pad = min(step, T - step)
246
+ coords_predicted = F.pad(
247
+ self.online_coords_predicted, (0, 0, 0, 0, 0, pad), "constant"
248
+ )
249
+ vis_predicted = F.pad(self.online_vis_predicted, (0, 0, 0, pad), "constant")
250
+ all_coords_predictions, all_vis_predictions = [], []
251
+
252
+ # Pad the video so that an integer number of sliding windows fit into it
253
+ # TODO: we may drop this requirement because the transformer should not care
254
+ # TODO: pad the features instead of the video
255
+ pad = S - T if is_online else (S - T % S) % S # We don't want to pad if T % S == 0
256
+ video = F.pad(video.reshape(B, 1, T, C * H * W), (0, 0, 0, pad), "replicate").reshape(
257
+ B, -1, C, H, W
258
+ )
259
+
260
+ # Compute convolutional features for the video or for the current chunk in case of online mode
261
+ fmaps = self.fnet(video.reshape(-1, C, H, W)).reshape(
262
+ B, -1, self.latent_dim, H // self.stride, W // self.stride
263
+ )
264
+
265
+ # We compute track features
266
+ track_feat = self.get_track_feat(
267
+ fmaps,
268
+ queried_frames - self.online_ind if is_online else queried_frames,
269
+ queried_coords,
270
+ ).repeat(1, S, 1, 1)
271
+ if is_online:
272
+ # We update track features for the current window
273
+ sample_frames = queried_frames[:, None, :, None] # B 1 N 1
274
+ left = 0 if self.online_ind == 0 else self.online_ind + step
275
+ right = self.online_ind + S
276
+ sample_mask = (sample_frames >= left) & (sample_frames < right)
277
+ if self.online_track_feat is None:
278
+ self.online_track_feat = torch.zeros_like(track_feat, device=device)
279
+ self.online_track_feat += track_feat * sample_mask
280
+ track_feat = self.online_track_feat.clone()
281
+ # We process ((num_windows - 1) * step + S) frames in total, so there are
282
+ # (ceil((T - S) / step) + 1) windows
283
+ num_windows = (T - S + step - 1) // step + 1
284
+ # We process only the current video chunk in the online mode
285
+ indices = [self.online_ind] if is_online else range(0, step * num_windows, step)
286
+
287
+ coords_init = queried_coords.reshape(B, 1, N, 2).expand(B, S, N, 2).float()
288
+ vis_init = torch.ones((B, S, N, 1), device=device).float() * 10
289
+ for ind in indices:
290
+ # We copy over coords and vis for tracks that are queried
291
+ # by the end of the previous window, which is ind + overlap
292
+ if ind > 0:
293
+ overlap = S - step
294
+ copy_over = (queried_frames < ind + overlap)[:, None, :, None] # B 1 N 1
295
+ coords_prev = torch.nn.functional.pad(
296
+ coords_predicted[:, ind : ind + overlap] / self.stride,
297
+ (0, 0, 0, 0, 0, step),
298
+ "replicate",
299
+ ) # B S N 2
300
+ vis_prev = torch.nn.functional.pad(
301
+ vis_predicted[:, ind : ind + overlap, :, None].clone(),
302
+ (0, 0, 0, 0, 0, step),
303
+ "replicate",
304
+ ) # B S N 1
305
+ coords_init = torch.where(
306
+ copy_over.expand_as(coords_init), coords_prev, coords_init
307
+ )
308
+ vis_init = torch.where(copy_over.expand_as(vis_init), vis_prev, vis_init)
309
+
310
+ # The attention mask is 1 for the spatio-temporal points within
311
+ # a track which is updated in the current window
312
+ attention_mask = (queried_frames < ind + S).reshape(B, 1, N).repeat(1, S, 1) # B S N
313
+
314
+ # The track mask is 1 for the spatio-temporal points that actually
315
+ # need updating: only after begin queried, and not if contained
316
+ # in a previous window
317
+ track_mask = (
318
+ queried_frames[:, None, :, None]
319
+ <= torch.arange(ind, ind + S, device=device)[None, :, None, None]
320
+ ).contiguous() # B S N 1
321
+
322
+ if ind > 0:
323
+ track_mask[:, :overlap, :, :] = False
324
+
325
+ # Predict the coordinates and visibility for the current window
326
+ coords, vis = self.forward_window(
327
+ fmaps=fmaps if is_online else fmaps[:, ind : ind + S],
328
+ coords=coords_init,
329
+ track_feat=attention_mask.unsqueeze(-1) * track_feat,
330
+ vis=vis_init,
331
+ track_mask=track_mask,
332
+ attention_mask=attention_mask,
333
+ iters=iters,
334
+ )
335
+
336
+ S_trimmed = T if is_online else min(T - ind, S) # accounts for last window duration
337
+ coords_predicted[:, ind : ind + S] = coords[-1][:, :S_trimmed]
338
+ vis_predicted[:, ind : ind + S] = vis[:, :S_trimmed]
339
+ if is_train:
340
+ all_coords_predictions.append([coord[:, :S_trimmed] for coord in coords])
341
+ all_vis_predictions.append(torch.sigmoid(vis[:, :S_trimmed]))
342
+
343
+ if is_online:
344
+ self.online_ind += step
345
+ self.online_coords_predicted = coords_predicted
346
+ self.online_vis_predicted = vis_predicted
347
+ vis_predicted = torch.sigmoid(vis_predicted)
348
+
349
+ if is_train:
350
+ mask = queried_frames[:, None] <= torch.arange(0, T, device=device)[None, :, None]
351
+ train_data = (all_coords_predictions, all_vis_predictions, mask)
352
+ else:
353
+ train_data = None
354
+
355
+ return coords_predicted, vis_predicted, train_data
356
+
357
+
358
+ class EfficientUpdateFormer(nn.Module):
359
+ """
360
+ Transformer model that updates track estimates.
361
+ """
362
+
363
+ def __init__(
364
+ self,
365
+ space_depth=6,
366
+ time_depth=6,
367
+ input_dim=320,
368
+ hidden_size=384,
369
+ num_heads=8,
370
+ output_dim=130,
371
+ mlp_ratio=4.0,
372
+ add_space_attn=True,
373
+ num_virtual_tracks=64,
374
+ ):
375
+ super().__init__()
376
+ self.out_channels = 2
377
+ self.num_heads = num_heads
378
+ self.hidden_size = hidden_size
379
+ self.add_space_attn = add_space_attn
380
+ self.input_transform = torch.nn.Linear(input_dim, hidden_size, bias=True)
381
+ self.flow_head = torch.nn.Linear(hidden_size, output_dim, bias=True)
382
+ self.num_virtual_tracks = num_virtual_tracks
383
+ self.virual_tracks = nn.Parameter(torch.randn(1, num_virtual_tracks, 1, hidden_size))
384
+ self.time_blocks = nn.ModuleList(
385
+ [
386
+ AttnBlock(
387
+ hidden_size,
388
+ num_heads,
389
+ mlp_ratio=mlp_ratio,
390
+ attn_class=Attention,
391
+ )
392
+ for _ in range(time_depth)
393
+ ]
394
+ )
395
+
396
+ if add_space_attn:
397
+ self.space_virtual_blocks = nn.ModuleList(
398
+ [
399
+ AttnBlock(
400
+ hidden_size,
401
+ num_heads,
402
+ mlp_ratio=mlp_ratio,
403
+ attn_class=Attention,
404
+ )
405
+ for _ in range(space_depth)
406
+ ]
407
+ )
408
+ self.space_point2virtual_blocks = nn.ModuleList(
409
+ [
410
+ CrossAttnBlock(hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio)
411
+ for _ in range(space_depth)
412
+ ]
413
+ )
414
+ self.space_virtual2point_blocks = nn.ModuleList(
415
+ [
416
+ CrossAttnBlock(hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio)
417
+ for _ in range(space_depth)
418
+ ]
419
+ )
420
+ assert len(self.time_blocks) >= len(self.space_virtual2point_blocks)
421
+ self.initialize_weights()
422
+
423
+ def initialize_weights(self):
424
+ def _basic_init(module):
425
+ if isinstance(module, nn.Linear):
426
+ torch.nn.init.xavier_uniform_(module.weight)
427
+ if module.bias is not None:
428
+ nn.init.constant_(module.bias, 0)
429
+
430
+ self.apply(_basic_init)
431
+
432
+ def forward(self, input_tensor, mask=None):
433
+ tokens = self.input_transform(input_tensor)
434
+ B, _, T, _ = tokens.shape
435
+ virtual_tokens = self.virual_tracks.repeat(B, 1, T, 1)
436
+ tokens = torch.cat([tokens, virtual_tokens], dim=1)
437
+ _, N, _, _ = tokens.shape
438
+
439
+ j = 0
440
+ for i in range(len(self.time_blocks)):
441
+ time_tokens = tokens.contiguous().view(B * N, T, -1) # B N T C -> (B N) T C
442
+ time_tokens = self.time_blocks[i](time_tokens)
443
+
444
+ tokens = time_tokens.view(B, N, T, -1) # (B N) T C -> B N T C
445
+ if self.add_space_attn and (
446
+ i % (len(self.time_blocks) // len(self.space_virtual_blocks)) == 0
447
+ ):
448
+ space_tokens = (
449
+ tokens.permute(0, 2, 1, 3).contiguous().view(B * T, N, -1)
450
+ ) # B N T C -> (B T) N C
451
+ point_tokens = space_tokens[:, : N - self.num_virtual_tracks]
452
+ virtual_tokens = space_tokens[:, N - self.num_virtual_tracks :]
453
+
454
+ virtual_tokens = self.space_virtual2point_blocks[j](
455
+ virtual_tokens, point_tokens, mask=mask
456
+ )
457
+ virtual_tokens = self.space_virtual_blocks[j](virtual_tokens)
458
+ point_tokens = self.space_point2virtual_blocks[j](
459
+ point_tokens, virtual_tokens, mask=mask
460
+ )
461
+ space_tokens = torch.cat([point_tokens, virtual_tokens], dim=1)
462
+ tokens = space_tokens.view(B, T, N, -1).permute(0, 2, 1, 3) # (B T) N C -> B N T C
463
+ j += 1
464
+ tokens = tokens[:, : N - self.num_virtual_tracks]
465
+ flow = self.flow_head(tokens)
466
+ return flow
467
+
468
+
469
+ class CrossAttnBlock(nn.Module):
470
+ def __init__(self, hidden_size, context_dim, num_heads=1, mlp_ratio=4.0, **block_kwargs):
471
+ super().__init__()
472
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
473
+ self.norm_context = nn.LayerNorm(hidden_size)
474
+ self.cross_attn = Attention(
475
+ hidden_size, context_dim=context_dim, num_heads=num_heads, qkv_bias=True, **block_kwargs
476
+ )
477
+
478
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
479
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
480
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
481
+ self.mlp = Mlp(
482
+ in_features=hidden_size,
483
+ hidden_features=mlp_hidden_dim,
484
+ act_layer=approx_gelu,
485
+ drop=0,
486
+ )
487
+
488
+ def forward(self, x, context, mask=None):
489
+ if mask is not None:
490
+ if mask.shape[1] == x.shape[1]:
491
+ mask = mask[:, None, :, None].expand(
492
+ -1, self.cross_attn.heads, -1, context.shape[1]
493
+ )
494
+ else:
495
+ mask = mask[:, None, None].expand(-1, self.cross_attn.heads, x.shape[1], -1)
496
+
497
+ max_neg_value = -torch.finfo(x.dtype).max
498
+ attn_bias = (~mask) * max_neg_value
499
+ x = x + self.cross_attn(
500
+ self.norm1(x), context=self.norm_context(context), attn_bias=attn_bias
501
+ )
502
+ x = x + self.mlp(self.norm2(x))
503
+ return x
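A minimal forward-pass sketch (not part of the diff) for `CoTracker2` with randomly initialized weights; the video and query values are placeholders chosen to match the default 384x512 model resolution.

```python
import torch

model = CoTracker2(stride=4, window_len=8)                        # default model_resolution=(384, 512)
video = torch.randint(0, 255, (1, 24, 3, 384, 512)).float()       # B, T, C, H, W in [0, 255]
queries = torch.tensor([[[0.0, 100.0, 150.0],                     # (frame, x, y) per query point
                         [4.0, 200.0, 300.0]]])
coords, vis, _ = model(video, queries, iters=4)
print(coords.shape, vis.shape)                                    # [1, 24, 2, 2], [1, 24, 2]
```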
cotracker/build/lib/models/core/cotracker/losses.py ADDED
@@ -0,0 +1,61 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import torch
+ import torch.nn.functional as F
+ from cotracker.models.core.model_utils import reduce_masked_mean
+
+ EPS = 1e-6
+
+
+ def balanced_ce_loss(pred, gt, valid=None):
+     total_balanced_loss = 0.0
+     for j in range(len(gt)):
+         B, S, N = gt[j].shape
+         # pred and gt are the same shape
+         for (a, b) in zip(pred[j].size(), gt[j].size()):
+             assert a == b  # some shape mismatch!
+         # if valid is not None:
+         for (a, b) in zip(pred[j].size(), valid[j].size()):
+             assert a == b  # some shape mismatch!
+
+         pos = (gt[j] > 0.95).float()
+         neg = (gt[j] < 0.05).float()
+
+         label = pos * 2.0 - 1.0
+         a = -label * pred[j]
+         b = F.relu(a)
+         loss = b + torch.log(torch.exp(-b) + torch.exp(a - b))
+
+         pos_loss = reduce_masked_mean(loss, pos * valid[j])
+         neg_loss = reduce_masked_mean(loss, neg * valid[j])
+
+         balanced_loss = pos_loss + neg_loss
+         total_balanced_loss += balanced_loss / float(N)
+     return total_balanced_loss
+
+
+ def sequence_loss(flow_preds, flow_gt, vis, valids, gamma=0.8):
+     """Loss function defined over sequence of flow predictions"""
+     total_flow_loss = 0.0
+     for j in range(len(flow_gt)):
+         B, S, N, D = flow_gt[j].shape
+         assert D == 2
+         B, S1, N = vis[j].shape
+         B, S2, N = valids[j].shape
+         assert S == S1
+         assert S == S2
+         n_predictions = len(flow_preds[j])
+         flow_loss = 0.0
+         for i in range(n_predictions):
+             i_weight = gamma ** (n_predictions - i - 1)
+             flow_pred = flow_preds[j][i]
+             i_loss = (flow_pred - flow_gt[j]).abs()  # B, S, N, 2
+             i_loss = torch.mean(i_loss, dim=3)  # B, S, N
+             flow_loss += i_weight * reduce_masked_mean(i_loss, valids[j])
+         flow_loss = flow_loss / n_predictions
+         total_flow_loss += flow_loss / float(N)
+     return total_flow_loss
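A minimal sketch (not part of the diff) of the nested-list structure `sequence_loss` expects: one entry per sliding window, and per window one prediction tensor per refinement iteration; the shapes below are placeholders.

```python
import torch

B, S, N, iters = 1, 8, 16, 4
flow_gt = [torch.zeros(B, S, N, 2)]                              # one sliding window
vis = [torch.ones(B, S, N)]
valids = [torch.ones(B, S, N)]
flow_preds = [[torch.randn(B, S, N, 2) for _ in range(iters)]]   # per-iteration predictions

loss = sequence_loss(flow_preds, flow_gt, vis, valids, gamma=0.8)
print(loss)  # scalar tensor; later iterations get larger weight via gamma ** (iters - i - 1)
```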
cotracker/build/lib/models/core/embeddings.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import Tuple, Union
8
+ import torch
9
+
10
+
11
+ def get_2d_sincos_pos_embed(
12
+ embed_dim: int, grid_size: Union[int, Tuple[int, int]]
13
+ ) -> torch.Tensor:
14
+ """
15
+ This function initializes a grid and generates a 2D positional embedding using sine and cosine functions.
16
+ It is a wrapper of get_2d_sincos_pos_embed_from_grid.
17
+ Args:
18
+ - embed_dim: The embedding dimension.
19
+ - grid_size: The grid size.
20
+ Returns:
21
+ - pos_embed: The generated 2D positional embedding.
22
+ """
23
+ if isinstance(grid_size, tuple):
24
+ grid_size_h, grid_size_w = grid_size
25
+ else:
26
+ grid_size_h = grid_size_w = grid_size
27
+ grid_h = torch.arange(grid_size_h, dtype=torch.float)
28
+ grid_w = torch.arange(grid_size_w, dtype=torch.float)
29
+ grid = torch.meshgrid(grid_w, grid_h, indexing="xy")
30
+ grid = torch.stack(grid, dim=0)
31
+ grid = grid.reshape([2, 1, grid_size_h, grid_size_w])
32
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
33
+ return pos_embed.reshape(1, grid_size_h, grid_size_w, -1).permute(0, 3, 1, 2)
34
+
35
+
36
+ def get_2d_sincos_pos_embed_from_grid(
37
+ embed_dim: int, grid: torch.Tensor
38
+ ) -> torch.Tensor:
39
+ """
40
+ This function generates a 2D positional embedding from a given grid using sine and cosine functions.
41
+
42
+ Args:
43
+ - embed_dim: The embedding dimension.
44
+ - grid: The grid to generate the embedding from.
45
+
46
+ Returns:
47
+ - emb: The generated 2D positional embedding.
48
+ """
49
+ assert embed_dim % 2 == 0
50
+
51
+ # use half of dimensions to encode grid_h
52
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
53
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
54
+
55
+ emb = torch.cat([emb_h, emb_w], dim=2) # (H*W, D)
56
+ return emb
57
+
58
+
59
+ def get_1d_sincos_pos_embed_from_grid(
60
+ embed_dim: int, pos: torch.Tensor
61
+ ) -> torch.Tensor:
62
+ """
63
+ This function generates a 1D positional embedding from a given grid using sine and cosine functions.
64
+
65
+ Args:
66
+ - embed_dim: The embedding dimension.
67
+ - pos: The position to generate the embedding from.
68
+
69
+ Returns:
70
+ - emb: The generated 1D positional embedding.
71
+ """
72
+ assert embed_dim % 2 == 0
73
+ omega = torch.arange(embed_dim // 2, dtype=torch.double)
74
+ omega /= embed_dim / 2.0
75
+ omega = 1.0 / 10000**omega # (D/2,)
76
+
77
+ pos = pos.reshape(-1) # (M,)
78
+ out = torch.einsum("m,d->md", pos, omega) # (M, D/2), outer product
79
+
80
+ emb_sin = torch.sin(out) # (M, D/2)
81
+ emb_cos = torch.cos(out) # (M, D/2)
82
+
83
+ emb = torch.cat([emb_sin, emb_cos], dim=1) # (M, D)
84
+ return emb[None].float()
85
+
86
+
87
+ def get_2d_embedding(xy: torch.Tensor, C: int, cat_coords: bool = True) -> torch.Tensor:
88
+ """
89
+ This function generates a 2D positional embedding from given coordinates using sine and cosine functions.
90
+
91
+ Args:
92
+ - xy: The coordinates to generate the embedding from.
93
+ - C: The size of the embedding.
94
+ - cat_coords: A flag to indicate whether to concatenate the original coordinates to the embedding.
95
+
96
+ Returns:
97
+ - pe: The generated 2D positional embedding.
98
+ """
99
+ B, N, D = xy.shape
100
+ assert D == 2
101
+
102
+ x = xy[:, :, 0:1]
103
+ y = xy[:, :, 1:2]
104
+ div_term = (
105
+ torch.arange(0, C, 2, device=xy.device, dtype=torch.float32) * (1000.0 / C)
106
+ ).reshape(1, 1, int(C / 2))
107
+
108
+ pe_x = torch.zeros(B, N, C, device=xy.device, dtype=torch.float32)
109
+ pe_y = torch.zeros(B, N, C, device=xy.device, dtype=torch.float32)
110
+
111
+ pe_x[:, :, 0::2] = torch.sin(x * div_term)
112
+ pe_x[:, :, 1::2] = torch.cos(x * div_term)
113
+
114
+ pe_y[:, :, 0::2] = torch.sin(y * div_term)
115
+ pe_y[:, :, 1::2] = torch.cos(y * div_term)
116
+
117
+ pe = torch.cat([pe_x, pe_y], dim=2) # (B, N, C*2)
118
+ if cat_coords:
119
+ pe = torch.cat([xy, pe], dim=2) # (B, N, C*2+2)
120
+ return pe
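
For reference, a small sketch of the expected output shapes, assuming the module is importable as cotracker.models.core.embeddings; the sizes are illustrative only.

import torch
from cotracker.models.core.embeddings import get_2d_sincos_pos_embed, get_2d_embedding  # import path is an assumption

pos = get_2d_sincos_pos_embed(embed_dim=64, grid_size=(24, 32))
print(pos.shape)   # torch.Size([1, 64, 24, 32]) -- channel-first map, ready to add to a feature grid

xy = torch.rand(2, 10, 2) * 100                   # (B, N, 2) pixel coordinates
pe = get_2d_embedding(xy, C=32, cat_coords=True)
print(pe.shape)    # torch.Size([2, 10, 66]) -- 2*C sin/cos features plus the raw (x, y)
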
cotracker/build/lib/models/core/model_utils.py ADDED
@@ -0,0 +1,271 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from typing import Optional, Tuple
10
+
11
+ EPS = 1e-6
12
+
13
+
14
+ def smart_cat(tensor1, tensor2, dim):
15
+ if tensor1 is None:
16
+ return tensor2
17
+ return torch.cat([tensor1, tensor2], dim=dim)
18
+
19
+
20
+ def get_points_on_a_grid(
21
+ size: int,
22
+ extent: Tuple[float, ...],
23
+ center: Optional[Tuple[float, ...]] = None,
24
+ device: Optional[torch.device] = torch.device("cpu"),
25
+ shift_grid: bool = False,
26
+ ):
27
+ r"""Get a grid of points covering a rectangular region
28
+
29
+ `get_points_on_a_grid(size, extent)` generates a :attr:`size` by
30
+ :attr:`size` grid of points distributed to cover a rectangular area
31
+ specified by `extent`.
32
+
33
+ The `extent` is a pair of integers :math:`(H,W)` specifying the height
34
+ and width of the rectangle.
35
+
36
+ Optionally, the :attr:`center` can be specified as a pair :math:`(c_y,c_x)`
37
+ specifying the vertical and horizontal center coordinates. The center
38
+ defaults to the middle of the extent.
39
+
40
+ Points are distributed uniformly within the rectangle leaving a margin
41
+ :math:`m=W/64` from the border.
42
+
43
+ It returns a :math:`(1, \text{size} \times \text{size}, 2)` tensor of
44
+ points :math:`P_{ij}=(x_i, y_i)` where
45
+
46
+ .. math::
47
+ P_{ij} = \left(
48
+ c_x + m -\frac{W}{2} + \frac{W - 2m}{\text{size} - 1}\, j,~
49
+ c_y + m -\frac{H}{2} + \frac{H - 2m}{\text{size} - 1}\, i
50
+ \right)
51
+
52
+ Points are returned in row-major order.
53
+
54
+ Args:
55
+ size (int): grid size.
56
+ extent (tuple): height and width of the grid extent.
57
+ center (tuple, optional): grid center.
58
+ device (str, optional): Defaults to `"cpu"`.
59
+
60
+ Returns:
61
+ Tensor: grid.
62
+ """
63
+ if size == 1:
64
+ return torch.tensor([extent[1] / 2, extent[0] / 2], device=device)[None, None]
65
+
66
+ if center is None:
67
+ center = [extent[0] / 2, extent[1] / 2]
68
+
69
+ margin = extent[1] / 64
70
+ range_y = (margin - extent[0] / 2 + center[0], extent[0] / 2 + center[0] - margin)
71
+ range_x = (margin - extent[1] / 2 + center[1], extent[1] / 2 + center[1] - margin)
72
+ grid_y, grid_x = torch.meshgrid(
73
+ torch.linspace(*range_y, size, device=device),
74
+ torch.linspace(*range_x, size, device=device),
75
+ indexing="ij",
76
+ )
77
+
78
+ if shift_grid:
79
+ # shift the grid randomly
80
+ # grid_x: (10, 10)
81
+ # grid_y: (10, 10)
82
+ shift_x = (range_x[1] - range_x[0]) / (size - 1)
83
+ shift_y = (range_y[1] - range_y[0]) / (size - 1)
84
+ grid_x = grid_x + torch.randn_like(grid_x) / 3 * shift_x / 2
85
+ grid_y = grid_y + torch.randn_like(grid_y) / 3 * shift_y / 2
86
+
87
+ # stay within the bounds
88
+ grid_x = torch.clamp(grid_x, range_x[0], range_x[1])
89
+ grid_y = torch.clamp(grid_y, range_y[0], range_y[1])
90
+
91
+ return torch.stack([grid_x, grid_y], dim=-1).reshape(1, -1, 2)
92
+
93
+
94
+ def reduce_masked_mean(input, mask, dim=None, keepdim=False):
95
+ r"""Masked mean
96
+
97
+ `reduce_masked_mean(x, mask)` computes the mean of a tensor :attr:`input`
98
+ over a mask :attr:`mask`, returning
99
+
100
+ .. math::
101
+ \text{output} =
102
+ \frac
103
+ {\sum_{i=1}^N \text{input}_i \cdot \text{mask}_i}
104
+ {\epsilon + \sum_{i=1}^N \text{mask}_i}
105
+
106
+ where :math:`N` is the number of elements in :attr:`input` and
107
+ :attr:`mask`, and :math:`\epsilon` is a small constant to avoid
108
+ division by zero.
109
+
110
+ `reduce_masked_mean(x, mask, dim)` computes the mean of a tensor
111
+ :attr:`input` over a mask :attr:`mask` along a dimension :attr:`dim`.
112
+ Optionally, the dimension can be kept in the output by setting
113
+ :attr:`keepdim` to `True`. Tensor :attr:`mask` must be broadcastable to
114
+ the same dimension as :attr:`input`.
115
+
116
+ The interface is similar to `torch.mean()`.
117
+
118
+ Args:
119
+ input (Tensor): input tensor.
120
+ mask (Tensor): mask.
121
+ dim (int, optional): Dimension to sum over. Defaults to None.
122
+ keepdim (bool, optional): Keep the summed dimension. Defaults to False.
123
+
124
+ Returns:
125
+ Tensor: mean tensor.
126
+ """
127
+
128
+ mask = mask.expand_as(input)
129
+
130
+ prod = input * mask
131
+
132
+ if dim is None:
133
+ numer = torch.sum(prod)
134
+ denom = torch.sum(mask)
135
+ else:
136
+ numer = torch.sum(prod, dim=dim, keepdim=keepdim)
137
+ denom = torch.sum(mask, dim=dim, keepdim=keepdim)
138
+
139
+ mean = numer / (EPS + denom)
140
+ return mean
141
+
142
+
143
+ def bilinear_sampler(input, coords, align_corners=True, padding_mode="border"):
144
+ r"""Sample a tensor using bilinear interpolation
145
+
146
+ `bilinear_sampler(input, coords)` samples a tensor :attr:`input` at
147
+ coordinates :attr:`coords` using bilinear interpolation. It is the same
148
+ as `torch.nn.functional.grid_sample()` but with a different coordinate
149
+ convention.
150
+
151
+ The input tensor is assumed to be of shape :math:`(B, C, H, W)`, where
152
+ :math:`B` is the batch size, :math:`C` is the number of channels,
153
+ :math:`H` is the height of the image, and :math:`W` is the width of the
154
+ image. The tensor :attr:`coords` of shape :math:`(B, H_o, W_o, 2)` is
155
+ interpreted as an array of 2D point coordinates :math:`(x_i,y_i)`.
156
+
157
+ Alternatively, the input tensor can be of size :math:`(B, C, T, H, W)`,
158
+ in which case sample points are triplets :math:`(t_i,x_i,y_i)`. Note
159
+ that in this case the order of the components is slightly different
160
+ from `grid_sample()`, which would expect :math:`(x_i,y_i,t_i)`.
161
+
162
+ If `align_corners` is `True`, the coordinate :math:`x` is assumed to be
163
+ in the range :math:`[0,W-1]`, with 0 corresponding to the center of the
164
+ left-most image pixel, and :math:`W-1` to the center of the right-most
165
+ pixel.
166
+
167
+ If `align_corners` is `False`, the coordinate :math:`x` is assumed to
168
+ be in the range :math:`[0,W]`, with 0 corresponding to the left edge of
169
+ the left-most pixel, and :math:`W` to the right edge of the right-most
170
+ pixel.
171
+
172
+ Similar conventions apply to the :math:`y` for the range
173
+ :math:`[0,H-1]` and :math:`[0,H]` and to :math:`t` for the range
174
+ :math:`[0,T-1]` and :math:`[0,T]`.
175
+
176
+ Args:
177
+ input (Tensor): batch of input images.
178
+ coords (Tensor): batch of coordinates.
179
+ align_corners (bool, optional): Coordinate convention. Defaults to `True`.
180
+ padding_mode (str, optional): Padding mode. Defaults to `"border"`.
181
+
182
+ Returns:
183
+ Tensor: sampled points.
184
+ """
185
+
186
+ sizes = input.shape[2:]
187
+
188
+ assert len(sizes) in [2, 3]
189
+
190
+ if len(sizes) == 3:
191
+ # t x y -> x y t to match dimensions T H W in grid_sample
192
+ coords = coords[..., [1, 2, 0]]
193
+
194
+ if align_corners:
195
+ coords = coords * torch.tensor(
196
+ [2 / max(size - 1, 1) for size in reversed(sizes)], device=coords.device
197
+ )
198
+ else:
199
+ coords = coords * torch.tensor([2 / size for size in reversed(sizes)], device=coords.device)
200
+
201
+ coords -= 1
202
+
203
+ return F.grid_sample(input, coords, align_corners=align_corners, padding_mode=padding_mode)
204
+
205
+
206
+ def sample_features4d(input, coords):
207
+ r"""Sample spatial features
208
+
209
+ `sample_features4d(input, coords)` samples the spatial features
210
+ :attr:`input` represented by a 4D tensor :math:`(B, C, H, W)`.
211
+
212
+ The field is sampled at coordinates :attr:`coords` using bilinear
213
+ interpolation. :attr:`coords` is assumed to be of shape :math:`(B, R,
214
+ 2)`, where each sample has the format :math:`(x_i, y_i)`. This uses the
215
+ same convention as :func:`bilinear_sampler` with `align_corners=True`.
216
+
217
+ The output tensor has one feature per point, and has shape :math:`(B,
218
+ R, C)`.
219
+
220
+ Args:
221
+ input (Tensor): spatial features.
222
+ coords (Tensor): points.
223
+
224
+ Returns:
225
+ Tensor: sampled features.
226
+ """
227
+
228
+ B, _, _, _ = input.shape
229
+
230
+ # B R 2 -> B R 1 2
231
+ coords = coords.unsqueeze(2)
232
+
233
+ # B C R 1
234
+ feats = bilinear_sampler(input, coords)
235
+
236
+ return feats.permute(0, 2, 1, 3).view(
237
+ B, -1, feats.shape[1] * feats.shape[3]
238
+ ) # B C R 1 -> B R C
239
+
240
+
241
+ def sample_features5d(input, coords):
242
+ r"""Sample spatio-temporal features
243
+
244
+ `sample_features5d(input, coords)` works in the same way as
245
+ :func:`sample_features4d` but for spatio-temporal features and points:
246
+ :attr:`input` is a 5D tensor :math:`(B, T, C, H, W)`, :attr:`coords` is
247
+ a :math:`(B, R1, R2, 3)` tensor of spatio-temporal points :math:`(t_i,
248
+ x_i, y_i)`. The output tensor has shape :math:`(B, R1, R2, C)`.
249
+
250
+ Args:
251
+ input (Tensor): spatio-temporal features.
252
+ coords (Tensor): spatio-temporal points.
253
+
254
+ Returns:
255
+ Tensor: sampled features.
256
+ """
257
+
258
+ B, T, _, _, _ = input.shape
259
+
260
+ # B T C H W -> B C T H W
261
+ input = input.permute(0, 2, 1, 3, 4)
262
+
263
+ # B R1 R2 3 -> B R1 R2 1 3
264
+ coords = coords.unsqueeze(3)
265
+
266
+ # B C R1 R2 1
267
+ feats = bilinear_sampler(input, coords)
268
+
269
+ return feats.permute(0, 2, 3, 1, 4).view(
270
+ B, feats.shape[2], feats.shape[3], feats.shape[1]
271
+ ) # B C R1 R2 1 -> B R1 R2 C
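
A short sketch of how the grid and sampling helpers compose, assuming the module is importable as cotracker.models.core.model_utils; the stride-8 scaling below is only an illustrative choice, not something fixed by this file.

import torch
from cotracker.models.core.model_utils import get_points_on_a_grid, sample_features4d  # import path is an assumption

fmap = torch.randn(1, 128, 48, 64)         # (B, C, H, W) feature map
pts = get_points_on_a_grid(5, (384, 512))  # (1, 25, 2) points in (x, y) pixel coordinates of a 384x512 frame
pts = pts / 8.0                            # map to the 48x64 feature resolution (hypothetical stride of 8)
feats = sample_features4d(fmap, pts)
print(feats.shape)                         # torch.Size([1, 25, 128]) -- one feature vector per grid point
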
cotracker/build/lib/models/evaluation_predictor.py ADDED
@@ -0,0 +1,104 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from typing import Tuple
10
+
11
+ from cotracker.models.core.cotracker.cotracker import CoTracker2
12
+ from cotracker.models.core.model_utils import get_points_on_a_grid
13
+
14
+
15
+ class EvaluationPredictor(torch.nn.Module):
16
+ def __init__(
17
+ self,
18
+ cotracker_model: CoTracker2,
19
+ interp_shape: Tuple[int, int] = (384, 512),
20
+ grid_size: int = 5,
21
+ local_grid_size: int = 8,
22
+ single_point: bool = True,
23
+ n_iters: int = 6,
24
+ ) -> None:
25
+ super(EvaluationPredictor, self).__init__()
26
+ self.grid_size = grid_size
27
+ self.local_grid_size = local_grid_size
28
+ self.single_point = single_point
29
+ self.interp_shape = interp_shape
30
+ self.n_iters = n_iters
31
+
32
+ self.model = cotracker_model
33
+ self.model.eval()
34
+
35
+ def forward(self, video, queries):
36
+ queries = queries.clone()
37
+ B, T, C, H, W = video.shape
38
+ B, N, D = queries.shape
39
+
40
+ assert D == 3
41
+
42
+ video = video.reshape(B * T, C, H, W)
43
+ video = F.interpolate(video, tuple(self.interp_shape), mode="bilinear", align_corners=True)
44
+ video = video.reshape(B, T, 3, self.interp_shape[0], self.interp_shape[1])
45
+
46
+ device = video.device
47
+
48
+ queries[:, :, 1] *= (self.interp_shape[1] - 1) / (W - 1)
49
+ queries[:, :, 2] *= (self.interp_shape[0] - 1) / (H - 1)
50
+
51
+ if self.single_point:
52
+ traj_e = torch.zeros((B, T, N, 2), device=device)
53
+ vis_e = torch.zeros((B, T, N), device=device)
54
+ for pind in range((N)):
55
+ query = queries[:, pind : pind + 1]
56
+
57
+ t = query[0, 0, 0].long()
58
+
59
+ traj_e_pind, vis_e_pind = self._process_one_point(video, query)
60
+ traj_e[:, t:, pind : pind + 1] = traj_e_pind[:, :, :1]
61
+ vis_e[:, t:, pind : pind + 1] = vis_e_pind[:, :, :1]
62
+ else:
63
+ if self.grid_size > 0:
64
+ xy = get_points_on_a_grid(self.grid_size, video.shape[3:])
65
+ xy = torch.cat([torch.zeros_like(xy[:, :, :1]), xy], dim=2).to(device)  # prepend t=0 so each grid point becomes a (t, x, y) query
66
+ queries = torch.cat([queries, xy], dim=1)  # append the grid points as extra queries
67
+
68
+ traj_e, vis_e, __ = self.model(
69
+ video=video,
70
+ queries=queries,
71
+ iters=self.n_iters,
72
+ )
73
+
74
+ traj_e[:, :, :, 0] *= (W - 1) / float(self.interp_shape[1] - 1)
75
+ traj_e[:, :, :, 1] *= (H - 1) / float(self.interp_shape[0] - 1)
76
+ return traj_e, vis_e
77
+
78
+ def _process_one_point(self, video, query):
79
+ t = query[0, 0, 0].long()
80
+
81
+ device = query.device
82
+ if self.local_grid_size > 0:
83
+ xy_target = get_points_on_a_grid(
84
+ self.local_grid_size,
85
+ (50, 50),
86
+ [query[0, 0, 2].item(), query[0, 0, 1].item()],
87
+ )
88
+
89
+ xy_target = torch.cat([torch.zeros_like(xy_target[:, :, :1]), xy_target], dim=2).to(
90
+ device
91
+ )  # prepend t=0 so the local support points share the query frame
92
+ query = torch.cat([query, xy_target], dim=1)  # append the local support grid to the query
93
+
94
+ if self.grid_size > 0:
95
+ xy = get_points_on_a_grid(self.grid_size, video.shape[3:])
96
+ xy = torch.cat([torch.zeros_like(xy[:, :, :1]), xy], dim=2).to(device)  # prepend t=0 to the global support grid
97
+ query = torch.cat([query, xy], dim=1)  # append the global support grid to the query
98
+ # crop the video to start from the queried frame
99
+ query[0, 0, 0] = 0
100
+ traj_e_pind, vis_e_pind, __ = self.model(
101
+ video=video[:, t:], queries=query, iters=self.n_iters
102
+ )
103
+
104
+ return traj_e_pind, vis_e_pind
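
A hypothetical wrapper showing how EvaluationPredictor might be driven, assuming a CoTracker2 instance has already been built and loaded elsewhere in the repo and that the module is importable as cotracker.models.evaluation_predictor; the parameter values simply echo the defaults above.

from cotracker.models.evaluation_predictor import EvaluationPredictor  # import path is an assumption

def track_queries(cotracker_model, video, queries):
    # video: (B, T, 3, H, W) float tensor; queries: (B, N, 3) rows of (t, x, y)
    predictor = EvaluationPredictor(
        cotracker_model,
        interp_shape=(384, 512),
        grid_size=5,
        local_grid_size=8,
        single_point=True,
        n_iters=6,
    )
    traj, vis = predictor(video, queries)  # traj: (B, T, N, 2) in input resolution, vis: (B, T, N)
    return traj, vis
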
cotracker/build/lib/utils/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.