pablovela5620 committed
Commit 6da47c0 · 1 Parent(s): cf43f05

init commit

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +6 -0
  2. .gitignore +234 -0
  3. LICENSE-APACHE +201 -0
  4. LICENSE-MIT +25 -0
  5. README.md +82 -12
  6. app.py +43 -0
  7. data/example-data/Amir-Khan-Lamont-Peterson_2689582.jpg +3 -0
  8. data/example-data/BNAAHPYGMYSE26U6C6T7VA6544.jpg +3 -0
  9. data/example-data/Canelo-Alvarez-b4d59f2080464e4d996177f5ce9792ee.jpg +3 -0
  10. data/example-data/Planche.jpg +3 -0
  11. data/example-data/yoga-example.jpg +3 -0
  12. pixi.lock +0 -0
  13. pyproject.toml +149 -0
  14. src/sam3d_body/__init__.py +12 -0
  15. src/sam3d_body/api/demo.py +241 -0
  16. src/sam3d_body/api/visualization.py +425 -0
  17. src/sam3d_body/build_models.py +56 -0
  18. src/sam3d_body/data/__init__.py +1 -0
  19. src/sam3d_body/data/transforms/__init__.py +21 -0
  20. src/sam3d_body/data/transforms/bbox_utils.py +380 -0
  21. src/sam3d_body/data/transforms/common.py +345 -0
  22. src/sam3d_body/data/utils/io.py +114 -0
  23. src/sam3d_body/data/utils/prepare_batch.py +99 -0
  24. src/sam3d_body/gradio_ui/sam3d_body_ui.py +164 -0
  25. src/sam3d_body/metadata/__init__.py +79 -0
  26. src/sam3d_body/metadata/mhr70.py +915 -0
  27. src/sam3d_body/models/__init__.py +1 -0
  28. src/sam3d_body/models/backbones/__init__.py +35 -0
  29. src/sam3d_body/models/backbones/dinov3.py +69 -0
  30. src/sam3d_body/models/backbones/vit.py +658 -0
  31. src/sam3d_body/models/decoders/__init__.py +32 -0
  32. src/sam3d_body/models/decoders/keypoint_prompt_sampler.py +183 -0
  33. src/sam3d_body/models/decoders/prompt_encoder.py +256 -0
  34. src/sam3d_body/models/decoders/promptable_decoder.py +194 -0
  35. src/sam3d_body/models/heads/__init__.py +28 -0
  36. src/sam3d_body/models/heads/camera_head.py +110 -0
  37. src/sam3d_body/models/heads/mhr_head.py +369 -0
  38. src/sam3d_body/models/meta_arch/__init__.py +3 -0
  39. src/sam3d_body/models/meta_arch/base_lightning_module.py +48 -0
  40. src/sam3d_body/models/meta_arch/base_model.py +162 -0
  41. src/sam3d_body/models/meta_arch/sam3d_body.py +1728 -0
  42. src/sam3d_body/models/modules/__init__.py +18 -0
  43. src/sam3d_body/models/modules/camera_embed.py +111 -0
  44. src/sam3d_body/models/modules/drop_path.py +42 -0
  45. src/sam3d_body/models/modules/geometry_utils.py +304 -0
  46. src/sam3d_body/models/modules/layer_scale.py +45 -0
  47. src/sam3d_body/models/modules/mhr_utils.py +392 -0
  48. src/sam3d_body/models/modules/misc.py +31 -0
  49. src/sam3d_body/models/modules/swiglu_ffn.py +96 -0
  50. src/sam3d_body/models/modules/transformer.py +651 -0
.gitattributes CHANGED
@@ -1,3 +1,9 @@
+ # LFS/Xet-managed assets
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+
  *.7z filter=lfs diff=lfs merge=lfs -text
  *.arrow filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
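For context, rules like these are usually written by the Git LFS CLI rather than by hand; a hedged example (not part of this commit) that appends equivalent filter lines to `.gitattributes`:

```bash
# Track the newly added image formats with Git LFS; each call appends a filter rule to .gitattributes.
git lfs track "*.jpg"
git lfs track "*.jpeg"
git lfs track "*.png"
git lfs track "*.gif"
```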
.gitignore ADDED
@@ -0,0 +1,234 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ # Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ # poetry.lock
109
+ # poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ # pdm.lock
116
+ # pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ # pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # Redis
135
+ *.rdb
136
+ *.aof
137
+ *.pid
138
+
139
+ # RabbitMQ
140
+ mnesia/
141
+ rabbitmq/
142
+ rabbitmq-data/
143
+
144
+ # ActiveMQ
145
+ activemq-data/
146
+
147
+ # SageMath parsed files
148
+ *.sage.py
149
+
150
+ # Environments
151
+ .env
152
+ .envrc
153
+ .venv
154
+ env/
155
+ venv/
156
+ ENV/
157
+ env.bak/
158
+ venv.bak/
159
+
160
+ # Spyder project settings
161
+ .spyderproject
162
+ .spyproject
163
+
164
+ # Rope project settings
165
+ .ropeproject
166
+
167
+ # mkdocs documentation
168
+ /site
169
+
170
+ # mypy
171
+ .mypy_cache/
172
+ .dmypy.json
173
+ dmypy.json
174
+
175
+ # Pyre type checker
176
+ .pyre/
177
+
178
+ # pytype static type analyzer
179
+ .pytype/
180
+
181
+ # Cython debug symbols
182
+ cython_debug/
183
+
184
+ # PyCharm
185
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
186
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
187
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
188
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
189
+ # .idea/
190
+
191
+ # Abstra
192
+ # Abstra is an AI-powered process automation framework.
193
+ # Ignore directories containing user credentials, local state, and settings.
194
+ # Learn more at https://abstra.io/docs
195
+ .abstra/
196
+
197
+ # Visual Studio Code
198
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
199
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
200
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
201
+ # you could uncomment the following to ignore the entire vscode folder
202
+ # .vscode/
203
+
204
+ # Ruff stuff:
205
+ .ruff_cache/
206
+
207
+ # PyPI configuration file
208
+ .pypirc
209
+
210
+ # Marimo
211
+ marimo/_static/
212
+ marimo/_lsp/
213
+ __marimo__/
214
+
215
+ # Streamlit
216
+ .streamlit/secrets.toml
217
+
218
+ # pixi environments
219
+ .pixi/*
220
+ !.pixi/config.toml
221
+
222
+ _checkpoints/*
223
+
224
+
225
+ # START Ruler Generated Files
226
+ /.codex/config.json
227
+ /.codex/config.json.bak
228
+ /.codex/config.toml
229
+ /.codex/config.toml.bak
230
+ /.vscode/mcp.json
231
+ /.vscode/mcp.json.bak
232
+ /AGENTS.md
233
+ /AGENTS.md.bak
234
+ # END Ruler Generated Files
LICENSE-APACHE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
LICENSE-MIT ADDED
@@ -0,0 +1,25 @@
+ Copyright (c) 2022 Rerun Technologies AB <opensource@rerun.io>
+
+ Permission is hereby granted, free of charge, to any
+ person obtaining a copy of this software and associated
+ documentation files (the "Software"), to deal in the
+ Software without restriction, including without
+ limitation the rights to use, copy, modify, merge,
+ publish, distribute, sublicense, and/or sell copies of
+ the Software, and to permit persons to whom the Software
+ is furnished to do so, subject to the following
+ conditions:
+
+ The above copyright notice and this permission notice
+ shall be included in all copies or substantial portions
+ of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+ TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+ SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+ IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
README.md CHANGED
@@ -1,12 +1,82 @@
- ---
- title: Sam3d Body Rerun
- emoji: 🏆
- colorFrom: gray
- colorTo: yellow
- sdk: gradio
- sdk_version: 6.0.1
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # SAM3D Body with Rerun
+ An unofficial playground for Meta's SAM3D Body (DINOv3) with promptable SAM3 masks and live Rerun visualization. Uses **Rerun** for 3D inspection, **Gradio** for the UI, and **Pixi** for one-command setup.
+
+ <p align="center">
+ <a title="Rerun" href="https://rerun.io" target="_blank" rel="noopener noreferrer">
+ <img src="https://img.shields.io/badge/Rerun-0.27%2B-0b82f9" alt="Rerun badge">
+ </a>
+ <a title="Pixi" href="https://pixi.sh/latest/" target="_blank" rel="noopener noreferrer">
+ <img src="https://img.shields.io/badge/Install%20with-Pixi-16A34A" alt="Pixi badge">
+ </a>
+ <a title="CUDA" href="https://developer.nvidia.com/cuda-toolkit" target="_blank" rel="noopener noreferrer">
+ <img src="https://img.shields.io/badge/CUDA-12.9%2B-76b900" alt="CUDA badge">
+ </a>
+ <a title="GitHub" href="https://github.com/rerun-io/sam3d-body-rerun" target="_blank" rel="noopener noreferrer">
+ <img src="https://img.shields.io/github/stars/rerun-io/sam3d-body-rerun?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="GitHub stars">
+ </a>
+ </p>
+
+ <p align="center">
+ <!-- Drop your GIF/MP4 here once ready -->
+ <img src="media/sam3d-body-demo.gif" alt="example output" width="720" />
+ </p>
+
+ ## Installation
+ ### Using Pixi
+ Make sure you have the [Pixi](https://pixi.sh/latest/#installation) package manager installed.
+
+ TL;DR install Pixi:
+ ```bash
+ curl -fsSL https://pixi.sh/install.sh | sh
+ ```
+ Restart your shell so the new `pixi` binary is on `PATH`.
+
+ This project is Linux-only and requires an NVIDIA GPU.
+
+ The SAM3 and SAM3D Body checkpoints are gated on Hugging Face. Request access to both [facebook/sam-3d-body-dinov3](https://huggingface.co/facebook/sam-3d-body-dinov3) and [facebook/sam3](https://huggingface.co/facebook/sam3), then authenticate either by setting `HF_TOKEN=<your token>` or by running `huggingface-cli login` before the first download (see Meta's install notes).
+
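As a concrete illustration of the two authentication options mentioned above (standard Hugging Face Hub commands; the token value is a placeholder):

```bash
# Option 1: interactive login, stores the token locally.
huggingface-cli login

# Option 2: export the token for the current shell session only.
export HF_TOKEN=<your token>
```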
+ The first run downloads the Hugging Face checkpoints for SAM3, SAM3D Body, and the relative-depth model.
+ ```bash
+ git clone https://github.com/rerun-io/sam3d-body-rerun.git
+ cd sam3d-body-rerun
+ pixi run app
+ ```
+
+ All commands can be listed with `pixi task list`.
+
+ ## Usage
+ ### Gradio App
+ ```bash
+ pixi run app
+ ```
+ Opens the Gradio UI with an embedded streaming Rerun viewer. Try the bundled samples in `data/example-data` or upload your own RGB image; toggle “Log relative depth” to stream predicted depth.
+
+ ### CLI
+ Run the bundled demo task:
+ ```bash
+ pixi run cli
+ ```
+
+ or work from a dev shell (for tyro and the dev dependencies):
+
+ ```bash
+ pixi shell -e dev
+ python tool/demo.py --help
+ ```
+ Run on a folder of images and configure Rerun output/recordings via the CLI flags.
+
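For example, from the dev environment (the `--image-folder` flag mirrors the `cli` task defined in `pyproject.toml`; the single-image flag is inferred from the demo's tyro config and may differ):

```bash
# Process the bundled example images.
pixi run -e dev python tool/demo.py --image-folder data/example-data
# Or point the demo at a single image (field name from Sam3DBodyDemoConfig).
pixi run -e dev python tool/demo.py --image-path data/example-data/Planche.jpg
```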
+ ### Promptable SAM3 sandbox
+ If you just want SAM3 masks without 3D reconstruction:
+ ```bash
+ pixi run -e dev python tool/gradio_sam3.py
+ ```
+
+ ## Acknowledgements
+ Thanks to the original projects that make this demo possible:
+
+ - [facebook/sam-3d-body-dinov3](https://huggingface.co/facebook/sam-3d-body-dinov3) — SAM3D Body checkpoints and assets.
+ - [facebook/sam3](https://huggingface.co/facebook/sam3) — promptable concept segmentation.
+ - Relative depth/FOV from `MogeV1Predictor` in [monopriors](https://github.com/pablovela5620/monoprior).
+ - Built with [Rerun](https://rerun.io/), [Gradio](https://www.gradio.app/), and [Pixi](https://pixi.sh/latest/).
+
+ The code in this repository is dual-licensed under Apache 2.0 and MIT (see `LICENSE-APACHE` and `LICENSE-MIT`); upstream models and assets retain their original licenses.
app.py ADDED
@@ -0,0 +1,43 @@
+ import os
+ import subprocess
+ from pathlib import Path
+
+ PIXI_PATH = Path("/home/user/.pixi/bin/pixi")
+ PIXI_VERSION = "0.59.0"
+ MOCK_CUDA_VERSION = "12.9"
+
+ # Pretend CUDA 12.9 is available so pixi can solve environments on machines without GPUs.
+ os.environ.setdefault("CONDA_OVERRIDE_CUDA", MOCK_CUDA_VERSION)
+
+
+ def check_and_install_pixi() -> None:
+     try:
+         subprocess.check_call(f"{PIXI_PATH} --version", shell=True)
+     except subprocess.CalledProcessError:
+         print("pixi not found. Installing pixi...")
+         # Install pixi using the official installation script, pinned to PIXI_VERSION.
+         subprocess.check_call(
+             f"PIXI_VERSION=v{PIXI_VERSION} curl -fsSL https://pixi.sh/install.sh | bash",
+             shell=True,
+         )
+         subprocess.check_call(f"{PIXI_PATH} self-update --version {PIXI_VERSION}", shell=True)
+         subprocess.check_call(f"{PIXI_PATH} --version", shell=True)
+
+
+ def run_command(command: str) -> None:
+     try:
+         subprocess.check_call(command, shell=True)
+     except subprocess.CalledProcessError as e:
+         print(f"Command failed: {command}. Error: {e}")
+
+
+ if __name__ == "__main__":
+     check_and_install_pixi()
+     # install lsof
+     # run_command(command=f"{PIXI_PATH} global install lsof")
+     # # kill anything running on port 7860
+     # run_command(command=f"{PIXI_PATH.parent}/lsof -t -i:7860 | xargs -r kill")
+     # clean current environment
+     run_command(command=f"{PIXI_PATH} clean")
+     # run spaces app
+     run_command(command=f"{PIXI_PATH} run app")
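For reference, this Space bootstrap amounts to roughly the following shell steps (a hedged sketch; the version pin and CUDA override come straight from `app.py` above):

```bash
# Let pixi solve the CUDA 12.9 environment even when no GPU is visible at build time.
export CONDA_OVERRIDE_CUDA=12.9
# Install a pinned pixi, wipe any stale environment, then launch the app task.
PIXI_VERSION=v0.59.0 curl -fsSL https://pixi.sh/install.sh | bash
pixi clean
pixi run app
```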
data/example-data/Amir-Khan-Lamont-Peterson_2689582.jpg ADDED

Git LFS Details

  • SHA256: 85013a25f46cad9ba86bc05786b48dfb6e5a2d5dfa9f19328997480ec23226e5
  • Pointer size: 131 Bytes
  • Size of remote file: 155 kB
data/example-data/BNAAHPYGMYSE26U6C6T7VA6544.jpg ADDED

Git LFS Details

  • SHA256: c5d64d944c10ffde20645075b9078b7359899dd019062ceaa6fd54b18be21042
  • Pointer size: 131 Bytes
  • Size of remote file: 719 kB
data/example-data/Canelo-Alvarez-b4d59f2080464e4d996177f5ce9792ee.jpg ADDED

Git LFS Details

  • SHA256: bc029593f9dae5bd0473148fe9b920d6e708220b126c1b0a09bb9b48bfa999be
  • Pointer size: 131 Bytes
  • Size of remote file: 134 kB
data/example-data/Planche.jpg ADDED

Git LFS Details

  • SHA256: 898a2376f2adac0676408cc5c563b8f50df9966caa3299d4013b0476dd5cdbbe
  • Pointer size: 131 Bytes
  • Size of remote file: 216 kB
data/example-data/yoga-example.jpg ADDED

Git LFS Details

  • SHA256: 260c554cb3e8cc582a37873951f05ee10e99631e2c858d7b77d246554212fdae
  • Pointer size: 130 Bytes
  • Size of remote file: 50.6 kB
pixi.lock ADDED
The diff for this file is too large to render.
 
pyproject.toml ADDED
@@ -0,0 +1,149 @@
1
+ [project]
2
+ authors = [{ name = "pablo vela", email = "pablovela5620@gmail.com" }]
3
+ dependencies = [
4
+ "jaxtyping<0.3.0",
5
+ "numpy>=2.0",
6
+ "einops>=0.8.0",
7
+ "icecream>=2.1.3",
8
+ "opencv-python>=4.10.0",
9
+ "pyserde>=0.20.0",
10
+ "rerun-sdk>=0.27.0",
11
+ "tyro>=0.9.1",
12
+ "tqdm",
13
+ "hf-transfer>=0.1.9",
14
+ "lovely-numpy>=0.2.13,<0.3",
15
+ "pandas>=2.3.3",
16
+ "braceexpand>=0.1.7,<0.2",
17
+ "roma>=1.5.4,<2",
18
+ "pytorch-lightning>=2.5.6,<3",
19
+ "yacs>=0.1.8,<0.2",
20
+ "omegaconf>=2.3.0,<3",
21
+ "termcolor>=3.2.0,<4",
22
+ "gradio-rerun>=0.27.0",
23
+ "spaces>=0.43.0",
24
+ ]
25
+ name = "sam3d_body"
26
+ requires-python = ">= 3.12"
27
+ version = "0.1.0"
28
+
29
+
30
+ [build-system]
31
+ build-backend = "hatchling.build"
32
+ requires = ["hatchling"]
33
+
34
+ [tool.hatch.metadata]
35
+ allow-direct-references = true
36
+
37
+ [tool.pixi.workspace]
38
+ channels = ["conda-forge"]
39
+ platforms = ["linux-64"]
40
+ preview = ["pixi-build"]
41
+
42
+ [tool.pixi.pypi-options]
43
+ no-build-isolation = ["detectron2", "moge"]
44
+ [tool.pixi.pypi-options.dependency-overrides]
45
+ # Allow iopath >=0.1.10 even though detectron2 pins <0.1.10, so it can satisfy sam-2.
46
+ iopath = ">=0.1.10"
47
+ gradio = ">=5.45.0,<6"
48
+ [tool.pixi.pypi-dependencies]
49
+ sam3d_body = { path = ".", editable = true }
50
+ moge = { git = "https://github.com/microsoft/MoGe.git" }
51
+ simplecv = { git = "https://github.com/pablovela5620/simplecv.git", branch = "main" }
52
+ timm = ">=0.9"
53
+ transformers = { git = "https://github.com/huggingface/transformers.git", rev = "d08b98b965176ea9cf8c8e8b24995c955b7e2ec9" }
54
+ monopriors = { git = "https://github.com/pablovela5620/monoprior.git" }
55
+
56
+ [tool.pixi.tasks]
57
+ app = "python tool/gradio_sam3d_body.py"
58
+ cli = "python tool/demo.py --image-folder data/example-data"
59
+
60
+ [tool.pixi.feature.cuda129.system-requirements]
61
+ cuda = "12.9"
62
+
63
+ [tool.pixi.feature.cuda129.dependencies]
64
+ # CUDA Build Tools
65
+ cuda-compiler = "*"
66
+ cuda-version = "12.9.*"
67
+ cuda-cudart-dev = "*"
68
+ cuda-crt = "*"
69
+ libcusparse-dev = "*"
70
+ cuda-driver-dev = "*"
71
+ cuda-nvcc = "*"
72
+ cuda-nvrtc-dev = "*"
73
+ cuda-nvtx = "*"
74
+ cuda-nvtx-dev = "*"
75
+ cuda-nvml-dev = "*"
76
+ cuda-profiler-api = "*"
77
+
78
+ # CUDA Libraries
79
+ cudnn = "*"
80
+ libcublas-dev = "*"
81
+ libcudss-dev = "*"
82
+ libcufile-dev = "*"
83
+ libcufft-dev = "*"
84
+ libcurand-dev = "*"
85
+ libcusolver-dev = "*"
86
+ cusparselt = "*"
87
+ libnvjitlink = "*"
88
+ # cuda129 end
89
+
90
+ [tool.pixi.feature.gpu.dependencies]
91
+ pytorch-gpu = ">=2.8.0"
92
+ torchvision = "*"
93
+
94
+
95
+ [tool.pixi.feature.dev.dependencies]
96
+ beartype = "*"
97
+ pyrefly = ">=0.42.2,<0.43"
98
+ ruff = ">=0.14.5,<0.15"
99
+
100
+ [tool.pixi.feature.dev.pypi-dependencies]
101
+ types-tqdm = "*"
102
+
103
+ [tool.pixi.environments]
104
+ cuda128 = { features = [
105
+ "cuda129",
106
+ ], solve-group = "cuda129", no-default-feature = true }
107
+ default = { features = ["gpu", "cuda129"], solve-group = "cuda129" }
108
+ dev = { features = ["dev", "gpu", "cuda129"], solve-group = "cuda129" }
109
+
110
+ [tool.pixi.dependencies]
111
+ av = ">=16.0.1,<17"
112
+ gradio = ">=5.45.0,<6"
113
+ huggingface_hub = ">=1.0,<2"
114
+ tomlkit = "==0.12.0"
115
+ audioop-lts = "*"
116
+ pydub = "*"
117
+ open3d = ">=0.19.0,<0.20"
118
+
119
+ [tool.ruff]
120
+ line-length = 150
121
+
122
+ [tool.ruff.lint]
123
+ select = [
124
+ # pycodestyle
125
+ "E",
126
+ # Pyflakes
127
+ "F",
128
+ # pyupgrade
129
+ "UP",
130
+ # flake8-bugbear
131
+ "B",
132
+ # flake8-simplify
133
+ "SIM",
134
+ # isort
135
+ "I",
136
+ ]
137
+
138
+ ignore = [
139
+ "E501", # Line too long.
140
+ "F722", # Forward annotation false positive from jaxtyping. Should be caught by pyright.
141
+ "F821", # Forward annotation false positive from jaxtyping. Should be caught by pyright.
142
+ "UP037", # Remove quotes from type, false positive when using jaxtyping
143
+ "UP040", # Beartype fails if not using this for typealias
144
+
145
+ ]
146
+
147
+ [tool.pyrefly]
148
+ project-includes = ["**/*"]
149
+ project-excludes = ["**/node_modules", "**/__pycache__", "**/*venv/**/*"]
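Given the `[tool.pixi.tasks]` and `[tool.pixi.environments]` tables above, typical invocations look like the following (a hedged sketch; the task and environment names are exactly the ones defined in this file):

```bash
pixi task list                 # show the app/cli tasks defined above
pixi run app                   # default env: Gradio UI (tool/gradio_sam3d_body.py)
pixi run cli                   # default env: tool/demo.py on data/example-data
pixi run -e dev ruff check .   # dev env adds ruff, pyrefly, and beartype
```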
src/sam3d_body/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import os
+
+ # Only enable beartype when running in the 'dev' environment
+ # Check the PIXI_ENVIRONMENT_NAME environment variable set by pixi
+ if os.environ.get("PIXI_ENVIRONMENT_NAME") == "dev":
+     try:
+         from beartype.claw import beartype_this_package
+
+         beartype_this_package()
+     except ImportError:
+         # beartype not available even in dev environment
+         pass
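The guard above relies on pixi exporting `PIXI_ENVIRONMENT_NAME` for the spawned process, so runtime type checking switches on only in the `dev` environment. A hedged illustration of the intended behavior:

```bash
pixi run -e dev python -c "import sam3d_body"   # PIXI_ENVIRONMENT_NAME=dev, beartype hooks installed
pixi run python -c "import sam3d_body"          # default env, import-time checks skipped
```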
src/sam3d_body/api/demo.py ADDED
@@ -0,0 +1,241 @@
1
+ """Minimal standalone demo wiring for SAM 3D Body with Rerun visualization."""
2
+
3
+ import os
4
+ from dataclasses import dataclass
5
+ from glob import glob
6
+ from pathlib import Path
7
+ from typing import Literal, TypedDict
8
+
9
+ import cv2
10
+ import numpy as np
11
+ import rerun as rr
12
+ import rerun.blueprint as rrb
13
+ import torch
14
+ from jaxtyping import Float32, UInt8
15
+ from monopriors.relative_depth_models import BaseRelativePredictor, RelativeDepthPrediction, get_relative_predictor
16
+ from numpy import ndarray
17
+ from serde import serde
18
+ from simplecv.rerun_log_utils import RerunTyroConfig
19
+ from torch import Tensor
20
+ from tqdm import tqdm
21
+ from transformers.models.sam3 import Sam3Model, Sam3Processor
22
+ from yacs.config import CfgNode
23
+
24
+ from sam3d_body.api.visualization import create_view, set_annotation_context, visualize_sample
25
+ from sam3d_body.build_models import load_sam_3d_body, load_sam_3d_body_hf
26
+ from sam3d_body.models.meta_arch import SAM3DBody
27
+ from sam3d_body.sam_3d_body_estimator import FinalPosePrediction, SAM3DBodyEstimator
28
+
29
+
30
+ class SAM3ResultsDict(TypedDict):
31
+ """Torch-format outputs returned directly by ``Sam3Processor`` post-processing."""
32
+
33
+ scores: Float32[Tensor, "n"]
34
+ boxes: Float32[Tensor, "n 4"]
35
+ masks: Float32[Tensor, "n h w"]
36
+
37
+
38
+ @serde()
39
+ class SAM3Results:
40
+ scores: Float32[ndarray, "n"]
41
+ """Per-instance confidence scores ``[N]``."""
42
+ boxes: Float32[ndarray, "n 4"]
43
+ """Bounding boxes in XYXY pixel coordinates ``[N, 4]``."""
44
+ masks: Float32[ndarray, "n h w"]
45
+ """Probability masks for each detection ``[N, H, W]`` (float32 in ``[0, 1]``)."""
46
+
47
+
48
+ @dataclass
49
+ class SAM3Config:
50
+ """Configuration for loading a SAM3 checkpoint and selecting device."""
51
+
52
+ device: Literal["cpu", "cuda"] = "cuda"
53
+ """Computation device passed to the Hugging Face SAM3 model."""
54
+ sam3_checkpoint: str = "facebook/sam3"
55
+ """Model identifier or path accepted by ``Sam3Model.from_pretrained``."""
56
+
57
+
58
+ class SAM3Predictor:
59
+ """Lightweight wrapper around the SAM3 model for single-image inference."""
60
+
61
+ def __init__(self, config: SAM3Config):
62
+ self.config = config
63
+ self.sam3_model = Sam3Model.from_pretrained(config.sam3_checkpoint).to(config.device)
64
+ self.sam3_processor = Sam3Processor.from_pretrained(config.sam3_checkpoint)
65
+
66
+ def predict_single_image(self, rgb_hw3: UInt8[ndarray, "h w 3"], text: str = "person") -> SAM3Results:
67
+ """Run SAM3 instance segmentation on one RGB image.
68
+
69
+ Args:
70
+ rgb_hw3: Input image in RGB order with dtype ``uint8`` and shape ``[H, W, 3]``.
71
+ text: Optional prompt used by SAM3's text-conditioned decoder (default: ``"person"``).
72
+
73
+ Returns:
74
+ ``SAM3Results`` with NumPy copies of scores, XYXY boxes, and binary masks.
75
+ """
76
+ inputs = self.sam3_processor(
77
+ images=rgb_hw3,
78
+ text=text,
79
+ return_tensors="pt",
80
+ ).to(self.config.device)
81
+
82
+ with torch.no_grad():
83
+ outputs = self.sam3_model(**inputs)
84
+
85
+ results: SAM3ResultsDict = self.sam3_processor.post_process_instance_segmentation(
86
+ outputs, threshold=0.5, mask_threshold=0.5, target_sizes=inputs.get("original_sizes").tolist()
87
+ )[0]
88
+
89
+ mask_probs: Float32[ndarray, "n h w"] = results["masks"].detach().cpu().numpy().astype(np.float32, copy=False)
90
+
91
+ return SAM3Results(
92
+ scores=results["scores"].detach().cpu().numpy().astype(np.float32, copy=False),
93
+ boxes=results["boxes"].detach().cpu().numpy().astype(np.float32, copy=False),
94
+ masks=mask_probs,
95
+ )
96
+
97
+
98
+ @dataclass
99
+ class SAM3DBodyE2EConfig:
100
+ """Bundle of sub-configurations required for the end-to-end demo."""
101
+
102
+ sam3_config: SAM3Config
103
+ """Settings for the underlying SAM3 detector."""
104
+ fov_estimator: Literal["MogeV1Predictor"] = "MogeV1Predictor"
105
+ """Identifier of the relative depth/FOV estimator to load."""
106
+ mhr_path: Path = Path("checkpoints/sam-3d-body-dinov3/assets/mhr_model.pt")
107
+ """Path to the MHR mesh/pose asset file required by the head network."""
108
+ checkpoint_path: Path = Path("checkpoints/sam-3d-body-dinov3/model.ckpt")
109
+ """Core SAM 3D Body model checkpoint (.ckpt)."""
110
+
111
+
112
+ class SAM3DBodyE2E:
113
+ """Convenience facade that chains detection, FOV estimation, and 3D reconstruction."""
114
+
115
+ def __init__(self, config: SAM3DBodyE2EConfig):
116
+ self.sam3_predictor = SAM3Predictor(config.sam3_config)
117
+ self.fov_predictor: BaseRelativePredictor = get_relative_predictor(config.fov_estimator)(device="cuda")
118
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
119
+ # load_output: tuple[SAM3DBody, CfgNode] = load_sam_3d_body(
120
+ # config.checkpoint_path,
121
+ # device=device,
122
+ # mhr_path=config.mhr_path,
123
+ # )
124
+ load_output: tuple[SAM3DBody, CfgNode] = load_sam_3d_body_hf(repo_id="facebook/sam-3d-body-dinov3")
125
+ model: SAM3DBody = load_output[0]
126
+ self.sam3d_body_estimator = SAM3DBodyEstimator(
127
+ sam_3d_body_model=model,
128
+ )
129
+
130
+ def predict_single_image(
131
+ self, rgb_hw3: UInt8[ndarray, "h w 3"]
132
+ ) -> tuple[list[FinalPosePrediction], RelativeDepthPrediction]:
133
+ """Estimate 3D poses for a single frame.
134
+
135
+ Pipeline:
136
+ 1. Use the configured relative-depth predictor to derive camera intrinsics ``K_33``.
137
+ 2. Run SAM3 to obtain person masks and boxes.
138
+ 3. Feed detections and intrinsics into ``SAM3DBodyEstimator`` for per-person 3D bodies.
139
+
140
+ Args:
141
+ rgb_hw3: RGB image with shape ``[H, W, 3]`` and dtype ``uint8``.
142
+
143
+ Returns:
144
+ A list of ``FinalPosePrediction`` entries—one per detected person.
145
+ """
146
+ # estimate the camera intrinsics
147
+ relative_pred: RelativeDepthPrediction = self.fov_predictor(rgb=rgb_hw3, K_33=None)
148
+ K_33: Float32[ndarray, "3 3"] = relative_pred.K_33
149
+
150
+ sam3_results: SAM3Results = self.sam3_predictor.predict_single_image(rgb_hw3)
151
+
152
+ outputs: list[FinalPosePrediction] = self.sam3d_body_estimator.process_one_image(
153
+ rgb_hw3,
154
+ xyxy=sam3_results.boxes,
155
+ masks=sam3_results.masks,
156
+ masks_score=sam3_results.scores,
157
+ K_33=K_33,
158
+ )
159
+ return outputs, relative_pred
160
+
161
+
162
+ @dataclass(slots=True)
163
+ class Sam3DBodyDemoConfig:
164
+ """Configuration for the standalone demo runner."""
165
+
166
+ rr_config: RerunTyroConfig
167
+ """Viewer/runtime options for Rerun (window layout, recording, etc.)."""
168
+
169
+ sam3_e2e_config: SAM3DBodyE2EConfig
170
+ """Configuration for the end-to-end SAM 3D Body model."""
171
+
172
+ image_folder: Path | None = None
173
+ """Directory containing input images to process."""
174
+
175
+ image_path: Path | None = None
176
+ """Path to a single input image to process."""
177
+
178
+ max_frames: int | None = None
179
+ """Optional limit on the number of images to process; ``None`` processes all images."""
180
+
181
+
182
+ def main(cfg: Sam3DBodyDemoConfig):
183
+ """Run the Rerun-enabled demo on a folder or single image.
184
+
185
+ Args:
186
+ cfg: Aggregated configuration containing Rerun settings, SAM3 model options,
187
+ and input image selection.
188
+ """
189
+ # Setup Rerun
190
+ parent_log_path = Path("/world")
191
+ set_annotation_context()
192
+ view: rrb.ContainerLike = create_view()
193
+ blueprint = rrb.Blueprint(view, collapse_panels=True)
194
+ rr.send_blueprint(blueprint)
195
+ rr.log("/", rr.ViewCoordinates.RDF, static=True)
196
+
197
+ if cfg.image_path is not None:
198
+ images_list = [str(cfg.image_path)]
199
+ elif cfg.image_folder is not None:
200
+ image_extensions: list[str] = [
201
+ "*.jpg",
202
+ "*.jpeg",
203
+ "*.png",
204
+ "*.gif",
205
+ "*.bmp",
206
+ "*.tiff",
207
+ "*.webp",
208
+ ]
209
+ images_list: list[str] = sorted(
210
+ [image for ext in image_extensions for image in glob(os.path.join(cfg.image_folder, ext))]
211
+ )
212
+ else:
213
+ raise ValueError("Either image_path or image_folder must be specified.")
214
+
215
+ # load end to end model
216
+ sam3D_body_e2e = SAM3DBodyE2E(cfg.sam3_e2e_config)
217
+
218
+ for idx, image_path in enumerate(tqdm(images_list)):
219
+ rr.set_time(timeline="image_sequence", sequence=idx)
220
+ # load image and convert to RGB
221
+ bgr_hw3: UInt8[ndarray, "h w 3"] = cv2.imread(image_path)
222
+ rgb_hw3: UInt8[ndarray, "h w 3"] = cv2.cvtColor(bgr_hw3, cv2.COLOR_BGR2RGB)
223
+
224
+ outputs: tuple[list[FinalPosePrediction], RelativeDepthPrediction] = sam3D_body_e2e.predict_single_image(
225
+ rgb_hw3
226
+ )
227
+ pred_list: list[FinalPosePrediction] = outputs[0]
228
+ relative_pred: RelativeDepthPrediction = outputs[1]
229
+
230
+ if len(pred_list) == 0:
231
+ # Detector/FOV failed on this frame; avoid crashing the visualization step.
232
+ print(f"[warn] No detections for {image_path}; skipping.")
233
+ continue
234
+
235
+ visualize_sample(
236
+ pred_list=pred_list,
237
+ rgb_hw3=rgb_hw3,
238
+ parent_log_path=parent_log_path,
239
+ faces=sam3D_body_e2e.sam3d_body_estimator.faces,
240
+ relative_depth_pred=relative_pred,
241
+ )
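A short usage sketch for the end-to-end wrapper defined above (hedged: it assumes the gated checkpoints are already accessible and a CUDA device is present; not part of the commit):

```python
# Minimal single-image run using the classes from sam3d_body/api/demo.py.
import cv2

from sam3d_body.api.demo import SAM3Config, SAM3DBodyE2E, SAM3DBodyE2EConfig

config = SAM3DBodyE2EConfig(sam3_config=SAM3Config(device="cuda"))
pipeline = SAM3DBodyE2E(config)  # loads SAM3, the FOV/depth predictor, and SAM 3D Body

# Load a bundled example image and convert BGR -> RGB as main() does.
bgr = cv2.imread("data/example-data/Planche.jpg")
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

pred_list, relative_pred = pipeline.predict_single_image(rgb)
for pred in pred_list:
    print(pred.bbox, pred.pred_cam_t)  # per-person box and camera-frame translation
```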
src/sam3d_body/api/visualization.py ADDED
@@ -0,0 +1,425 @@
1
+ from pathlib import Path
2
+
3
+ import cv2
4
+ import numpy as np
5
+ import open3d as o3d
6
+ import rerun as rr
7
+ import rerun.blueprint as rrb
8
+ from jaxtyping import Bool, Float32, Int, UInt8
9
+ from monopriors.depth_utils import depth_edges_mask
10
+ from monopriors.relative_depth_models import RelativeDepthPrediction
11
+ from numpy import ndarray
12
+ from simplecv.camera_parameters import Extrinsics, Intrinsics, PinholeParameters
13
+ from simplecv.ops.pc_utils import estimate_voxel_size
14
+ from simplecv.rerun_log_utils import log_pinhole
15
+
16
+ from sam3d_body.metadata.mhr70 import MHR70_ID2NAME, MHR70_IDS, MHR70_LINKS
17
+ from sam3d_body.sam_3d_body_estimator import FinalPosePrediction
18
+
19
+ BOX_PALETTE: UInt8[np.ndarray, "n_colors 4"] = np.array(
20
+ [
21
+ [255, 99, 71, 255], # tomato
22
+ [65, 105, 225, 255], # royal blue
23
+ [60, 179, 113, 255], # medium sea green
24
+ [255, 215, 0, 255], # gold
25
+ [138, 43, 226, 255], # blue violet
26
+ [255, 140, 0, 255], # dark orange
27
+ [220, 20, 60, 255], # crimson
28
+ [70, 130, 180, 255], # steel blue
29
+ ],
30
+ dtype=np.uint8,
31
+ )
32
+
33
+ # Use a separate id range for segmentation classes to avoid clobbering the person class (id=0).
34
+ SEG_CLASS_OFFSET = 1000 # background = 1000, persons start at 1001
35
+ MAX_POINT_CLOUD_POINTS = 50_000
36
+ MIN_DEPTH_CONFIDENCE = 0.5
37
+
38
+
39
+ def filter_out_of_bounds(
40
+ uv: Float32[ndarray, "n_points 2"],
41
+ h: int,
42
+ w: int,
43
+ xyz_cam: Float32[ndarray, "n_points 3"] | None = None,
44
+ ) -> Float32[ndarray, "n_points 2"]:
45
+ """Return a copy of ``uv`` with off-screen (and optional behind-camera) points masked.
46
+
47
+ Args:
48
+ uv: Pixel coordinates ``[N, 2]`` in (u, v) order.
49
+ h: Image height in pixels.
50
+ w: Image width in pixels.
51
+ xyz_cam: Optional camera-frame coordinates ``[N, 3]`` to mask points with negative ``z``.
52
+
53
+ Returns:
54
+ Copy of ``uv`` where out-of-bounds rows are set to ``NaN`` so Rerun hides them.
55
+ """
56
+
57
+ uv_filtered: Float32[ndarray, "n_points 2"] = np.asarray(uv, dtype=np.float32).copy()
58
+
59
+ out_of_bounds: Bool[ndarray, "n_points"] = np.logical_or(uv_filtered[:, 0] >= float(w), uv_filtered[:, 0] < 0.0)
60
+ out_of_bounds = np.logical_or(out_of_bounds, uv_filtered[:, 1] >= float(h))
61
+ out_of_bounds = np.logical_or(out_of_bounds, uv_filtered[:, 1] < 0.0)
62
+
63
+ if xyz_cam is not None:
64
+ out_of_bounds = np.logical_or(out_of_bounds, xyz_cam[:, 2] < 0.0)
65
+
66
+ uv_filtered[out_of_bounds, :] = np.nan
67
+ return uv_filtered
68
+
69
+
70
+ def compute_vertex_normals(
71
+ verts: Float32[ndarray, "n_verts 3"],
72
+ faces: Int[ndarray, "n_faces 3"],
73
+ eps: float = 1e-12,
74
+ ) -> Float32[ndarray, "n_verts 3"]:
75
+ """Compute per-vertex normals for a single mesh.
76
+
77
+ Args:
78
+ verts: Float32 array of vertex positions with shape ``(n_verts, 3)``.
79
+ faces: Int array of triangle indices with shape ``(n_faces, 3)``.
80
+ eps: Small epsilon to avoid division by zero when normalizing.
81
+
82
+ Returns:
83
+ Float32 array of unit vertex normals with shape ``(n_verts, 3)``; zeros for degenerate vertices.
84
+ """
85
+
86
+ # Expand faces to vertex triplets and fetch their positions.
87
+ faces_i: Int[ndarray, "n_faces 3"] = faces.astype(np.int64)
88
+ v0: Float32[ndarray, "n_faces 3"] = verts[faces_i[:, 0]]
89
+ v1: Float32[ndarray, "n_faces 3"] = verts[faces_i[:, 1]]
90
+ v2: Float32[ndarray, "n_faces 3"] = verts[faces_i[:, 2]]
91
+
92
+ # Face normal = cross(edge1, edge2).
93
+ e1: Float32[ndarray, "n_faces 3"] = v1 - v0
94
+ e2: Float32[ndarray, "n_faces 3"] = v2 - v0
95
+ face_normals: Float32[ndarray, "n_faces 3"] = np.cross(e1, e2)
96
+
97
+ # Accumulate each face normal into its three vertices with a vectorized scatter-add.
98
+ vertex_normals: Float32[ndarray, "n_verts 3"] = np.zeros_like(verts, dtype=np.float32)
99
+ flat_indices: Int[ndarray, "n_faces3"] = faces_i.reshape(-1)
100
+ face_normals_repeated: Float32[ndarray, "n_faces3 3"] = np.repeat(face_normals, 3, axis=0)
101
+ np.add.at(vertex_normals, flat_indices, face_normals_repeated)
102
+
103
+ norms: Float32[ndarray, "n_verts 1"] = np.linalg.norm(vertex_normals, axis=-1, keepdims=True)
104
+ denom: Float32[ndarray, "n_verts 1"] = np.maximum(norms, eps).astype(np.float32)
105
+ vn_unit: Float32[ndarray, "n_verts 3"] = (vertex_normals / denom).astype(np.float32)
106
+ mask: ndarray = norms > eps
107
+ vn_unit = np.where(mask, vn_unit, np.float32(0.0))
108
+ return vn_unit
109
+
110
+
111
+ def export_meshes_to_glb(
112
+ pred_list: list[FinalPosePrediction],
113
+ faces: Int[ndarray, "n_faces 3"],
114
+ output_dir: Path,
115
+ box_palette: UInt8[ndarray, "n_colors 4"] = BOX_PALETTE,
116
+ center_mesh: bool = True,
117
+ ) -> list[Path]:
118
+ """Write one GLB per predicted mesh and return the file paths."""
119
+
120
+ output_dir.mkdir(parents=True, exist_ok=True)
121
+ written_paths: list[Path] = []
122
+ faces_int: Int[ndarray, "n_faces 3"] = np.ascontiguousarray(faces, dtype=np.int32)
123
+
124
+ for idx, output in enumerate(pred_list):
125
+ verts_cam: Float32[ndarray, "n_verts 3"] = np.ascontiguousarray(output.pred_vertices, dtype=np.float32)
126
+ cam_t: Float32[ndarray, "3"] = np.ascontiguousarray(output.pred_cam_t, dtype=np.float32)
127
+ # Convert to world coordinates to mirror the viewer logging convention (cam → world via translation).
128
+ verts_world: Float32[ndarray, "n_verts 3"] = np.ascontiguousarray(verts_cam + cam_t, dtype=np.float32)
129
+ verts_export: Float32[ndarray, "n_verts 3"]
130
+ verts_export = verts_world - np.mean(verts_world, axis=0, keepdims=True) if center_mesh else verts_world
131
+
132
+ vertex_normals: Float32[ndarray, "n_verts 3"] = compute_vertex_normals(verts_export, faces_int)
133
+
134
+ mesh = o3d.geometry.TriangleMesh()
135
+ mesh.vertices = o3d.utility.Vector3dVector(verts_export.astype(np.float64))
136
+ mesh.triangles = o3d.utility.Vector3iVector(faces_int.astype(np.int32))
137
+ mesh.vertex_normals = o3d.utility.Vector3dVector(vertex_normals.astype(np.float64))
138
+
139
+ color: Float32[ndarray, "3"] = box_palette[idx % len(box_palette), :3].astype(np.float32) / 255.0
140
+ vertex_colors: Float32[ndarray, "n_verts 3"] = np.repeat(color[np.newaxis, :], verts_export.shape[0], axis=0)
141
+ mesh.vertex_colors = o3d.utility.Vector3dVector(vertex_colors.astype(np.float64))
142
+
143
+ glb_path: Path = output_dir / f"person_{idx:02d}.glb"
144
+ success: bool = bool(
145
+ o3d.io.write_triangle_mesh(
146
+ str(glb_path),
147
+ mesh,
148
+ write_ascii=False,
149
+ write_vertex_normals=True,
150
+ write_vertex_colors=True,
151
+ )
152
+ )
153
+ if not success:
154
+ fallback_path: Path = output_dir / f"person_{idx:02d}.ply"
155
+ success = bool(
156
+ o3d.io.write_triangle_mesh(
157
+ str(fallback_path),
158
+ mesh,
159
+ write_ascii=False,
160
+ write_vertex_normals=True,
161
+ write_vertex_colors=True,
162
+ )
163
+ )
164
+ if success:
165
+ glb_path = fallback_path
166
+
167
+ if success:
168
+ written_paths.append(glb_path)
169
+
170
+ return written_paths
171
+
172
+
173
+ def set_annotation_context() -> None:
174
+ """Register MHR-70 semantic metadata so subsequent logs show names/edges and mask colors."""
175
+ # Base person class (for keypoints / boxes) uses id=0 (original), segmentation uses 1000+ to avoid clashes.
176
+ person_class = rr.ClassDescription(
177
+ info=rr.AnnotationInfo(id=0, label="Person", color=(0, 0, 255)),
178
+ keypoint_annotations=[rr.AnnotationInfo(id=idx, label=name) for idx, name in MHR70_ID2NAME.items()],
179
+ keypoint_connections=MHR70_LINKS,
180
+ )
181
+
182
+ # Segmentation classes: id=SEG_CLASS_OFFSET background, ids SEG_CLASS_OFFSET+1..n for each instance color.
183
+ seg_classes: list[rr.ClassDescription] = [
184
+ rr.ClassDescription(info=rr.AnnotationInfo(id=SEG_CLASS_OFFSET, label="Background", color=(64, 64, 64))),
185
+ ]
186
+ for idx, color in enumerate(BOX_PALETTE[:, :3].tolist(), start=1):
187
+ seg_classes.append(
188
+ rr.ClassDescription(
189
+ info=rr.AnnotationInfo(
190
+ id=SEG_CLASS_OFFSET + idx, label=f"Person-{idx}", color=tuple(int(c) for c in color)
191
+ ),
192
+ )
193
+ )
194
+
195
+ rr.log(
196
+ "/",
197
+ rr.AnnotationContext([person_class, *seg_classes]),
198
+ static=True,
199
+ )
200
+
201
+
202
+ def visualize_sample(
203
+ pred_list: list[FinalPosePrediction],
204
+ rgb_hw3: UInt8[ndarray, "h w 3"],
205
+ parent_log_path: Path,
206
+ faces: Int[ndarray, "n_faces 3"],
207
+ relative_depth_pred: RelativeDepthPrediction | None = None,
208
+ ) -> None:
209
+ h: int = rgb_hw3.shape[0]
210
+ w: int = rgb_hw3.shape[1]
211
+ cam_log_path: Path = parent_log_path / "cam"
212
+ pinhole_log_path: Path = cam_log_path / "pinhole"
213
+ image_log_path: Path = pinhole_log_path / "image"
214
+ pred_log_path: Path = pinhole_log_path / "pred"
215
+ # log the pinhole camera parameters (assume fx=fy and center at image center)
216
+ focal_length: float = float(pred_list[0].focal_length)
217
+ intri: Intrinsics = Intrinsics(
218
+ camera_conventions="RDF",
219
+ fl_x=focal_length,
220
+ fl_y=focal_length,
221
+ cx=float(w) / 2.0,
222
+ cy=float(h) / 2.0,
223
+ height=h,
224
+ width=w,
225
+ )
226
+ world_T_cam: Float32[ndarray, "4 4"] = np.eye(4, dtype=np.float32)
227
+ extri: Extrinsics = Extrinsics(
228
+ world_R_cam=world_T_cam[:3, :3],
229
+ world_t_cam=world_T_cam[:3, 3],
230
+ )
231
+
232
+ pinhole_params: PinholeParameters = PinholeParameters(intrinsics=intri, extrinsics=extri, name="pinhole")
233
+ log_pinhole(camera=pinhole_params, cam_log_path=cam_log_path)
234
+ # clear the previous pred logs
235
+ rr.log(f"{pred_log_path}", rr.Clear(recursive=True))
236
+ rr.log(f"{image_log_path}", rr.Image(rgb_hw3, color_model=rr.ColorModel.RGB).compress(jpeg_quality=90))
237
+
238
+ # Build per-pixel maps (SEG_CLASS_OFFSET = background). Also build RGBA overlay with transparent background.
239
+ seg_map: Int[ndarray, "h w"] = np.full((h, w), SEG_CLASS_OFFSET, dtype=np.int32)
240
+ seg_overlay: UInt8[ndarray, "h w 4"] = np.zeros((h, w, 4), dtype=np.uint8)
241
+ human_mask: Bool[ndarray, "h w"] = np.zeros((h, w), dtype=bool)
242
+
243
+ mesh_root_path: Path = parent_log_path / "pred"
244
+ rr.log(str(mesh_root_path), rr.Clear(recursive=True))
245
+
246
+ for i, output in enumerate(pred_list):
247
+ box_color: UInt8[ndarray, "1 4"] = BOX_PALETTE[i % len(BOX_PALETTE)].reshape(1, 4)
248
+ rr.log(
249
+ f"{pred_log_path}/bbox_{i}",
250
+ rr.Boxes2D(
251
+ array=output.bbox,
252
+ array_format=rr.Box2DFormat.XYXY,
253
+ class_ids=0,
254
+ colors=box_color,
255
+ show_labels=True,
256
+ ),
257
+ )
258
+
259
+ kpts_cam: Float32[ndarray, "n_kpts 3"] = np.ascontiguousarray(output.pred_keypoints_3d, dtype=np.float32)
260
+ kpts_uv: Float32[ndarray, "n_kpts 2"] = np.ascontiguousarray(output.pred_keypoints_2d, dtype=np.float32)
261
+ kpts_uv_in_bounds: Float32[ndarray, "n_kpts 2"] = filter_out_of_bounds(
262
+ uv=kpts_uv,
263
+ h=h,
264
+ w=w,
265
+ xyz_cam=None, # Depth sign from the model can be negative; only cull by image bounds.
266
+ )
267
+ rr.log(
268
+ f"{pred_log_path}/uv_{i}",
269
+ rr.Points2D(
270
+ positions=kpts_uv_in_bounds,
271
+ keypoint_ids=MHR70_IDS,
272
+ class_ids=0,
273
+ colors=(0, 255, 0),
274
+ ),
275
+ )
276
+
277
+ # Accumulate segmentation masks (if present) into a single segmentation image.
278
+ mask = output.mask
279
+ if mask is not None:
280
+ mask_arr: ndarray = np.asarray(mask).squeeze()
281
+ if mask_arr.shape != seg_map.shape:
282
+ mask_arr = cv2.resize(
283
+ mask_arr.astype(np.uint8), (seg_map.shape[1], seg_map.shape[0]), interpolation=cv2.INTER_NEAREST
284
+ )
285
+ mask_bool = mask_arr.astype(bool)
286
+ human_mask = np.logical_or(human_mask, mask_bool)
287
+ seg_id = SEG_CLASS_OFFSET + i + 1 # keep person class (0) separate from seg classes
288
+ seg_map = np.where(mask_bool, np.int32(seg_id), seg_map)
289
+
290
+ # Color overlay for this instance, background stays transparent.
291
+ color = BOX_PALETTE[i % len(BOX_PALETTE), :3]
292
+ seg_overlay[mask_bool] = np.array([color[0], color[1], color[2], 120], dtype=np.uint8)
293
+
294
+ # Log 3D keypoints in world coordinates
295
+ cam_t: Float32[ndarray, "3"] = np.ascontiguousarray(output.pred_cam_t, dtype=np.float32)
296
+ kpts_world: Float32[ndarray, "n_kpts 3"] = np.ascontiguousarray(kpts_cam + cam_t, dtype=np.float32)
297
+ rr.log(
298
+ f"{parent_log_path}/pred/kpts3d_{i}",
299
+ rr.Points3D(
300
+ positions=kpts_world,
301
+ keypoint_ids=MHR70_IDS,
302
+ class_ids=0,
303
+ colors=(0, 255, 0),
304
+ ),
305
+ )
306
+
307
+ # Log the full-body mesh in world coordinates so it shows in 3D
308
+ verts_cam: Float32[ndarray, "n_verts 3"] = np.ascontiguousarray(output.pred_vertices, dtype=np.float32)
309
+ verts_world: Float32[ndarray, "n_verts 3"] = np.ascontiguousarray(verts_cam + cam_t, dtype=np.float32)
310
+ faces_int: Int[ndarray, "n_faces 3"] = np.ascontiguousarray(faces, dtype=np.int32)
311
+ vertex_normals: Float32[ndarray, "n_verts 3"] = compute_vertex_normals(verts_world, faces_int)
312
+ rr.log(
313
+ f"{parent_log_path}/pred/mesh_{i}",
314
+ rr.Mesh3D(
315
+ vertex_positions=verts_world,
316
+ triangle_indices=faces_int,
317
+ vertex_normals=vertex_normals,
318
+ albedo_factor=(
319
+ float(box_color[0, 0]) / 255.0,
320
+ float(box_color[0, 1]) / 255.0,
321
+ float(box_color[0, 2]) / 255.0,
322
+ 0.35,
323
+ ),
324
+ ),
325
+ )
326
+
327
+ # Log segmentation ids (full map) and an RGBA overlay with transparent background.
328
+ if np.any(seg_map != SEG_CLASS_OFFSET):
329
+ rr.log(f"{pred_log_path}/segmentation_ids", rr.SegmentationImage(seg_map))
330
+ rr.log(f"{pred_log_path}/segmentation_overlay", rr.Image(seg_overlay, color_model=rr.ColorModel.RGBA))
331
+
332
+ # Optionally log depth and a background-only point cloud (for 3D view only).
333
+ if relative_depth_pred is not None:
334
+ depth_hw: Float32[ndarray, "h w"] = np.asarray(relative_depth_pred.depth, dtype=np.float32)
335
+ conf_hw: Float32[ndarray, "h w"] = np.asarray(relative_depth_pred.confidence, dtype=np.float32)
336
+ if depth_hw.shape != (h, w):
337
+ depth_hw = cv2.resize(depth_hw, (w, h), interpolation=cv2.INTER_NEAREST)
338
+ if conf_hw.shape != (h, w):
339
+ conf_hw = cv2.resize(conf_hw, (w, h), interpolation=cv2.INTER_NEAREST)
340
+ depth_hw = np.nan_to_num(depth_hw, nan=0.0, posinf=0.0, neginf=0.0)
341
+
342
+ # Remove flying pixels along depth discontinuities.
343
+ edges_mask: Bool[ndarray, "h w"] = depth_edges_mask(depth_hw, threshold=0.01)
344
+ depth_hw = depth_hw * np.logical_not(edges_mask)
345
+
346
+ # Remove low-confidence pixels.
347
+ conf_mask: Bool[ndarray, "h w"] = conf_hw >= MIN_DEPTH_CONFIDENCE
348
+ depth_hw = depth_hw * conf_mask
349
+
350
+ background_mask: Bool[ndarray, "h w"] = np.logical_not(human_mask)
351
+ depth_bg: Float32[ndarray, "h w"] = depth_hw * background_mask
352
+
353
+ # Log depth image (not referenced by the 2D blueprint).
354
+ # rr.log(f"{pinhole_log_path}/depth", rr.DepthImage(depth_bg, meter=1.0))
355
+
356
+ fx: float = float(relative_depth_pred.K_33[0, 0])
357
+ fy: float = float(relative_depth_pred.K_33[1, 1])
358
+ cx: float = float(relative_depth_pred.K_33[0, 2])
359
+ cy: float = float(relative_depth_pred.K_33[1, 2])
360
+
361
+ u: Float32[ndarray, "w"] = np.arange(w, dtype=np.float32)
362
+ v: Float32[ndarray, "h"] = np.arange(h, dtype=np.float32)
363
+ uu: Float32[ndarray, "h w"]
364
+ vv: Float32[ndarray, "h w"]
365
+ uu, vv = np.meshgrid(u, v)
366
+
367
+ z_cam: Float32[ndarray, "h w"] = depth_bg
368
+ valid: Bool[ndarray, "h w"] = np.logical_and(z_cam > 0.0, np.isfinite(z_cam))
369
+ if np.any(valid):
370
+ x_cam: Float32[ndarray, "h w"] = (uu - cx) * z_cam / fx
371
+ y_cam: Float32[ndarray, "h w"] = (vv - cy) * z_cam / fy
372
+ points_cam: Float32[ndarray, "h w 3"] = np.stack([x_cam, y_cam, z_cam], axis=-1)
373
+
374
+ points_flat: Float32[ndarray, "n_valid 3"] = points_cam[valid]
375
+ colors_flat: UInt8[ndarray, "n_valid 3"] = rgb_hw3[valid]
376
+
377
+ if points_flat.shape[0] > MAX_POINT_CLOUD_POINTS:
378
+ voxel_size: float = estimate_voxel_size(
379
+ points_flat, target_points=MAX_POINT_CLOUD_POINTS, tolerance=0.25
380
+ )
381
+ pcd: o3d.geometry.PointCloud = o3d.geometry.PointCloud()
382
+ pcd.points = o3d.utility.Vector3dVector(points_flat)
383
+ pcd.colors = o3d.utility.Vector3dVector(colors_flat.astype(np.float32) / 255.0)
384
+ pcd_ds: o3d.geometry.PointCloud = pcd.voxel_down_sample(voxel_size)
385
+ points_flat = np.asarray(pcd_ds.points, dtype=np.float32)
386
+ colors_flat = (np.asarray(pcd_ds.colors, dtype=np.float32) * 255.0).astype(np.uint8)
387
+
388
+ rr.log(
389
+ f"{parent_log_path}/depth_point_cloud",
390
+ rr.Points3D(
391
+ positions=points_flat,
392
+ colors=colors_flat,
393
+ ),
394
+ )
395
+
396
+
397
+ def create_view() -> rrb.ContainerLike:
398
+ view_2d = rrb.Vertical(
399
+ contents=[
400
+ # Top: people-only overlay on the RGB image.
401
+ rrb.Spatial2DView(
402
+ name="image",
403
+ origin="/world/cam/pinhole",
404
+ contents=[
405
+ "/world/cam/pinhole/image",
406
+ "/world/cam/pinhole/pred/segmentation_overlay",
407
+ ],
408
+ ),
409
+ # Bottom: 2D boxes + keypoints; segmentation hidden.
410
+ rrb.Spatial2DView(
411
+ name="mhr",
412
+ origin="/world/cam/pinhole",
413
+ contents=[
414
+ "/world/cam/pinhole/image",
415
+ "/world/cam/pinhole/pred/**",
416
+ "- /world/cam/pinhole/pred/segmentation_overlay/**",
417
+ "- /world/cam/pinhole/pred/segmentation_ids/**",
418
+ ],
419
+ ),
420
+ ],
421
+ )
422
+ view_3d = rrb.Spatial3DView(name="mhr_3d", line_grid=rrb.LineGrid3D(visible=False))
423
+ main_view = rrb.Horizontal(contents=[view_2d, view_3d], column_shares=[2, 3])
424
+ view = rrb.Tabs(contents=[main_view], name="sam-3d-body-demo")
425
+ return view
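For reference, a minimal sketch of driving these helpers outside the Gradio app, assuming `set_annotation_context`, `visualize_sample`, and `create_view` are importable and that predictions were already computed elsewhere; the application id is illustrative:

import rerun as rr
import rerun.blueprint as rrb

rr.init("sam3d_body_preview", spawn=True)  # open a local Rerun viewer (illustrative app id)
rr.send_blueprint(rrb.Blueprint(create_view(), collapse_panels=True))
set_annotation_context()  # register the person/segmentation classes once, as static data
rr.log("/", rr.ViewCoordinates.RDF, static=True)  # match the RDF convention assumed above
# visualize_sample(pred_list, rgb_hw3, Path("/world"), faces)  # then log one frame of predictions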
src/sam3d_body/build_models.py ADDED
@@ -0,0 +1,56 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import os
3
+ from os import PathLike
4
+
5
+ import torch
6
+
7
+ from .models.meta_arch import SAM3DBody
8
+ from .utils.checkpoint import load_state_dict
9
+ from .utils.config import CN, get_config
10
+
11
+
12
+ def load_sam_3d_body(
13
+ checkpoint_path: str | PathLike[str] = "",
14
+ device: str | torch.device = "cuda",
15
+ mhr_path: str | PathLike[str] = "",
16
+ ) -> tuple[SAM3DBody, CN]:
17
+ print("Loading SAM 3D Body model...")
18
+
19
+ checkpoint_path = os.fspath(checkpoint_path)
20
+ mhr_path = os.fspath(mhr_path)
21
+
22
+ # Check the current directory, and if not present check the parent dir.
23
+ model_cfg = os.path.join(os.path.dirname(checkpoint_path), "model_config.yaml")
24
+ if not os.path.exists(model_cfg):
25
+ # Looks at parent dir
26
+ model_cfg = os.path.join(os.path.dirname(os.path.dirname(checkpoint_path)), "model_config.yaml")
27
+
28
+ model_cfg = get_config(model_cfg)
29
+
30
+ # Point the MHR head at the provided MHR model assets
31
+ model_cfg.defrost()
32
+ model_cfg.MODEL.MHR_HEAD.MHR_MODEL_PATH = mhr_path
33
+ model_cfg.freeze()
34
+
35
+ # Initialize the model
36
+ model = SAM3DBody(model_cfg)
37
+
38
+ checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
39
+ state_dict = checkpoint.get("state_dict", checkpoint)
40
+ load_state_dict(model, state_dict, strict=False)
41
+
42
+ model = model.to(device)
43
+ model.eval()
44
+ return model, model_cfg
45
+
46
+
47
+ def _hf_download(repo_id):
48
+ from huggingface_hub import snapshot_download
49
+
50
+ local_dir = snapshot_download(repo_id=repo_id)
51
+ return os.path.join(local_dir, "model.ckpt"), os.path.join(local_dir, "assets", "mhr_model.pt")
52
+
53
+
54
+ def load_sam_3d_body_hf(repo_id, **kwargs):
55
+ ckpt_path, mhr_path = _hf_download(repo_id)
56
+ return load_sam_3d_body(checkpoint_path=ckpt_path, mhr_path=mhr_path, **kwargs)
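A hedged usage sketch; the repo id below is a placeholder, and the expected checkpoint layout (model.ckpt plus assets/mhr_model.pt) follows _hf_download above:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
# Placeholder repo id -- substitute the actual SAM 3D Body checkpoint repo.
model, cfg = load_sam_3d_body_hf("your-org/sam-3d-body", device=device)
# Or with local files:
# model, cfg = load_sam_3d_body("weights/model.ckpt", device=device, mhr_path="weights/assets/mhr_model.pt")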
src/sam3d_body/data/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
src/sam3d_body/data/transforms/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from .bbox_utils import (
4
+ bbox_cs2xywh,
5
+ bbox_cs2xyxy,
6
+ bbox_xywh2cs,
7
+ bbox_xywh2xyxy,
8
+ bbox_xyxy2cs,
9
+ bbox_xyxy2xywh,
10
+ flip_bbox,
11
+ get_udp_warp_matrix,
12
+ get_warp_matrix,
13
+ )
14
+ from .common import (
15
+ Compose,
16
+ GetBBoxCenterScale,
17
+ NormalizeKeypoint,
18
+ SquarePad,
19
+ TopdownAffine,
20
+ VisionTransformWrapper,
21
+ )
src/sam3d_body/data/transforms/bbox_utils.py ADDED
@@ -0,0 +1,380 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import math
4
+
5
+ import cv2
6
+ import numpy as np
7
+
8
+
9
+ def bbox_xyxy2xywh(bbox_xyxy: np.ndarray) -> np.ndarray:
10
+ """Transform the bbox format from x1y1x2y2 to xywh.
11
+
12
+ Args:
13
+ bbox_xyxy (np.ndarray): Bounding boxes (with scores), shaped (n, 4) or
14
+ (n, 5). (left, top, right, bottom, [score])
15
+
16
+ Returns:
17
+ np.ndarray: Bounding boxes (with scores),
18
+ shaped (n, 4) or (n, 5). (left, top, width, height, [score])
19
+ """
20
+ bbox_xywh = bbox_xyxy.copy()
21
+ bbox_xywh[:, 2] = bbox_xywh[:, 2] - bbox_xywh[:, 0]
22
+ bbox_xywh[:, 3] = bbox_xywh[:, 3] - bbox_xywh[:, 1]
23
+
24
+ return bbox_xywh
25
+
26
+
27
+ def bbox_xywh2xyxy(bbox_xywh: np.ndarray) -> np.ndarray:
28
+ """Transform the bbox format from xywh to x1y1x2y2.
29
+
30
+ Args:
31
+ bbox_xywh (ndarray): Bounding boxes (with scores),
32
+ shaped (n, 4) or (n, 5). (left, top, width, height, [score])
33
+ Returns:
34
+ np.ndarray: Bounding boxes (with scores), shaped (n, 4) or
35
+ (n, 5). (left, top, right, bottom, [score])
36
+ """
37
+ bbox_xyxy = bbox_xywh.copy()
38
+ bbox_xyxy[:, 2] = bbox_xyxy[:, 2] + bbox_xyxy[:, 0]
39
+ bbox_xyxy[:, 3] = bbox_xyxy[:, 3] + bbox_xyxy[:, 1]
40
+
41
+ return bbox_xyxy
42
+
43
+
44
+ def bbox_xyxy2cs(bbox: np.ndarray, padding: float = 1.0) -> tuple[np.ndarray, np.ndarray]:
45
+ """Transform the bbox format from (x,y,w,h) into (center, scale)
46
+
47
+ Args:
48
+ bbox (ndarray): Bounding box(es) in shape (4,) or (n, 4), formatted
49
+ as (left, top, right, bottom)
50
+ padding (float): BBox padding factor that will be multiplied with scale.
51
+ Default: 1.0
52
+
53
+ Returns:
54
+ tuple: A tuple containing center and scale.
55
+ - np.ndarray[float32]: Center (x, y) of the bbox in shape (2,) or
56
+ (n, 2)
57
+ - np.ndarray[float32]: Scale (w, h) of the bbox in shape (2,) or
58
+ (n, 2)
59
+ """
60
+ # convert single bbox from (4, ) to (1, 4)
61
+ dim = bbox.ndim
62
+ if dim == 1:
63
+ bbox = bbox[None, :]
64
+
65
+ x1, y1, x2, y2 = np.hsplit(bbox, [1, 2, 3])
66
+ center = np.hstack([x1 + x2, y1 + y2]) * 0.5
67
+ scale = np.hstack([x2 - x1, y2 - y1]) * padding
68
+
69
+ if dim == 1:
70
+ center = center[0]
71
+ scale = scale[0]
72
+
73
+ return center, scale
74
+
75
+
76
+ def bbox_xywh2cs(bbox: np.ndarray, padding: float = 1.0) -> tuple[np.ndarray, np.ndarray]:
77
+ """Transform the bbox format from (x,y,w,h) into (center, scale)
78
+
79
+ Args:
80
+ bbox (ndarray): Bounding box(es) in shape (4,) or (n, 4), formatted
81
+ as (x, y, w, h)
82
+ padding (float): BBox padding factor that will be multiplied with scale.
83
+ Default: 1.0
84
+
85
+ Returns:
86
+ tuple: A tuple containing center and scale.
87
+ - np.ndarray[float32]: Center (x, y) of the bbox in shape (2,) or
88
+ (n, 2)
89
+ - np.ndarray[float32]: Scale (w, h) of the bbox in shape (2,) or
90
+ (n, 2)
91
+ """
92
+
93
+ # convert single bbox from (4, ) to (1, 4)
94
+ dim = bbox.ndim
95
+ if dim == 1:
96
+ bbox = bbox[None, :]
97
+
98
+ x, y, w, h = np.hsplit(bbox, [1, 2, 3])
99
+ center = np.hstack([x + w * 0.5, y + h * 0.5])
100
+ scale = np.hstack([w, h]) * padding
101
+
102
+ if dim == 1:
103
+ center = center[0]
104
+ scale = scale[0]
105
+
106
+ return center, scale
107
+
108
+
109
+ def bbox_cs2xyxy(center: np.ndarray, scale: np.ndarray, padding: float = 1.0) -> np.ndarray:
110
+ """Transform the bbox format from (center, scale) to (x1,y1,x2,y2).
111
+
112
+ Args:
113
+ center (ndarray): BBox center (x, y) in shape (2,) or (n, 2)
114
+ scale (ndarray): BBox scale (w, h) in shape (2,) or (n, 2)
115
+ padding (float): BBox padding factor that will be multiplied with scale.
116
+ Default: 1.0
117
+
118
+ Returns:
119
+ ndarray[float32]: BBox (x1, y1, x2, y2) in shape (4, ) or (n, 4)
120
+ """
121
+
122
+ dim = center.ndim
123
+ assert scale.ndim == dim
124
+
125
+ if dim == 1:
126
+ center = center[None, :]
127
+ scale = scale[None, :]
128
+
129
+ wh = scale / padding
130
+ xy = center - 0.5 * wh
131
+ bbox = np.hstack((xy, xy + wh))
132
+
133
+ if dim == 1:
134
+ bbox = bbox[0]
135
+
136
+ return bbox
137
+
138
+
139
+ def bbox_cs2xywh(center: np.ndarray, scale: np.ndarray, padding: float = 1.0) -> np.ndarray:
140
+ """Transform the bbox format from (center, scale) to (x,y,w,h).
141
+
142
+ Args:
143
+ center (ndarray): BBox center (x, y) in shape (2,) or (n, 2)
144
+ scale (ndarray): BBox scale (w, h) in shape (2,) or (n, 2)
145
+ padding (float): BBox padding factor that will be multiplied with scale.
146
+ Default: 1.0
147
+
148
+ Returns:
149
+ ndarray[float32]: BBox (x, y, w, h) in shape (4, ) or (n, 4)
150
+ """
151
+
152
+ dim = center.ndim
153
+ assert scale.ndim == dim
154
+
155
+ if dim == 1:
156
+ center = center[None, :]
157
+ scale = scale[None, :]
158
+
159
+ wh = scale / padding
160
+ xy = center - 0.5 * wh
161
+ bbox = np.hstack((xy, wh))
162
+
163
+ if dim == 1:
164
+ bbox = bbox[0]
165
+
166
+ return bbox
167
+
168
+
169
+ def flip_bbox(
170
+ bbox: np.ndarray,
171
+ image_size: tuple[int, int],
172
+ bbox_format: str = "xywh",
173
+ direction: str = "horizontal",
174
+ ) -> np.ndarray:
175
+ """Flip the bbox in the given direction.
176
+
177
+ Args:
178
+ bbox (np.ndarray): The bounding boxes. The shape should be (..., 4)
179
+ if ``bbox_format`` is ``'xyxy'`` or ``'xywh'``, and (..., 2) if
180
+ ``bbox_format`` is ``'center'``
181
+ image_size (tuple): The image shape in [w, h]
182
+ bbox_format (str): The bbox format. Options are ``'xywh'``, ``'xyxy'``
183
+ and ``'center'``.
184
+ direction (str): The flip direction. Options are ``'horizontal'``,
185
+ ``'vertical'`` and ``'diagonal'``. Defaults to ``'horizontal'``
186
+
187
+ Returns:
188
+ np.ndarray: The flipped bounding boxes.
189
+ """
190
+ direction_options = {"horizontal", "vertical", "diagonal"}
191
+ assert direction in direction_options, f'Invalid flipping direction "{direction}". Options are {direction_options}'
192
+
193
+ format_options = {"xywh", "xyxy", "center"}
194
+ assert bbox_format in format_options, f'Invalid bbox format "{bbox_format}". Options are {format_options}'
195
+
196
+ bbox_flipped = bbox.copy()
197
+ w, h = image_size
198
+
199
+ if direction == "horizontal":
200
+ if bbox_format == "xywh" or bbox_format == "center":
201
+ bbox_flipped[..., 0] = w - bbox[..., 0] - 1
202
+ elif bbox_format == "xyxy":
203
+ bbox_flipped[..., ::2] = w - bbox[..., ::2] - 1
204
+ elif direction == "vertical":
205
+ if bbox_format == "xywh" or bbox_format == "center":
206
+ bbox_flipped[..., 1] = h - bbox[..., 1] - 1
207
+ elif bbox_format == "xyxy":
208
+ bbox_flipped[..., 1::2] = h - bbox[..., 1::2] - 1
209
+ elif direction == "diagonal":
210
+ if bbox_format == "xywh" or bbox_format == "center":
211
+ bbox_flipped[..., :2] = [w, h] - bbox[..., :2] - 1
212
+ elif bbox_format == "xyxy":
213
+ bbox_flipped[...] = [w, h, w, h] - bbox - 1
214
+
215
+ return bbox_flipped
216
+
217
+
218
+ def fix_aspect_ratio(bbox_scale: np.ndarray, aspect_ratio: float):
219
+ """Reshape the bbox to a fixed aspect ratio.
220
+
221
+ Args:
222
+ bbox_scale (np.ndarray): The bbox scales (w, h) in shape (n, 2)
223
+ aspect_ratio (float): The ratio of ``w/h``
224
+
225
+ Returns:
226
+ np.ndarray: The reshaped bbox scales in (n, 2)
227
+ """
228
+ dim = bbox_scale.ndim
229
+ if dim == 1:
230
+ bbox_scale = bbox_scale[None, :]
231
+
232
+ w, h = np.hsplit(bbox_scale, [1])
233
+ bbox_scale = np.where(
234
+ w > h * aspect_ratio,
235
+ np.hstack([w, w / aspect_ratio]),
236
+ np.hstack([h * aspect_ratio, h]),
237
+ )
238
+ if dim == 1:
239
+ bbox_scale = bbox_scale[0]
240
+
241
+ return bbox_scale
242
+
243
+
244
+ def get_udp_warp_matrix(
245
+ center: np.ndarray,
246
+ scale: np.ndarray,
247
+ rot: float,
248
+ output_size: tuple[int, int],
249
+ ) -> np.ndarray:
250
+ """Calculate the affine transformation matrix under the unbiased
251
+ constraint. See `UDP (CVPR 2020)`_ for details.
252
+
253
+ Note:
254
+
255
+ - The bbox number: N
256
+
257
+ Args:
258
+ center (np.ndarray[2, ]): Center of the bounding box (x, y).
259
+ scale (np.ndarray[2, ]): Scale of the bounding box
260
+ wrt [width, height].
261
+ rot (float): Rotation angle (degree).
262
+ output_size (tuple): Size ([w, h]) of the output image
263
+
264
+ Returns:
265
+ np.ndarray: A 2x3 transformation matrix
266
+
267
+ .. _`UDP (CVPR 2020)`: https://arxiv.org/abs/1911.07524
268
+ """
269
+ assert len(center) == 2
270
+ assert len(scale) == 2
271
+ assert len(output_size) == 2
272
+
273
+ input_size = center * 2
274
+ rot_rad = np.deg2rad(rot)
275
+ warp_mat = np.zeros((2, 3), dtype=np.float32)
276
+ scale_x = (output_size[0] - 1) / scale[0]
277
+ scale_y = (output_size[1] - 1) / scale[1]
278
+ warp_mat[0, 0] = math.cos(rot_rad) * scale_x
279
+ warp_mat[0, 1] = -math.sin(rot_rad) * scale_x
280
+ warp_mat[0, 2] = scale_x * (
281
+ -0.5 * input_size[0] * math.cos(rot_rad) + 0.5 * input_size[1] * math.sin(rot_rad) + 0.5 * scale[0]
282
+ )
283
+ warp_mat[1, 0] = math.sin(rot_rad) * scale_y
284
+ warp_mat[1, 1] = math.cos(rot_rad) * scale_y
285
+ warp_mat[1, 2] = scale_y * (
286
+ -0.5 * input_size[0] * math.sin(rot_rad) - 0.5 * input_size[1] * math.cos(rot_rad) + 0.5 * scale[1]
287
+ )
288
+ return warp_mat
289
+
290
+
291
+ def get_warp_matrix(
292
+ center: np.ndarray,
293
+ scale: np.ndarray,
294
+ rot: float,
295
+ output_size: tuple[int, int],
296
+ shift: tuple[float, float] = (0.0, 0.0),
297
+ inv: bool = False,
298
+ ) -> np.ndarray:
299
+ """Calculate the affine transformation matrix that can warp the bbox area
300
+ in the input image to the output size.
301
+
302
+ Args:
303
+ center (np.ndarray[2, ]): Center of the bounding box (x, y).
304
+ scale (np.ndarray[2, ]): Scale of the bounding box
305
+ wrt [width, height].
306
+ rot (float): Rotation angle (degree).
307
+ output_size (np.ndarray[2, ] | list(2,)): Size of the
308
+ destination heatmaps.
309
+ shift (0-100%): Shift translation ratio wrt the width/height.
310
+ Default (0., 0.).
311
+ inv (bool): Option to inverse the affine transform direction.
312
+ (inv=False: src->dst or inv=True: dst->src)
313
+
314
+ Returns:
315
+ np.ndarray: A 2x3 transformation matrix
316
+ """
317
+ assert len(center) == 2
318
+ assert len(scale) == 2
319
+ assert len(output_size) == 2
320
+ assert len(shift) == 2
321
+
322
+ shift = np.array(shift)
323
+ src_w = scale[0]
324
+ dst_w = output_size[0]
325
+ dst_h = output_size[1]
326
+
327
+ rot_rad = np.deg2rad(rot)
328
+ src_dir = _rotate_point(np.array([0.0, src_w * -0.5]), rot_rad)
329
+ dst_dir = np.array([0.0, dst_w * -0.5])
330
+
331
+ src = np.zeros((3, 2), dtype=np.float32)
332
+ src[0, :] = center + scale * shift
333
+ src[1, :] = center + src_dir + scale * shift
334
+ src[2, :] = _get_3rd_point(src[0, :], src[1, :])
335
+
336
+ dst = np.zeros((3, 2), dtype=np.float32)
337
+ dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
338
+ dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
339
+ dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
340
+
341
+ if inv:
342
+ warp_mat = cv2.getAffineTransform(np.float32(dst), np.float32(src))
343
+ else:
344
+ warp_mat = cv2.getAffineTransform(np.float32(src), np.float32(dst))
345
+ return warp_mat
346
+
347
+
348
+ def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray:
349
+ """Rotate a point by an angle.
350
+
351
+ Args:
352
+ pt (np.ndarray): 2D point coordinates (x, y) in shape (2, )
353
+ angle_rad (float): rotation angle in radian
354
+
355
+ Returns:
356
+ np.ndarray: Rotated point in shape (2, )
357
+ """
358
+
359
+ sn, cs = np.sin(angle_rad), np.cos(angle_rad)
360
+ rot_mat = np.array([[cs, -sn], [sn, cs]])
361
+ return rot_mat @ pt
362
+
363
+
364
+ def _get_3rd_point(a: np.ndarray, b: np.ndarray):
365
+ """To calculate the affine matrix, three pairs of points are required. This
366
+ function is used to get the 3rd point, given 2D points a & b.
367
+
368
+ The 3rd point is defined by rotating vector `a - b` by 90 degrees
369
+ anticlockwise, using b as the rotation center.
370
+
371
+ Args:
372
+ a (np.ndarray): The 1st point (x,y) in shape (2, )
373
+ b (np.ndarray): The 2nd point (x,y) in shape (2, )
374
+
375
+ Returns:
376
+ np.ndarray: The 3rd point.
377
+ """
378
+ direction = a - b
379
+ c = b + np.r_[-direction[1], direction[0]]
380
+ return c
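The center/scale conversions above are exact inverses for a matching padding factor; a small self-contained check with illustrative numbers:

import numpy as np

bbox_xyxy = np.array([[100.0, 50.0, 300.0, 450.0]])     # (x1, y1, x2, y2)
center, scale = bbox_xyxy2cs(bbox_xyxy, padding=1.25)   # scale is the padded (w, h)
restored = bbox_cs2xyxy(center, scale, padding=1.25)    # undoing the same padding recovers the box
assert np.allclose(restored, bbox_xyxy)

# fix_aspect_ratio only ever grows a side so that w/h hits the target ratio.
print(fix_aspect_ratio(np.array([200.0, 400.0]), aspect_ratio=1.0))  # -> [400. 400.]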
src/sam3d_body/data/transforms/common.py ADDED
@@ -0,0 +1,345 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from collections.abc import Callable, Sequence
4
+
5
+ import cv2
6
+ import numpy as np
7
+ import torch.nn as nn
8
+ import torchvision.transforms.functional as F
9
+ from PIL import Image
10
+
11
+ from sam3d_body.models.modules import to_2tuple
12
+
13
+ from .bbox_utils import (
14
+ bbox_xywh2cs,
15
+ bbox_xyxy2cs,
16
+ fix_aspect_ratio,
17
+ get_udp_warp_matrix,
18
+ get_warp_matrix,
19
+ )
20
+
21
+
22
+ class Compose:
23
+ """Compose multiple transforms sequentially.
24
+
25
+ Args:
26
+ transforms (Sequence[dict, callable], optional): Sequence of transform
27
+ object or config dict to be composed.
28
+ """
29
+
30
+ def __init__(self, transforms: list[Callable] | None = None):
31
+ if transforms is None:
32
+ self.transforms = []
33
+ else:
34
+ self.transforms = transforms
35
+
36
+ def __call__(self, data: dict) -> dict | None:
37
+ """Call function to apply transforms sequentially.
38
+
39
+ Args:
40
+ data (dict): A result dict contains the data to transform.
41
+
42
+ Returns:
43
+ dict: Transformed data.
44
+ """
45
+ for t in self.transforms:
46
+ data = t(data)
47
+ # The transform will return None when it failed to load images or
48
+ # cannot find suitable augmentation parameters to augment the data.
49
+ # Here we simply return None if the transform returns None and the
50
+ # dataset will handle it by randomly selecting another data sample.
51
+ if data is None:
52
+ return None
53
+ return data
54
+
55
+ def __repr__(self):
56
+ """Print ``self.transforms`` in sequence.
57
+
58
+ Returns:
59
+ str: Formatted string.
60
+ """
61
+ format_string = self.__class__.__name__ + "("
62
+ for t in self.transforms:
63
+ format_string += "\n"
64
+ format_string += f" {t}"
65
+ format_string += "\n)"
66
+ return format_string
67
+
68
+
69
+ class VisionTransformWrapper:
70
+ """A wrapper to use torchvision transform functions in this codebase."""
71
+
72
+ def __init__(self, transform: Callable):
73
+ self.transform = transform
74
+
75
+ def __call__(self, results: dict) -> dict | None:
76
+ results["img"] = self.transform(results["img"])
77
+ return results
78
+
79
+ def __repr__(self) -> str:
80
+ """print the basic information of the transform.
81
+
82
+ Returns:
83
+ str: Formatted string.
84
+ """
85
+ repr_str = self.transform.__class__.__name__
86
+ return repr_str
87
+
88
+
89
+ class GetBBoxCenterScale(nn.Module):
90
+ """Convert bboxes to center and scale.
91
+
92
+ The center is the coordinates of the bbox center, and the scale is the
93
+ bbox width and height normalized by a scale factor.
94
+
95
+ Required Keys:
96
+
97
+ - bbox
98
+ - bbox_format
99
+
100
+ Added Keys:
101
+
102
+ - bbox_center
103
+ - bbox_scale
104
+
105
+ Args:
106
+ padding (float): The bbox padding scale that will be multiplied with
107
+ `bbox_scale`. Defaults to 1.25
108
+ """
109
+
110
+ def __init__(self, padding: float = 1.25) -> None:
111
+ super().__init__()
112
+
113
+ self.padding = padding
114
+
115
+ def forward(self, results: dict) -> dict | None:
116
+ """The transform function of :class:`GetBBoxCenterScale`.
117
+
118
+ Args:
119
+ results (dict): The result dict
120
+
121
+ Returns:
122
+ dict: The result dict.
123
+ """
124
+ if "bbox_center" in results and "bbox_scale" in results:
125
+ results["bbox_scale"] *= self.padding
126
+ else:
127
+ bbox = results["bbox"]
128
+ bbox_format = results.get("bbox_format", "none")
129
+ if bbox_format == "xywh":
130
+ center, scale = bbox_xywh2cs(bbox, padding=self.padding)
131
+ elif bbox_format == "xyxy":
132
+ center, scale = bbox_xyxy2cs(bbox, padding=self.padding)
133
+ else:
134
+ raise ValueError("Invalid bbox format: {}".format(results["bbox_format"]))
135
+
136
+ results["bbox_center"] = center
137
+ results["bbox_scale"] = scale
138
+ return results
139
+
140
+ def __repr__(self) -> str:
141
+ """print the basic information of the transform.
142
+
143
+ Returns:
144
+ str: Formatted string.
145
+ """
146
+ repr_str = self.__class__.__name__ + f"(padding={self.padding})"
147
+ return repr_str
148
+
149
+
150
+ class SquarePad:
151
+ def __call__(self, results: dict) -> dict | None:
152
+ assert isinstance(results["img"], Image.Image)
153
+ w, h = results["img"].size
154
+
155
+ max_wh = np.max([w, h])
156
+ hp = int((max_wh - w) / 2)
157
+ vp = int((max_wh - h) / 2)
158
+ padding = (hp, vp, max_wh - w - hp, max_wh - h - vp)
159
+
160
+ results["img"] = F.pad(results["img"], padding, 0, "constant")
161
+ return results
162
+
163
+ def __repr__(self) -> str:
164
+ """print the basic information of the transform.
165
+
166
+ Returns:
167
+ str: Formatted string.
168
+ """
169
+ repr_str = self.__class__.__name__
170
+ return repr_str
171
+
172
+
173
+ class ToPIL:
174
+ def __call__(self, results: dict) -> dict | None:
175
+ if isinstance(results["img"], list):
176
+ if isinstance(results["img"][0], np.ndarray):
177
+ results["img"] = [Image.fromarray(img) for img in results["img"]]
178
+ elif isinstance(results["img"], np.ndarray):
179
+ results["img"] = Image.fromarray(results["img"])
180
+
181
+
182
+ class ToCv2:
183
+ def __call__(self, results: dict) -> dict | None:
184
+ if isinstance(results["img"], list):
185
+ if isinstance(results["img"][0], Image.Image):
186
+ results["img"] = [np.array(img) for img in results["img"]]
187
+ elif isinstance(results["img"], Image.Image):
188
+ results["img"] = np.array(results["img"])
189
+
190
+
191
+ class TopdownAffine(nn.Module):
192
+ """Get the bbox image as the model input by affine transform.
193
+
194
+ Required Keys:
195
+ - img
196
+ - bbox_center
197
+ - bbox_scale
198
+ - bbox_rotation (optional)
199
+ - keypoints_2d (optional)
200
+ - mask (optional)
201
+
202
+ Modified Keys:
203
+ - img
204
+ - bbox_scale
205
+
206
+ Added Keys:
207
+ - input_size
208
+ - transformed_keypoints
209
+
210
+ Args:
211
+ input_size (Tuple[int, int]): The input image size of the model in
212
+ [w, h]. The bbox region will be cropped and resize to `input_size`
213
+ use_udp (bool): Whether use unbiased data processing. See
214
+ `UDP (CVPR 2020)`_ for details. Defaults to ``False``
215
+ aspect_ratio (float): both HMR2.0 and Sapiens will expand input bbox to
216
+ a fixed ratio (width/height = 192/256), then expand to the ratio of
217
+ the model input size. E.g., HMR2.0 will eventually expand to 1:1, while
218
+ Sapiens will be 768:1024.
219
+
220
+ .. _`UDP (CVPR 2020)`: https://arxiv.org/abs/1911.07524
221
+ """
222
+
223
+ def __init__(
224
+ self,
225
+ input_size: int | tuple[int, int] | Sequence[int],
226
+ use_udp: bool = False,
227
+ aspect_ratio: float = 0.75,
228
+ fix_square: bool = False,
229
+ ) -> None:
230
+ super().__init__()
231
+
232
+ self.input_size = to_2tuple(input_size)
233
+ self.use_udp = use_udp
234
+ self.aspect_ratio = aspect_ratio
235
+ self.fix_square = fix_square
236
+
237
+ def forward(self, results: dict) -> dict | None:
238
+ """The transform function of :class:`TopdownAffine`.
239
+
240
+ See ``transform()`` method of :class:`BaseTransform` for details.
241
+
242
+ Args:
243
+ results (dict): The result dict
244
+
245
+ Returns:
246
+ dict: The result dict.
247
+ """
248
+ # # Debug only
249
+ # import copy
250
+ # results['ori_img'] = np.zeros((2000, 2000, 3), dtype=np.uint8)
251
+ # results['ori_img'][:results['img'].shape[0], :results['img'].shape[1]] = copy.deepcopy(results['img'])
252
+
253
+ w, h = self.input_size
254
+ warp_size = (int(w), int(h))
255
+
256
+ # expand bbox to fixed aspect ratio
257
+ results["orig_bbox_scale"] = results["bbox_scale"].copy()
258
+ if self.fix_square and results["bbox_scale"][0] == results["bbox_scale"][1]:
259
+ # In HMR2.0 etc., skip the prior aspect-ratio expansion for an already-square bbox
260
+ bbox_scale = fix_aspect_ratio(results["bbox_scale"], aspect_ratio=w / h)
261
+ else:
262
+ # first to a prior aspect ratio, then reshape to model input size
263
+ bbox_scale = fix_aspect_ratio(results["bbox_scale"], aspect_ratio=self.aspect_ratio)
264
+ results["bbox_scale"] = fix_aspect_ratio(bbox_scale, aspect_ratio=w / h)
265
+ results["bbox_expand_factor"] = results["bbox_scale"].max() / results["orig_bbox_scale"].max()
266
+ rot = 0.0
267
+ if results["bbox_center"].ndim == 2:
268
+ assert results["bbox_center"].shape[0] == 1, (
269
+ "Only support cropping one instance at a time. Got invalid "
270
+ f"shape of bbox_center {results['bbox_center'].shape}."
271
+ )
272
+ center = results["bbox_center"][0]
273
+ scale = results["bbox_scale"][0]
274
+ if "bbox_rotation" in results:
275
+ rot = results["bbox_rotation"][0]
276
+ else:
277
+ center = results["bbox_center"]
278
+ scale = results["bbox_scale"]
279
+ if "bbox_rotation" in results:
280
+ rot = results["bbox_rotation"]
281
+
282
+ if self.use_udp:
283
+ warp_mat = get_udp_warp_matrix(center, scale, rot, output_size=(w, h))
284
+ else:
285
+ warp_mat = get_warp_matrix(center, scale, rot, output_size=(w, h))
286
+
287
+ if "img" not in results:
288
+ pass
289
+ elif isinstance(results["img"], list):
290
+ results["img"] = [
291
+ cv2.warpAffine(img, warp_mat, warp_size, flags=cv2.INTER_LINEAR) for img in results["img"]
292
+ ]
293
+ height, width = results["img"][0].shape[:2]
294
+ results["ori_img_size"] = np.array([width, height])
295
+ else:
296
+ height, width = results["img"].shape[:2]
297
+ results["ori_img_size"] = np.array([width, height])
298
+ results["img"] = cv2.warpAffine(results["img"], warp_mat, warp_size, flags=cv2.INTER_LINEAR)
299
+
300
+ if results.get("keypoints_2d") is not None:
301
+ results["orig_keypoints_2d"] = results["keypoints_2d"].copy()
302
+ transformed_keypoints = results["keypoints_2d"].copy()
303
+ # Only transform (x, y) coordinates
304
+ # cv2 expect the input to be [[[x1, y1], [x2, y2]]]
305
+ transformed_keypoints[:, :2] = cv2.transform(results["keypoints_2d"][None, :, :2], warp_mat)[0]
306
+ results["keypoints_2d"] = transformed_keypoints
307
+
308
+ if results.get("mask") is not None:
309
+ results["mask"] = cv2.warpAffine(results["mask"], warp_mat, warp_size, flags=cv2.INTER_LINEAR)
310
+
311
+ results["img_size"] = np.array([w, h])
312
+ results["input_size"] = np.array([w, h])
313
+ results["affine_trans"] = warp_mat
314
+ return results
315
+
316
+ def __repr__(self) -> str:
317
+ """print the basic information of the transform.
318
+
319
+ Returns:
320
+ str: Formatted string.
321
+ """
322
+ repr_str = self.__class__.__name__
323
+ repr_str += f"(input_size={self.input_size}, "
324
+ repr_str += f"use_udp={self.use_udp})"
325
+ return repr_str
326
+
327
+
328
+ class NormalizeKeypoint(nn.Module):
329
+ """
330
+ Normalize 2D keypoints to range [-0.5, 0.5].
331
+
332
+ Required Keys:
333
+ - keypoints_2d
334
+ - img_size
335
+
336
+ Modified Keys:
337
+ - keypoints_2d
338
+ """
339
+
340
+ def forward(self, results: dict) -> dict | None:
341
+ if "keypoints_2d" in results:
342
+ img_size = results.get("img_size", results["input_size"])
343
+
344
+ results["keypoints_2d"][:, :2] = results["keypoints_2d"][:, :2] / np.array(img_size).reshape(1, 2) - 0.5
345
+ return results
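A sketch of how these transforms are typically chained for a top-down crop; the 192x256 input size, padding, and ToTensor step are illustrative choices, not the model's confirmed configuration:

import numpy as np
import torchvision.transforms as T

pipeline = Compose([
    GetBBoxCenterScale(padding=1.25),                 # bbox -> (center, scale) with padding
    TopdownAffine(input_size=(192, 256)),             # warp the padded box to the model input size
    VisionTransformWrapper(T.ToTensor()),             # torchvision transform applied to results["img"]
])

sample = {
    "img": np.zeros((480, 640, 3), dtype=np.uint8),   # dummy RGB frame
    "bbox": np.array([100.0, 50.0, 300.0, 450.0]),    # one person box
    "bbox_format": "xyxy",
}
out = pipeline(sample)
print(out["img"].shape, out["affine_trans"].shape)    # cropped (3, 256, 192) tensor and the 2x3 warp matrix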
src/sam3d_body/data/utils/io.py ADDED
@@ -0,0 +1,114 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import os
4
+ import time
5
+ from typing import Any
6
+
7
+ import braceexpand
8
+ import cv2
9
+ import numpy as np
10
+
11
+ from PIL import Image
12
+
13
+
14
+ def expand(s):
15
+ return os.path.expanduser(os.path.expandvars(s))
16
+
17
+
18
+ def expand_urls(urls: str | list[str]):
19
+ if isinstance(urls, str):
20
+ urls = [urls]
21
+ urls = [u for url in urls for u in braceexpand.braceexpand(expand(url))]
22
+ return urls
23
+
24
+
25
+ def load_image_from_file(
26
+ data_info: dict,
27
+ backend: str = "cv2",
28
+ image_format: str = "rgb",
29
+ retry: int = 10,
30
+ ) -> dict:
31
+ img = load_image(data_info["img_path"], backend, image_format, retry)
32
+ data_info["img"] = img
33
+ data_info["img_shape"] = img.shape[:2]
34
+ data_info["ori_shape"] = img.shape[:2]
35
+ return data_info
36
+
37
+
38
+ def _pil_load(path: str, image_format: str) -> Image.Image:
39
+ with Image.open(path) as img:
40
+ if img is not None and image_format.lower() == "rgb":
41
+ img = img.convert("RGB")
42
+ return img
43
+
44
+
45
+ def _cv2_load(path: str, image_format: str) -> np.ndarray:
46
+ img = cv2.imread(path)
47
+ if img is not None and image_format.lower() == "rgb":
48
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
49
+ return img
50
+
51
+
52
+ def load_image(
53
+ path: str,
54
+ backend: str = "pil",
55
+ image_format: str = "rgb",
56
+ retry: int = 10,
57
+ ) -> Any:
58
+ for i_try in range(retry):
59
+ if backend == "pil":
60
+ img = _pil_load(path, image_format)
61
+ elif backend == "cv2":
62
+ img = _cv2_load(path, image_format)
63
+ else:
64
+ raise ValueError("Invalid backend {} for loading image.".format(backend))
65
+
66
+ if img is not None:
67
+ return img
68
+ else:
69
+ print("Reading {} failed. Will retry.".format(path))
70
+ time.sleep(1.0)
71
+ if i_try == retry - 1:
72
+ raise Exception("Failed to load image {}".format(path))
73
+
74
+
75
+ def resize_image(img, target_size, center=None, scale=None):
76
+ height, width = img.shape[:2]
77
+ aspect_ratio = width / height
78
+
79
+ # Calculate the new size while maintaining the aspect ratio
80
+ if aspect_ratio > 1:
81
+ new_width = target_size
82
+ new_height = int(target_size / aspect_ratio)
83
+ else:
84
+ new_width = int(target_size * aspect_ratio)
85
+ new_height = target_size
86
+
87
+ # Resize the image using OpenCV
88
+ resized_img = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_AREA)
89
+
90
+ # Create a new blank image with the target size
91
+ final_img = np.ones((target_size, target_size, 3), dtype=np.uint8) * 255
92
+
93
+ # Paste the resized image onto the blank image, centering it
94
+ start_x = (target_size - new_width) // 2
95
+ start_y = (target_size - new_height) // 2
96
+ final_img[start_y : start_y + new_height, start_x : start_x + new_width] = (
97
+ resized_img
98
+ )
99
+
100
+ if center is not None and scale is not None:
101
+ ratio_width = new_width / width
102
+ ratio_height = new_height / height
103
+
104
+ new_scale = np.stack(
105
+ [scale[:, 0] * ratio_width, scale[:, 1] * ratio_height], axis=1
106
+ )
107
+ new_center = np.stack(
108
+ [center[:, 0] * ratio_width, center[:, 1] * ratio_height], axis=1
109
+ )
110
+ new_center[:, 0] += start_x
111
+ new_center[:, 1] += start_y
112
+ else:
113
+ new_center, new_scale = None, None
114
+ return aspect_ratio, final_img, new_center, new_scale
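A quick illustrative call, assuming it is run from the repository root so the bundled example image resolves:

import numpy as np

img = load_image("data/example-data/Planche.jpg", backend="cv2", image_format="rgb")
center = np.array([[320.0, 240.0]])                    # one (x, y) box center
scale = np.array([[200.0, 400.0]])                     # its (w, h)
_, padded, new_center, new_scale = resize_image(img, target_size=1024, center=center, scale=scale)
print(padded.shape)                                    # (1024, 1024, 3): image letterboxed onto a white square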
src/sam3d_body/data/utils/prepare_batch.py ADDED
@@ -0,0 +1,99 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from collections.abc import Callable
4
+ from typing import Any, TypedDict, cast
5
+
6
+ import numpy as np
7
+ import torch
8
+ from jaxtyping import Float, UInt8
9
+ from numpy import ndarray
10
+ from torch import Tensor
11
+ from torch.utils.data import default_collate
12
+
13
+
14
+ class PreparedBatchDict(TypedDict, total=False):
15
+ img: Float[Tensor, "B N 3 H W"]
16
+ img_size: Float[Tensor, "B N 2"]
17
+ ori_img_size: Float[Tensor, "B N 2"]
18
+ bbox_center: Float[Tensor, "B N 2"]
19
+ bbox_scale: Float[Tensor, "B N 2"]
20
+ bbox: Float[Tensor, "B N 4"]
21
+ affine_trans: Float[Tensor, "B N 2 3"]
22
+ mask: Float[Tensor, "B N 1 H W"]
23
+ mask_score: Float[Tensor, "B N"]
24
+ cam_int: Float[Tensor, "B 3 3"]
25
+ person_valid: Float[Tensor, "B N"]
26
+ img_ori: list["NoCollate"]
27
+
28
+
29
+ class NoCollate:
30
+ def __init__(self, data: Any) -> None:
31
+ self.data: Any = data
32
+
33
+
34
+ def prepare_batch(
35
+ img: UInt8[ndarray, "h w 3"],
36
+ transform: Callable[[dict[str, Any]], dict[str, Any]],
37
+ boxes: Float[ndarray, "n 4"],
38
+ masks: Float[ndarray, "n h w"] | None = None,
39
+ masks_score: Float[ndarray, "n"] | None = None,
40
+ cam_int: Float[Tensor, "B 3 3"] | None = None,
41
+ ) -> PreparedBatchDict:
42
+ """A helper function to prepare data batch for SAM 3D Body model inference."""
43
+ height, width = img.shape[:2]
44
+
45
+ # construct batch data samples
46
+ data_list: list[dict[str, Any]] = []
47
+ for idx in range(boxes.shape[0]):
48
+ data_info: dict[str, Any] = dict(img=img)
49
+ data_info["bbox"] = boxes[idx] # shape (4,)
50
+ data_info["bbox_format"] = "xyxy"
51
+
52
+ if masks is not None:
53
+ data_info["mask"] = masks[idx].astype(np.float32, copy=False)
54
+ if masks_score is not None:
55
+ data_info["mask_score"] = masks_score[idx]
56
+ else:
57
+ data_info["mask_score"] = np.array(1.0, dtype=np.float32)
58
+ else:
59
+ data_info["mask"] = np.zeros((height, width, 1), dtype=np.uint8)
60
+ data_info["mask_score"] = np.array(0.0, dtype=np.float32)
61
+
62
+ data_list.append(transform(data_info))
63
+
64
+ batch = default_collate(data_list)
65
+
66
+ max_num_person = batch["img"].shape[0]
67
+ for key in [
68
+ "img",
69
+ "img_size",
70
+ "ori_img_size",
71
+ "bbox_center",
72
+ "bbox_scale",
73
+ "bbox",
74
+ "affine_trans",
75
+ "mask",
76
+ "mask_score",
77
+ ]:
78
+ if key in batch:
79
+ batch[key] = batch[key].unsqueeze(0).float()
80
+ if "mask" in batch:
81
+ batch["mask"] = batch["mask"].unsqueeze(2)
82
+ batch["person_valid"] = torch.ones((1, max_num_person))
83
+
84
+ if cam_int is not None:
85
+ batch["cam_int"] = cam_int.to(batch["img"])
86
+ else:
87
+ # Default camera intrinsics according image size
88
+ batch["cam_int"] = torch.tensor(
89
+ [
90
+ [
91
+ [(height**2 + width**2) ** 0.5, 0, width / 2.0],
92
+ [0, (height**2 + width**2) ** 0.5, height / 2.0],
93
+ [0, 0, 1],
94
+ ]
95
+ ],
96
+ ).to(batch["img"])
97
+
98
+ batch["img_ori"] = [NoCollate(img)]
99
+ return cast(PreparedBatchDict, batch)
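A sketch of feeding detector boxes through prepare_batch; the transform pipeline mirrors the illustrative one above and is not the estimator's confirmed configuration:

import numpy as np
import torchvision.transforms as T
from sam3d_body.data.transforms import Compose, GetBBoxCenterScale, TopdownAffine, VisionTransformWrapper

transform = Compose([
    GetBBoxCenterScale(padding=1.25),
    TopdownAffine(input_size=(192, 256)),
    VisionTransformWrapper(T.ToTensor()),
])
rgb = np.zeros((480, 640, 3), dtype=np.uint8)                       # dummy image
boxes = np.array([[100.0, 50.0, 300.0, 450.0]], dtype=np.float32)   # one detection in xyxy
batch = prepare_batch(img=rgb, transform=transform, boxes=boxes)
print(batch["img"].shape)     # (1, n_person, 3, 256, 192)
print(batch["cam_int"][0])    # default f = sqrt(H^2 + W^2) = 800 here, principal point at image center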
src/sam3d_body/gradio_ui/sam3d_body_ui.py ADDED
@@ -0,0 +1,164 @@
1
+ """
2
+ Gradio UI for the SAM 3D Body demo, streaming results to an embedded Rerun viewer.
3
+
4
+ Runs pose prediction on an uploaded image, logs 2D/3D keypoints, meshes, segmentation overlays
5
+ and optional relative depth to Rerun, and can export per-person GLB meshes.
6
+ """
7
+
8
+ import os
9
+ import shutil
10
+ import tempfile
11
+ from pathlib import Path
12
+ from typing import Final
13
+
14
+ import cv2
15
+ import gradio as gr
16
+ import rerun as rr
17
+ import rerun.blueprint as rrb
18
+ import spaces
19
+ from gradio_rerun import Rerun
20
+ from jaxtyping import Int, UInt8
21
+ from monopriors.relative_depth_models import RelativeDepthPrediction
22
+ from numpy import ndarray
23
+
24
+ from sam3d_body.api.demo import SAM3Config, SAM3DBodyE2E, SAM3DBodyE2EConfig, create_view, set_annotation_context
25
+ from sam3d_body.api.visualization import export_meshes_to_glb, visualize_sample
26
+ from sam3d_body.sam_3d_body_estimator import FinalPosePrediction
27
+
28
+ CFG: SAM3DBodyE2EConfig = SAM3DBodyE2EConfig(sam3_config=SAM3Config())
29
+ MODEL_E2E: SAM3DBodyE2E = SAM3DBodyE2E(config=CFG)
30
+ mesh_faces: Int[ndarray, "n_faces=36874 3"] = MODEL_E2E.sam3d_body_estimator.faces
31
+ STATE: Final[str] = "✅ Ready"
32
+ # Absolute path to bundled example data used by Gradio examples.
33
+ TEST_INPUT_DIR: Final[Path] = Path(__file__).resolve().parents[3] / "data" / "example-data"
34
+
35
+ # Allow Gradio to serve and cache files from the bundled test data directory.
36
+ gr.set_static_paths([str(TEST_INPUT_DIR)])
37
+
38
+
39
+ @spaces.GPU()
40
+ @rr.thread_local_stream("sam3d_body_gradio_ui")
41
+ def sam3d_prediction_fn(
42
+ rgb_hw3,
43
+ log_relative_depth,
44
+ export_glb,
45
+ center_glb,
46
+ pending_cleanup=None,
47
+ ) -> tuple[str, str, list[str]]:
48
+ # resize rgb so that its largest dimension is 1024
49
+ rgb_hw3: UInt8[ndarray, "h w 3"] = cv2.resize(
50
+ rgb_hw3, # type: ignore[arg-type]
51
+ dsize=(0, 0),
52
+ fx=1024 / max(rgb_hw3.shape[0], rgb_hw3.shape[1]),
53
+ fy=1024 / max(rgb_hw3.shape[0], rgb_hw3.shape[1]),
54
+ interpolation=cv2.INTER_AREA,
55
+ )
56
+ # We eventually want to clean up the RRD file after it's sent to the viewer, so tracking
57
+ # any pending files to be cleaned up when the state is deleted.
58
+ temp = tempfile.NamedTemporaryFile(prefix="cube_", suffix=".rrd", delete=False)
59
+
60
+ if pending_cleanup is not None:
61
+ pending_cleanup.append(temp.name)
62
+
63
+ view: rrb.ContainerLike = create_view()
64
+ blueprint = rrb.Blueprint(view, collapse_panels=True)
65
+ rr.save(path=temp.name, default_blueprint=blueprint)
66
+ set_annotation_context()
67
+ parent_log_path = Path("/world")
68
+ rr.log("/", rr.ViewCoordinates.RDF, static=True)
69
+
70
+ outputs: tuple[list[FinalPosePrediction], RelativeDepthPrediction] = MODEL_E2E.predict_single_image(rgb_hw3=rgb_hw3)
71
+ pred_list: list[FinalPosePrediction] = outputs[0]
72
+ relative_pred: RelativeDepthPrediction = outputs[1]
73
+ rr.set_time(timeline="image_sequence", sequence=0)
74
+ visualize_sample(
75
+ pred_list=pred_list,
76
+ rgb_hw3=rgb_hw3,
77
+ parent_log_path=parent_log_path,
78
+ faces=mesh_faces,
79
+ relative_depth_pred=relative_pred if log_relative_depth else None,
80
+ )
81
+
82
+ glb_files: list[str] = []
83
+ if export_glb and len(pred_list) > 0:
84
+ glb_dir: Path = Path(tempfile.mkdtemp(prefix="sam3d_glb_"))
85
+ glb_paths = export_meshes_to_glb(
86
+ pred_list=pred_list,
87
+ faces=mesh_faces,
88
+ output_dir=glb_dir,
89
+ center_mesh=center_glb,
90
+ )
91
+ glb_files = [str(p) for p in glb_paths]
92
+ if pending_cleanup is not None:
93
+ pending_cleanup.extend(glb_files)
94
+ pending_cleanup.append(str(glb_dir))
95
+
96
+ return temp.name, STATE, glb_files
97
+
98
+
99
+ def cleanup_rrds(pending_cleanup: list[str]) -> None:
100
+ for f in pending_cleanup:
101
+ if os.path.isdir(f):
102
+ shutil.rmtree(f, ignore_errors=True)
103
+ elif os.path.isfile(f):
104
+ os.unlink(f)
105
+
106
+
107
+ def _switch_to_outputs() -> gr.Tabs:
108
+ return gr.update(selected="outputs")
109
+
110
+
111
+ def main():
112
+ viewer = Rerun(
113
+ streaming=True,
114
+ panel_states={
115
+ "time": "collapsed",
116
+ "blueprint": "hidden",
117
+ "selection": "hidden",
118
+ },
119
+ height=800,
120
+ )
121
+
122
+ with gr.Blocks() as demo, gr.Tab("SAM3D Body Estimation"):
123
+ pending_cleanup = gr.State([], time_to_live=10, delete_callback=cleanup_rrds)
124
+ with gr.Row():
125
+ with gr.Column(scale=1):
126
+ tabs = gr.Tabs(selected="inputs")
127
+ with tabs:
128
+ with gr.TabItem("Inputs", id="inputs"):
129
+ img = gr.Image(interactive=True, label="Image", type="numpy", image_mode="RGB")
130
+ depth_checkbox = gr.Checkbox(label="Log relative depth", value=False)
131
+ with gr.Row():
132
+ export_checkbox = gr.Checkbox(label="Export GLB meshes", value=False)
133
+ center_checkbox = gr.Checkbox(label="Center GLB at origin", value=True)
134
+ create_rrd = gr.Button("Predict Pose")
135
+ with gr.TabItem("Outputs", id="outputs"):
136
+ status = gr.Text(STATE, label="Status")
137
+ mesh_files = gr.Files(label="GLB meshes", file_count="multiple")
138
+ gr.Examples(
139
+ examples=[
140
+ [str(TEST_INPUT_DIR / "Planche.jpg"), True, False, True],
141
+ [str(TEST_INPUT_DIR / "Amir-Khan-Lamont-Peterson_2689582.jpg"), False, False, True],
142
+ [str(TEST_INPUT_DIR / "BNAAHPYGMYSE26U6C6T7VA6544.jpg"), False, True, True],
143
+ [str(TEST_INPUT_DIR / "yoga-example.jpg"), True, True, False],
144
+ ],
145
+ inputs=[img, depth_checkbox, export_checkbox, center_checkbox],
146
+ outputs=[viewer, status, mesh_files],
147
+ fn=sam3d_prediction_fn,
148
+ run_on_click=True,
149
+ cache_examples=False,
150
+ examples_per_page=2,
151
+ )
152
+ with gr.Column(scale=5):
153
+ viewer.render()
154
+
155
+ create_rrd.click(
156
+ fn=_switch_to_outputs,
157
+ inputs=None,
158
+ outputs=[tabs],
159
+ ).then(
160
+ sam3d_prediction_fn,
161
+ inputs=[img, depth_checkbox, export_checkbox, center_checkbox, pending_cleanup],
162
+ outputs=[viewer, status, mesh_files],
163
+ )
164
+ return demo
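To serve the UI locally (a sketch; the repo's app.py presumably does something equivalent):

if __name__ == "__main__":
    demo = main()
    demo.queue().launch()   # queueing is optional but plays nicely with the GPU-decorated handler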
src/sam3d_body/metadata/__init__.py ADDED
@@ -0,0 +1,79 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ OPENPOSE_TO_COCO = [0, 16, 15, 18, 17, 5, 2, 6, 3, 7, 4, 12, 9, 13, 10, 14, 11]
4
+
5
+ # Mapping the J19 used in HMR2.0 to the 14 common points for evaluation
6
+ # J19 is defined as the first 19 keypoints in https://github.com/nkolot/SPIN/blob/master/constants.py#L42
7
+ # The first 14 keypoints in J19 are LSP keypoints
8
+ J19_TO_J14 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18]
9
+
10
+ # Mapping from 14 LSP keypoints to 17 COCO keypoints
11
+ # Key: coco_idx, value: lsp_idx
12
+ LSP_TO_COCO = {
13
+ 5: 9,
14
+ 6: 8,
15
+ 7: 10,
16
+ 8: 7,
17
+ 9: 11,
18
+ 10: 6,
19
+ 11: 3,
20
+ 12: 2,
21
+ 13: 4,
22
+ 14: 1,
23
+ 15: 5,
24
+ 16: 0,
25
+ }
26
+
27
+ # fmt: off
28
+ OPENPOSE_PERMUTATION = [0, 1, 5, 6, 7, 2, 3, 4, 8, 12, 13, 14, 9, 10, 11, 16, 15, 18, 17, 22, 23, 24, 19, 20, 21]
29
+ J19_PERMUTATION = [5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13, 14, 15, 16, 17, 18]
30
+ COCO_PERMUTATION = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
31
+ # fmt: on
32
+
33
+ # Mapping the 70 MHR keypoints to OpenPose (COCO included)
34
+ # key: OpenPose, value: mhr_idx
35
+ MHR70_TO_OPENPOSE = {
36
+ 0: 0,
37
+ 1: 69,
38
+ 2: 6,
39
+ 3: 8,
40
+ 4: 41,
41
+ 5: 5,
42
+ 6: 7,
43
+ 7: 62,
44
+ 9: 10,
45
+ 10: 12,
46
+ 11: 14,
47
+ 12: 9,
48
+ 13: 11,
49
+ 14: 13,
50
+ 15: 2,
51
+ 16: 1,
52
+ 17: 4,
53
+ 18: 3,
54
+ 19: 15,
55
+ 20: 16,
56
+ 21: 17,
57
+ 22: 18,
58
+ 23: 19,
59
+ 24: 20,
60
+ }
61
+
62
+ # fmt: off
63
+ MHR70_PERMUTATION = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 18, 19, 20, 15, 16, 17, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 64, 63, 66, 65, 68, 67, 69]
64
+ # fmt: on
65
+ MHR70_TO_LSP = {
66
+ 0: 14,
67
+ 1: 12,
68
+ 2: 10,
69
+ 3: 9,
70
+ 4: 11,
71
+ 5: 13,
72
+ 6: 41,
73
+ 7: 8,
74
+ 8: 6,
75
+ 9: 5,
76
+ 10: 7,
77
+ 11: 62,
78
+ 12: 69,
79
+ }
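These tables are plain index maps; a small sketch of scattering MHR-70 keypoints into an OpenPose-25 array and applying the left/right permutation (the confidence handling is illustrative):

import numpy as np

mhr70 = np.random.rand(70, 3).astype(np.float32)     # (x, y, confidence) per MHR-70 keypoint
openpose25 = np.zeros((25, 3), dtype=np.float32)     # joints without a source stay at confidence 0
for op_idx, mhr_idx in MHR70_TO_OPENPOSE.items():
    openpose25[op_idx] = mhr70[mhr_idx]

# Index permutation used when an image is mirrored (x coordinates still need flipping separately).
mhr70_swapped = mhr70[MHR70_PERMUTATION]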
src/sam3d_body/metadata/mhr70.py ADDED
@@ -0,0 +1,915 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ """The first 70 of 308 MHR keypoints, ignoring the rest for face keypoints"""
4
+
5
+ from typing import Final
6
+
7
+ mhr_names = [
8
+ "nose",
9
+ "left-eye",
10
+ "right-eye",
11
+ "left-ear",
12
+ "right-ear",
13
+ "left-shoulder",
14
+ "right-shoulder",
15
+ "left-elbow",
16
+ "right-elbow",
17
+ "left-hip",
18
+ "right-hip",
19
+ "left-knee",
20
+ "right-knee",
21
+ "left-ankle",
22
+ "right-ankle",
23
+ "left-big-toe-tip",
24
+ "left-small-toe-tip",
25
+ "left-heel",
26
+ "right-big-toe-tip",
27
+ "right-small-toe-tip",
28
+ "right-heel",
29
+ "right-thumb-tip",
30
+ "right-thumb-first-joint",
31
+ "right-thumb-second-joint",
32
+ "right-thumb-third-joint",
33
+ "right-index-tip",
34
+ "right-index-first-joint",
35
+ "right-index-second-joint",
36
+ "right-index-third-joint",
37
+ "right-middle-tip",
38
+ "right-middle-first-joint",
39
+ "right-middle-second-joint",
40
+ "right-middle-third-joint",
41
+ "right-ring-tip",
42
+ "right-ring-first-joint",
43
+ "right-ring-second-joint",
44
+ "right-ring-third-joint",
45
+ "right-pinky-tip",
46
+ "right-pinky-first-joint",
47
+ "right-pinky-second-joint",
48
+ "right-pinky-third-joint",
49
+ "right-wrist",
50
+ "left-thumb-tip",
51
+ "left-thumb-first-joint",
52
+ "left-thumb-second-joint",
53
+ "left-thumb-third-joint",
54
+ "left-index-tip",
55
+ "left-index-first-joint",
56
+ "left-index-second-joint",
57
+ "left-index-third-joint",
58
+ "left-middle-tip",
59
+ "left-middle-first-joint",
60
+ "left-middle-second-joint",
61
+ "left-middle-third-joint",
62
+ "left-ring-tip",
63
+ "left-ring-first-joint",
64
+ "left-ring-second-joint",
65
+ "left-ring-third-joint",
66
+ "left-pinky-tip",
67
+ "left-pinky-first-joint",
68
+ "left-pinky-second-joint",
69
+ "left-pinky-third-joint",
70
+ "left-wrist",
71
+ "left-olecranon",
72
+ "right-olecranon",
73
+ "left-cubital-fossa",
74
+ "right-cubital-fossa",
75
+ "left-acromion",
76
+ "right-acromion",
77
+ "neck",
78
+ ]
79
+
80
+ pose_info = dict(
81
+ pose_format="mhr70",
82
+ paper_info=dict(
83
+ author="",
84
+ year="",
85
+ homepage="",
86
+ ),
87
+ min_visible_keypoints=8,
88
+ image_height=4096,
89
+ image_width=2668,
90
+ original_keypoint_info={
91
+ 0: "nose",
92
+ 1: "left_eye",
93
+ 2: "right_eye",
94
+ 3: "left_ear",
95
+ 4: "right_ear",
96
+ 5: "left_shoulder",
97
+ 6: "right_shoulder",
98
+ 7: "left_elbow",
99
+ 8: "right_elbow",
100
+ 9: "left_hip",
101
+ 10: "right_hip",
102
+ 11: "left_knee",
103
+ 12: "right_knee",
104
+ 13: "left_ankle",
105
+ 14: "right_ankle",
106
+ 15: "left_big_toe_tip",
107
+ 16: "left_small_toe_tip",
108
+ 17: "left_heel",
109
+ 18: "right_big_toe_tip",
110
+ 19: "right_small_toe_tip",
111
+ 20: "right_heel",
112
+ 21: "right_thumb_tip",
113
+ 22: "right_thumb_first_joint",
114
+ 23: "right_thumb_second_joint",
115
+ 24: "right_thumb_third_joint",
116
+ 25: "right_index_tip",
117
+ 26: "right_index_first_joint",
118
+ 27: "right_index_second_joint",
119
+ 28: "right_index_third_joint",
120
+ 29: "right_middle_tip",
121
+ 30: "right_middle_first_joint",
122
+ 31: "right_middle_second_joint",
123
+ 32: "right_middle_third_joint",
124
+ 33: "right_ring_tip",
125
+ 34: "right_ring_first_joint",
126
+ 35: "right_ring_second_joint",
127
+ 36: "right_ring_third_joint",
128
+ 37: "right_pinky_tip",
129
+ 38: "right_pinky_first_joint",
130
+ 39: "right_pinky_second_joint",
131
+ 40: "right_pinky_third_joint",
132
+ 41: "right_wrist",
133
+ 42: "left_thumb_tip",
134
+ 43: "left_thumb_first_joint",
135
+ 44: "left_thumb_second_joint",
136
+ 45: "left_thumb_third_joint",
137
+ 46: "left_index_tip",
138
+ 47: "left_index_first_joint",
139
+ 48: "left_index_second_joint",
140
+ 49: "left_index_third_joint",
141
+ 50: "left_middle_tip",
142
+ 51: "left_middle_first_joint",
143
+ 52: "left_middle_second_joint",
144
+ 53: "left_middle_third_joint",
145
+ 54: "left_ring_tip",
146
+ 55: "left_ring_first_joint",
147
+ 56: "left_ring_second_joint",
148
+ 57: "left_ring_third_joint",
149
+ 58: "left_pinky_tip",
150
+ 59: "left_pinky_first_joint",
151
+ 60: "left_pinky_second_joint",
152
+ 61: "left_pinky_third_joint",
153
+ 62: "left_wrist",
154
+ 63: "left_olecranon",
155
+ 64: "right_olecranon",
156
+ 65: "left_cubital_fossa",
157
+ 66: "right_cubital_fossa",
158
+ 67: "left_acromion",
159
+ 68: "right_acromion",
160
+ 69: "neck",
161
+ },
162
+ keypoint_info={
163
+ 0: dict(name="nose", id=0, color=[51, 153, 255], type="upper", swap=""),
164
+ 1: dict(
165
+ name="left_eye", id=1, color=[51, 153, 255], type="upper", swap="right_eye"
166
+ ),
167
+ 2: dict(
168
+ name="right_eye", id=2, color=[51, 153, 255], type="upper", swap="left_eye"
169
+ ),
170
+ 3: dict(
171
+ name="left_ear", id=3, color=[51, 153, 255], type="upper", swap="right_ear"
172
+ ),
173
+ 4: dict(
174
+ name="right_ear", id=4, color=[51, 153, 255], type="upper", swap="left_ear"
175
+ ),
176
+ 5: dict(
177
+ name="left_shoulder",
178
+ id=5,
179
+ color=[51, 153, 255],
180
+ type="upper",
181
+ swap="right_shoulder",
182
+ ),
183
+ 6: dict(
184
+ name="right_shoulder",
185
+ id=6,
186
+ color=[51, 153, 255],
187
+ type="upper",
188
+ swap="left_shoulder",
189
+ ),
190
+ 7: dict(
191
+ name="left_elbow",
192
+ id=7,
193
+ color=[51, 153, 255],
194
+ type="upper",
195
+ swap="right_elbow",
196
+ ),
197
+ 8: dict(
198
+ name="right_elbow",
199
+ id=8,
200
+ color=[51, 153, 255],
201
+ type="upper",
202
+ swap="left_elbow",
203
+ ),
204
+ 9: dict(
205
+ name="left_hip", id=9, color=[51, 153, 255], type="lower", swap="right_hip"
206
+ ),
207
+ 10: dict(
208
+ name="right_hip", id=10, color=[51, 153, 255], type="lower", swap="left_hip"
209
+ ),
210
+ 11: dict(
211
+ name="left_knee",
212
+ id=11,
213
+ color=[51, 153, 255],
214
+ type="lower",
215
+ swap="right_knee",
216
+ ),
217
+ 12: dict(
218
+ name="right_knee",
219
+ id=12,
220
+ color=[51, 153, 255],
221
+ type="lower",
222
+ swap="left_knee",
223
+ ),
224
+ 13: dict(
225
+ name="left_ankle",
226
+ id=13,
227
+ color=[51, 153, 255],
228
+ type="lower",
229
+ swap="right_ankle",
230
+ ),
231
+ 14: dict(
232
+ name="right_ankle",
233
+ id=14,
234
+ color=[51, 153, 255],
235
+ type="lower",
236
+ swap="left_ankle",
237
+ ),
238
+ 15: dict(
239
+ name="left_big_toe",
240
+ id=15,
241
+ color=[51, 153, 255],
242
+ type="lower",
243
+ swap="right_big_toe",
244
+ ),
245
+ 16: dict(
246
+ name="left_small_toe",
247
+ id=16,
248
+ color=[51, 153, 255],
249
+ type="lower",
250
+ swap="right_small_toe",
251
+ ),
252
+ 17: dict(
253
+ name="left_heel",
254
+ id=17,
255
+ color=[51, 153, 255],
256
+ type="lower",
257
+ swap="right_heel",
258
+ ),
259
+ 18: dict(
260
+ name="right_big_toe",
261
+ id=18,
262
+ color=[51, 153, 255],
263
+ type="lower",
264
+ swap="left_big_toe",
265
+ ),
266
+ 19: dict(
267
+ name="right_small_toe",
268
+ id=19,
269
+ color=[51, 153, 255],
270
+ type="lower",
271
+ swap="left_small_toe",
272
+ ),
273
+ 20: dict(
274
+ name="right_heel",
275
+ id=20,
276
+ color=[51, 153, 255],
277
+ type="lower",
278
+ swap="left_heel",
279
+ ),
280
+ 21: dict(
281
+ name="right_thumb4",
282
+ id=21,
283
+ color=[51, 153, 255],
284
+ type="upper",
285
+ swap="left_thumb4",
286
+ ),
287
+ 22: dict(
288
+ name="right_thumb3",
289
+ id=22,
290
+ color=[51, 153, 255],
291
+ type="upper",
292
+ swap="left_thumb3",
293
+ ),
294
+ 23: dict(
295
+ name="right_thumb2",
296
+ id=23,
297
+ color=[51, 153, 255],
298
+ type="upper",
299
+ swap="left_thumb2",
300
+ ),
301
+ 24: dict(
302
+ name="right_thumb_third_joint",
303
+ id=24,
304
+ color=[51, 153, 255],
305
+ type="upper",
306
+ swap="left_thumb_third_joint",
307
+ ),
308
+ 25: dict(
309
+ name="right_forefinger4",
310
+ id=25,
311
+ color=[51, 153, 255],
312
+ type="upper",
313
+ swap="left_forefinger4",
314
+ ),
315
+ 26: dict(
316
+ name="right_forefinger3",
317
+ id=26,
318
+ color=[51, 153, 255],
319
+ type="upper",
320
+ swap="left_forefinger3",
321
+ ),
322
+ 27: dict(
323
+ name="right_forefinger2",
324
+ id=27,
325
+ color=[51, 153, 255],
326
+ type="upper",
327
+ swap="left_forefinger2",
328
+ ),
329
+ 28: dict(
330
+ name="right_forefinger_third_joint",
331
+ id=28,
332
+ color=[51, 153, 255],
333
+ type="upper",
334
+ swap="left_forefinger_third_joint",
335
+ ),
336
+ 29: dict(
337
+ name="right_middle_finger4",
338
+ id=29,
339
+ color=[51, 153, 255],
340
+ type="upper",
341
+ swap="left_middle_finger4",
342
+ ),
343
+ 30: dict(
344
+ name="right_middle_finger3",
345
+ id=30,
346
+ color=[51, 153, 255],
347
+ type="upper",
348
+ swap="left_middle_finger3",
349
+ ),
350
+ 31: dict(
351
+ name="right_middle_finger2",
352
+ id=31,
353
+ color=[51, 153, 255],
354
+ type="upper",
355
+ swap="left_middle_finger2",
356
+ ),
357
+ 32: dict(
358
+ name="right_middle_finger_third_joint",
359
+ id=32,
360
+ color=[51, 153, 255],
361
+ type="upper",
362
+ swap="left_middle_finger_third_joint",
363
+ ),
364
+ 33: dict(
365
+ name="right_ring_finger4",
366
+ id=33,
367
+ color=[51, 153, 255],
368
+ type="upper",
369
+ swap="left_ring_finger4",
370
+ ),
371
+ 34: dict(
372
+ name="right_ring_finger3",
373
+ id=34,
374
+ color=[51, 153, 255],
375
+ type="upper",
376
+ swap="left_ring_finger3",
377
+ ),
378
+ 35: dict(
379
+ name="right_ring_finger2",
380
+ id=35,
381
+ color=[51, 153, 255],
382
+ type="upper",
383
+ swap="left_ring_finger2",
384
+ ),
385
+ 36: dict(
386
+ name="right_ring_finger_third_joint",
387
+ id=36,
388
+ color=[51, 153, 255],
389
+ type="upper",
390
+ swap="left_ring_finger_third_joint",
391
+ ),
392
+ 37: dict(
393
+ name="right_pinky_finger4",
394
+ id=37,
395
+ color=[51, 153, 255],
396
+ type="upper",
397
+ swap="left_pinky_finger4",
398
+ ),
399
+ 38: dict(
400
+ name="right_pinky_finger3",
401
+ id=38,
402
+ color=[51, 153, 255],
403
+ type="upper",
404
+ swap="left_pinky_finger3",
405
+ ),
406
+ 39: dict(
407
+ name="right_pinky_finger2",
408
+ id=39,
409
+ color=[51, 153, 255],
410
+ type="upper",
411
+ swap="left_pinky_finger2",
412
+ ),
413
+ 40: dict(
414
+ name="right_pinky_finger_third_joint",
415
+ id=40,
416
+ color=[51, 153, 255],
417
+ type="upper",
418
+ swap="left_pinky_finger_third_joint",
419
+ ),
420
+ 41: dict(
421
+ name="right_wrist",
422
+ id=41,
423
+ color=[51, 153, 255],
424
+ type="upper",
425
+ swap="left_wrist",
426
+ ),
427
+ 42: dict(
428
+ name="left_thumb4",
429
+ id=42,
430
+ color=[51, 153, 255],
431
+ type="upper",
432
+ swap="right_thumb4",
433
+ ),
434
+ 43: dict(
435
+ name="left_thumb3",
436
+ id=43,
437
+ color=[51, 153, 255],
438
+ type="upper",
439
+ swap="right_thumb3",
440
+ ),
441
+ 44: dict(
442
+ name="left_thumb2",
443
+ id=44,
444
+ color=[51, 153, 255],
445
+ type="upper",
446
+ swap="right_thumb2",
447
+ ),
448
+ 45: dict(
449
+ name="left_thumb_third_joint",
450
+ id=45,
451
+ color=[51, 153, 255],
452
+ type="upper",
453
+ swap="right_thumb_third_joint",
454
+ ), ## doesn't match with wholebody
455
+ 46: dict(
456
+ name="left_forefinger4",
457
+ id=46,
458
+ color=[51, 153, 255],
459
+ type="upper",
460
+ swap="right_forefinger4",
461
+ ),
462
+ 47: dict(
463
+ name="left_forefinger3",
464
+ id=47,
465
+ color=[51, 153, 255],
466
+ type="upper",
467
+ swap="right_forefinger3",
468
+ ),
469
+ 48: dict(
470
+ name="left_forefinger2",
471
+ id=48,
472
+ color=[51, 153, 255],
473
+ type="upper",
474
+ swap="right_forefinger2",
475
+ ),
476
+ 49: dict(
477
+ name="left_forefinger_third_joint",
478
+ id=49,
479
+ color=[51, 153, 255],
480
+ type="upper",
481
+ swap="right_forefinger_third_joint",
482
+ ),
483
+ 50: dict(
484
+ name="left_middle_finger4",
485
+ id=50,
486
+ color=[51, 153, 255],
487
+ type="upper",
488
+ swap="right_middle_finger4",
489
+ ),
490
+ 51: dict(
491
+ name="left_middle_finger3",
492
+ id=51,
493
+ color=[51, 153, 255],
494
+ type="upper",
495
+ swap="right_middle_finger3",
496
+ ),
497
+ 52: dict(
498
+ name="left_middle_finger2",
499
+ id=52,
500
+ color=[51, 153, 255],
501
+ type="upper",
502
+ swap="right_middle_finger2",
503
+ ),
504
+ 53: dict(
505
+ name="left_middle_finger_third_joint",
506
+ id=53,
507
+ color=[51, 153, 255],
508
+ type="upper",
509
+ swap="right_middle_finger_third_joint",
510
+ ),
511
+ 54: dict(
512
+ name="left_ring_finger4",
513
+ id=54,
514
+ color=[51, 153, 255],
515
+ type="upper",
516
+ swap="right_ring_finger4",
517
+ ),
518
+ 55: dict(
519
+ name="left_ring_finger3",
520
+ id=55,
521
+ color=[51, 153, 255],
522
+ type="upper",
523
+ swap="right_ring_finger3",
524
+ ),
525
+ 56: dict(
526
+ name="left_ring_finger2",
527
+ id=56,
528
+ color=[51, 153, 255],
529
+ type="upper",
530
+ swap="right_ring_finger2",
531
+ ),
532
+ 57: dict(
533
+ name="left_ring_finger_third_joint",
534
+ id=57,
535
+ color=[51, 153, 255],
536
+ type="upper",
537
+ swap="right_ring_finger_third_joint",
538
+ ),
539
+ 58: dict(
540
+ name="left_pinky_finger4",
541
+ id=58,
542
+ color=[51, 153, 255],
543
+ type="upper",
544
+ swap="right_pinky_finger4",
545
+ ),
546
+ 59: dict(
547
+ name="left_pinky_finger3",
548
+ id=59,
549
+ color=[51, 153, 255],
550
+ type="upper",
551
+ swap="right_pinky_finger3",
552
+ ),
553
+ 60: dict(
554
+ name="left_pinky_finger2",
555
+ id=60,
556
+ color=[51, 153, 255],
557
+ type="upper",
558
+ swap="right_pinky_finger2",
559
+ ),
560
+ 61: dict(
561
+ name="left_pinky_finger_third_joint",
562
+ id=61,
563
+ color=[51, 153, 255],
564
+ type="upper",
565
+ swap="right_pinky_finger_third_joint",
566
+ ),
567
+ 62: dict(
568
+ name="left_wrist",
569
+ id=62,
570
+ color=[51, 153, 255],
571
+ type="upper",
572
+ swap="right_wrist",
573
+ ),
574
+ 63: dict(
575
+ name="left_olecranon",
576
+ id=63,
577
+ color=[51, 153, 255],
578
+ type="",
579
+ swap="right_olecranon",
580
+ ),
581
+ 64: dict(
582
+ name="right_olecranon",
583
+ id=64,
584
+ color=[51, 153, 255],
585
+ type="",
586
+ swap="left_olecranon",
587
+ ),
588
+ 65: dict(
589
+ name="left_cubital_fossa",
590
+ id=65,
591
+ color=[51, 153, 255],
592
+ type="",
593
+ swap="right_cubital_fossa",
594
+ ),
595
+ 66: dict(
596
+ name="right_cubital_fossa",
597
+ id=66,
598
+ color=[51, 153, 255],
599
+ type="",
600
+ swap="left_cubital_fossa",
601
+ ),
602
+ 67: dict(
603
+ name="left_acromion",
604
+ id=67,
605
+ color=[51, 153, 255],
606
+ type="",
607
+ swap="right_acromion",
608
+ ),
609
+ 68: dict(
610
+ name="right_acromion",
611
+ id=68,
612
+ color=[51, 153, 255],
613
+ type="",
614
+ swap="left_acromion",
615
+ ),
616
+ 69: dict(name="neck", id=69, color=[51, 153, 255], type="", swap=""),
617
+ },
618
+ skeleton_info={
619
+ 0: dict(link=("left_ankle", "left_knee"), id=0, color=[0, 255, 0]),
620
+ 1: dict(link=("left_knee", "left_hip"), id=1, color=[0, 255, 0]),
621
+ 2: dict(link=("right_ankle", "right_knee"), id=2, color=[255, 128, 0]),
622
+ 3: dict(link=("right_knee", "right_hip"), id=3, color=[255, 128, 0]),
623
+ 4: dict(link=("left_hip", "right_hip"), id=4, color=[51, 153, 255]),
624
+ 5: dict(link=("left_shoulder", "left_hip"), id=5, color=[51, 153, 255]),
625
+ 6: dict(link=("right_shoulder", "right_hip"), id=6, color=[51, 153, 255]),
626
+ 7: dict(link=("left_shoulder", "right_shoulder"), id=7, color=[51, 153, 255]),
627
+ 8: dict(link=("left_shoulder", "left_elbow"), id=8, color=[0, 255, 0]),
628
+ 9: dict(link=("right_shoulder", "right_elbow"), id=9, color=[255, 128, 0]),
629
+ 10: dict(link=("left_elbow", "left_wrist"), id=10, color=[0, 255, 0]),
630
+ 11: dict(link=("right_elbow", "right_wrist"), id=11, color=[255, 128, 0]),
631
+ 12: dict(link=("left_eye", "right_eye"), id=12, color=[51, 153, 255]),
632
+ 13: dict(link=("nose", "left_eye"), id=13, color=[51, 153, 255]),
633
+ 14: dict(link=("nose", "right_eye"), id=14, color=[51, 153, 255]),
634
+ 15: dict(link=("left_eye", "left_ear"), id=15, color=[51, 153, 255]),
635
+ 16: dict(link=("right_eye", "right_ear"), id=16, color=[51, 153, 255]),
636
+ 17: dict(link=("left_ear", "left_shoulder"), id=17, color=[51, 153, 255]),
637
+ 18: dict(link=("right_ear", "right_shoulder"), id=18, color=[51, 153, 255]),
638
+ 19: dict(link=("left_ankle", "left_big_toe"), id=19, color=[0, 255, 0]),
639
+ 20: dict(link=("left_ankle", "left_small_toe"), id=20, color=[0, 255, 0]),
640
+ 21: dict(link=("left_ankle", "left_heel"), id=21, color=[0, 255, 0]),
641
+ 22: dict(link=("right_ankle", "right_big_toe"), id=22, color=[255, 128, 0]),
642
+ 23: dict(link=("right_ankle", "right_small_toe"), id=23, color=[255, 128, 0]),
643
+ 24: dict(link=("right_ankle", "right_heel"), id=24, color=[255, 128, 0]),
644
+ 25: dict(
645
+ link=("left_wrist", "left_thumb_third_joint"), id=25, color=[255, 128, 0]
646
+ ),
647
+ 26: dict(
648
+ link=("left_thumb_third_joint", "left_thumb2"), id=26, color=[255, 128, 0]
649
+ ),
650
+ 27: dict(link=("left_thumb2", "left_thumb3"), id=27, color=[255, 128, 0]),
651
+ 28: dict(link=("left_thumb3", "left_thumb4"), id=28, color=[255, 128, 0]),
652
+ 29: dict(
653
+ link=("left_wrist", "left_forefinger_third_joint"),
654
+ id=29,
655
+ color=[255, 153, 255],
656
+ ),
657
+ 30: dict(
658
+ link=("left_forefinger_third_joint", "left_forefinger2"),
659
+ id=30,
660
+ color=[255, 153, 255],
661
+ ),
662
+ 31: dict(
663
+ link=("left_forefinger2", "left_forefinger3"), id=31, color=[255, 153, 255]
664
+ ),
665
+ 32: dict(
666
+ link=("left_forefinger3", "left_forefinger4"), id=32, color=[255, 153, 255]
667
+ ),
668
+ 33: dict(
669
+ link=("left_wrist", "left_middle_finger_third_joint"),
670
+ id=33,
671
+ color=[102, 178, 255],
672
+ ),
673
+ 34: dict(
674
+ link=("left_middle_finger_third_joint", "left_middle_finger2"),
675
+ id=34,
676
+ color=[102, 178, 255],
677
+ ),
678
+ 35: dict(
679
+ link=("left_middle_finger2", "left_middle_finger3"),
680
+ id=35,
681
+ color=[102, 178, 255],
682
+ ),
683
+ 36: dict(
684
+ link=("left_middle_finger3", "left_middle_finger4"),
685
+ id=36,
686
+ color=[102, 178, 255],
687
+ ),
688
+ 37: dict(
689
+ link=("left_wrist", "left_ring_finger_third_joint"),
690
+ id=37,
691
+ color=[255, 51, 51],
692
+ ),
693
+ 38: dict(
694
+ link=("left_ring_finger_third_joint", "left_ring_finger2"),
695
+ id=38,
696
+ color=[255, 51, 51],
697
+ ),
698
+ 39: dict(
699
+ link=("left_ring_finger2", "left_ring_finger3"), id=39, color=[255, 51, 51]
700
+ ),
701
+ 40: dict(
702
+ link=("left_ring_finger3", "left_ring_finger4"), id=40, color=[255, 51, 51]
703
+ ),
704
+ 41: dict(
705
+ link=("left_wrist", "left_pinky_finger_third_joint"),
706
+ id=41,
707
+ color=[0, 255, 0],
708
+ ),
709
+ 42: dict(
710
+ link=("left_pinky_finger_third_joint", "left_pinky_finger2"),
711
+ id=42,
712
+ color=[0, 255, 0],
713
+ ),
714
+ 43: dict(
715
+ link=("left_pinky_finger2", "left_pinky_finger3"), id=43, color=[0, 255, 0]
716
+ ),
717
+ 44: dict(
718
+ link=("left_pinky_finger3", "left_pinky_finger4"), id=44, color=[0, 255, 0]
719
+ ),
720
+ 45: dict(
721
+ link=("right_wrist", "right_thumb_third_joint"), id=45, color=[255, 128, 0]
722
+ ),
723
+ 46: dict(
724
+ link=("right_thumb_third_joint", "right_thumb2"), id=46, color=[255, 128, 0]
725
+ ),
726
+ 47: dict(link=("right_thumb2", "right_thumb3"), id=47, color=[255, 128, 0]),
727
+ 48: dict(link=("right_thumb3", "right_thumb4"), id=48, color=[255, 128, 0]),
728
+ 49: dict(
729
+ link=("right_wrist", "right_forefinger_third_joint"),
730
+ id=49,
731
+ color=[255, 153, 255],
732
+ ),
733
+ 50: dict(
734
+ link=("right_forefinger_third_joint", "right_forefinger2"),
735
+ id=50,
736
+ color=[255, 153, 255],
737
+ ),
738
+ 51: dict(
739
+ link=("right_forefinger2", "right_forefinger3"),
740
+ id=51,
741
+ color=[255, 153, 255],
742
+ ),
743
+ 52: dict(
744
+ link=("right_forefinger3", "right_forefinger4"),
745
+ id=52,
746
+ color=[255, 153, 255],
747
+ ),
748
+ 53: dict(
749
+ link=("right_wrist", "right_middle_finger_third_joint"),
750
+ id=53,
751
+ color=[102, 178, 255],
752
+ ),
753
+ 54: dict(
754
+ link=("right_middle_finger_third_joint", "right_middle_finger2"),
755
+ id=54,
756
+ color=[102, 178, 255],
757
+ ),
758
+ 55: dict(
759
+ link=("right_middle_finger2", "right_middle_finger3"),
760
+ id=55,
761
+ color=[102, 178, 255],
762
+ ),
763
+ 56: dict(
764
+ link=("right_middle_finger3", "right_middle_finger4"),
765
+ id=56,
766
+ color=[102, 178, 255],
767
+ ),
768
+ 57: dict(
769
+ link=("right_wrist", "right_ring_finger_third_joint"),
770
+ id=57,
771
+ color=[255, 51, 51],
772
+ ),
773
+ 58: dict(
774
+ link=("right_ring_finger_third_joint", "right_ring_finger2"),
775
+ id=58,
776
+ color=[255, 51, 51],
777
+ ),
778
+ 59: dict(
779
+ link=("right_ring_finger2", "right_ring_finger3"),
780
+ id=59,
781
+ color=[255, 51, 51],
782
+ ),
783
+ 60: dict(
784
+ link=("right_ring_finger3", "right_ring_finger4"),
785
+ id=60,
786
+ color=[255, 51, 51],
787
+ ),
788
+ 61: dict(
789
+ link=("right_wrist", "right_pinky_finger_third_joint"),
790
+ id=61,
791
+ color=[0, 255, 0],
792
+ ),
793
+ 62: dict(
794
+ link=("right_pinky_finger_third_joint", "right_pinky_finger2"),
795
+ id=62,
796
+ color=[0, 255, 0],
797
+ ),
798
+ 63: dict(
799
+ link=("right_pinky_finger2", "right_pinky_finger3"),
800
+ id=63,
801
+ color=[0, 255, 0],
802
+ ),
803
+ 64: dict(
804
+ link=("right_pinky_finger3", "right_pinky_finger4"),
805
+ id=64,
806
+ color=[0, 255, 0],
807
+ ),
808
+ },
809
+ joint_weights=[1.0] * 70,
810
+ body_keypoint_names=[
811
+ "nose",
812
+ "left_eye",
813
+ "right_eye",
814
+ "left_ear",
815
+ "right_ear",
816
+ "left_shoulder",
817
+ "right_shoulder",
818
+ "left_elbow",
819
+ "right_elbow",
820
+ "left_wrist",
821
+ "right_wrist",
822
+ "left_hip",
823
+ "right_hip",
824
+ "left_knee",
825
+ "right_knee",
826
+ "left_ankle",
827
+ "right_ankle",
828
+ ],
829
+ foot_keypoint_names=[
830
+ "left_big_toe",
831
+ "left_small_toe",
832
+ "left_heel",
833
+ "right_big_toe",
834
+ "right_small_toe",
835
+ "right_heel",
836
+ ],
837
+ left_hand_keypoint_names=[
838
+ "left_thumb4",
839
+ "left_thumb3",
840
+ "left_thumb2",
841
+ "left_thumb_third_joint",
842
+ "left_forefinger4",
843
+ "left_forefinger3",
844
+ "left_forefinger2",
845
+ "left_forefinger_third_joint",
846
+ "left_middle_finger4",
847
+ "left_middle_finger3",
848
+ "left_middle_finger2",
849
+ "left_middle_finger_third_joint",
850
+ "left_ring_finger4",
851
+ "left_ring_finger3",
852
+ "left_ring_finger2",
853
+ "left_ring_finger_third_joint",
854
+ "left_pinky_finger4",
855
+ "left_pinky_finger3",
856
+ "left_pinky_finger2",
857
+ "left_pinky_finger_third_joint",
858
+ ],
859
+ right_hand_keypoint_names=[
860
+ "right_thumb4",
861
+ "right_thumb3",
862
+ "right_thumb2",
863
+ "right_thumb_third_joint",
864
+ "right_forefinger4",
865
+ "right_forefinger3",
866
+ "right_forefinger2",
867
+ "right_forefinger_third_joint",
868
+ "right_middle_finger4",
869
+ "right_middle_finger3",
870
+ "right_middle_finger2",
871
+ "right_middle_finger_third_joint",
872
+ "right_ring_finger4",
873
+ "right_ring_finger3",
874
+ "right_ring_finger2",
875
+ "right_ring_finger_third_joint",
876
+ "right_pinky_finger4",
877
+ "right_pinky_finger3",
878
+ "right_pinky_finger2",
879
+ "right_pinky_finger_third_joint",
880
+ ],
881
+ ## 7 extra keypoints beyond the body/foot/hand groups
882
+ extra_keypoint_names=[
883
+ "neck",
884
+ "left_olecranon",
885
+ "right_olecranon",
886
+ "left_cubital_fossa",
887
+ "right_cubital_fossa",
888
+ "left_acromion",
889
+ "right_acromion",
890
+ ],
891
+ sigmas=[],
892
+ )
893
+
894
+ # Rerun-friendly helpers ----------------------------------------------------
895
+ # These mirror the COCO-133 helpers exposed by ``simplecv.data.skeleton.coco_133``
896
+ # so downstream code can build annotation contexts without re-deriving names/links.
897
+
898
+ MHR70_ID2NAME: Final[dict[int, str]] = {
899
+ idx: info["name"] for idx, info in pose_info["keypoint_info"].items()
900
+ }
901
+
902
+ MHR70_IDS: Final[list[int]] = sorted(MHR70_ID2NAME.keys())
903
+
904
+ _NAME_TO_ID = {name: idx for idx, name in MHR70_ID2NAME.items()}
905
+ MHR70_LINKS: Final[list[tuple[int, int]]] = [
906
+ (_NAME_TO_ID[link_info["link"][0]], _NAME_TO_ID[link_info["link"][1]])
907
+ for link_info in pose_info["skeleton_info"].values()
908
+ ]
909
+
910
+ __all__ = [
911
+ "pose_info",
912
+ "MHR70_ID2NAME",
913
+ "MHR70_IDS",
914
+ "MHR70_LINKS",
915
+ ]
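
A minimal usage sketch for the helpers above, assuming a recent rerun-sdk and a root-level annotation context (the app id and entity path are illustrative, not part of this module):

    import rerun as rr
    from sam3d_body.metadata.mhr70 import MHR70_ID2NAME, MHR70_IDS, MHR70_LINKS

    # Describe a single "person" class whose keypoints reuse the MHR-70 names/edges.
    person = rr.ClassDescription(
        info=rr.AnnotationInfo(id=0, label="person"),
        keypoint_annotations=[
            rr.AnnotationInfo(id=idx, label=name) for idx, name in MHR70_ID2NAME.items()
        ],
        keypoint_connections=MHR70_LINKS,
    )

    rr.init("mhr70_skeleton_demo", spawn=True)
    rr.log("/", rr.AnnotationContext([person]), static=True)
    # Points2D/Points3D logged with class_ids=0 and keypoint_ids=MHR70_IDS will then
    # be rendered with these labels and skeleton connections.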
src/sam3d_body/models/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
src/sam3d_body/models/backbones/__init__.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+
4
+ def create_backbone(name, cfg=None):
5
+ if name in ["vit_hmr"]:
6
+ from .vit import vit
7
+
8
+ backbone = vit(cfg)
9
+ elif name in ["vit_hmr_512_384"]:
10
+ from .vit import vit512_384
11
+
12
+ backbone = vit512_384(cfg)
13
+ elif name in ["vit_l"]:
14
+ from .vit import vit_l
15
+
16
+ backbone = vit_l(cfg)
17
+ elif name in ["vit_b"]:
18
+ from .vit import vit_b
19
+
20
+ backbone = vit_b(cfg)
21
+ elif name in [
22
+ "dinov3_vit7b",
23
+ "dinov3_vith16plus",
24
+ "dinov3_vits16",
25
+ "dinov3_vits16plus",
26
+ "dinov3_vitb16",
27
+ "dinov3_vitl16",
28
+ ]:
29
+ from .dinov3 import Dinov3Backbone
30
+
31
+ backbone = Dinov3Backbone(name, cfg=cfg)
32
+ else:
33
+ raise NotImplementedError("Backbone type is not implemented")
34
+
35
+ return backbone
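
A hedged sketch of how this factory is typically driven by a yacs config; the field values below are illustrative assumptions:

    import torch
    from yacs.config import CfgNode as CN

    from sam3d_body.models.backbones import create_backbone

    cfg = CN()
    cfg.MODEL = CN()
    cfg.MODEL.BACKBONE = CN()
    cfg.MODEL.BACKBONE.FROZEN_STAGES = -1   # keep every block trainable
    cfg.MODEL.BACKBONE.FLASH_ATTN = False   # fall back to the vanilla attention path

    backbone = create_backbone("vit_b", cfg)        # ViT-B/16 over 256x192 crops
    feats = backbone(torch.zeros(1, 3, 256, 192))   # -> [1, 768, 16, 12] feature map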
src/sam3d_body/models/backbones/dinov3.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import torch
4
+ from torch import nn
5
+
6
+
7
+ class Dinov3Backbone(nn.Module):
8
+ def __init__(
9
+ self, name="dinov2_vitb14", pretrained_weight=None, cfg=None, *args, **kwargs
10
+ ):
11
+ super().__init__()
12
+ self.name = name
13
+ self.cfg = cfg
14
+
15
+ self.encoder = torch.hub.load(
16
+ "facebookresearch/dinov3",
17
+ self.name,
18
+ source="github",
19
+ pretrained=False,
20
+ drop_path=self.cfg.MODEL.BACKBONE.DROP_PATH_RATE,
21
+ )
22
+ self.patch_size = self.encoder.patch_size
23
+ self.embed_dim = self.embed_dims = self.encoder.embed_dim
24
+
25
+ def forward(self, x, extra_embed=None):
26
+ """
27
+ Encode an RGB image using a ViT backbone.
28
+ Args:
29
+ - x: torch.Tensor of shape [bs,3,h,w]
30
+ Return:
31
+ - y: torch.Tensor of shape [bs,d,h/patch_size,w/patch_size] - reshaped feature map
32
+ """
33
+ assert extra_embed is None, "Not Implemented Yet"
34
+
35
+ y = self.encoder.get_intermediate_layers(x, n=1, reshape=True, norm=True)[-1]
36
+
37
+ return y
38
+
39
+ def get_layer_depth(self, param_name: str, prefix: str = "encoder."):
40
+ """Get the layer-wise depth of a parameter.
41
+ Args:
42
+ param_name (str): The name of the parameter.
43
+ prefix (str): The prefix for the parameter.
44
+ Defaults to ``"encoder."``.
45
+ Returns:
46
+ Tuple[int, int]: The layer-wise depth and the num of layers.
47
+ Note:
48
+ The first depth is the stem module (``layer_depth=0``), and the
49
+ last depth is the subsequent module (``layer_depth=num_layers-1``)
50
+ """
51
+ num_layers = self.encoder.n_blocks + 2
52
+
53
+ if not param_name.startswith(prefix):
54
+ # For subsequent module like head
55
+ return num_layers - 1, num_layers
56
+
57
+ param_name = param_name[len(prefix) :]
58
+
59
+ if param_name in ("cls_token", "pos_embed", "storage_tokens"):
60
+ layer_depth = 0
61
+ elif param_name.startswith("patch_embed"):
62
+ layer_depth = 0
63
+ elif param_name.startswith("blocks"):
64
+ layer_id = int(param_name.split(".")[1])
65
+ layer_depth = layer_id + 1
66
+ else:
67
+ layer_depth = num_layers - 1
68
+
69
+ return layer_depth, num_layers
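
get_layer_depth is the hook for layer-wise learning-rate decay; a sketch of that wiring (the helper name, decay factor, base LR, and optimizer choice are assumptions):

    import torch

    def layerwise_param_groups(backbone, base_lr=1e-4, decay=0.75):
        """Scale each parameter's LR by decay ** (num_layers - 1 - depth)."""
        groups = []
        for name, param in backbone.named_parameters():
            if not param.requires_grad:
                continue
            depth, num_layers = backbone.get_layer_depth(name)
            scale = decay ** (num_layers - 1 - depth)
            groups.append({"params": [param], "lr": base_lr * scale})
        return groups

    # optimizer = torch.optim.AdamW(layerwise_param_groups(dinov3_backbone))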
src/sam3d_body/models/backbones/vit.py ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from functools import partial
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import torch.utils.checkpoint as checkpoint
9
+
10
+ try:
11
+ from flash_attn.flash_attn_interface import flash_attn_func
12
+ except ImportError:
13
+ print("No Flash Attention!")
14
+
15
+ from timm.models.layers import drop_path, to_2tuple, trunc_normal_
16
+
17
+ from ..modules.transformer import LayerNorm32
18
+
19
+
20
+ def vit(cfg):
21
+ return ViT(
22
+ img_size=(256, 192),
23
+ patch_size=16,
24
+ embed_dim=1280,
25
+ depth=32,
26
+ num_heads=16,
27
+ ratio=1,
28
+ norm_layer=LayerNorm32,
29
+ use_checkpoint=False,
30
+ mlp_ratio=4,
31
+ qkv_bias=True,
32
+ drop_path_rate=0.55,
33
+ frozen_stages=cfg.MODEL.BACKBONE.get("FROZEN_STAGES", -1),
34
+ flash_attn=cfg.MODEL.BACKBONE.get("FLASH_ATTN", False),
35
+ )
36
+
37
+
38
+ def vit_l(cfg):
39
+ return ViT(
40
+ img_size=(256, 192),
41
+ patch_size=16,
42
+ embed_dim=1024,
43
+ depth=24,
44
+ num_heads=16,
45
+ ratio=1,
46
+ norm_layer=LayerNorm32,
47
+ use_checkpoint=False,
48
+ mlp_ratio=4,
49
+ qkv_bias=True,
50
+ drop_path_rate=0.55,
51
+ frozen_stages=cfg.MODEL.BACKBONE.get("FROZEN_STAGES", -1),
52
+ flash_attn=cfg.MODEL.BACKBONE.get("FLASH_ATTN", False),
53
+ )
54
+
55
+
56
+ def vit_b(cfg):
57
+ return ViT(
58
+ img_size=(256, 192),
59
+ patch_size=16,
60
+ embed_dim=768,
61
+ depth=12,
62
+ num_heads=12,
63
+ ratio=1,
64
+ norm_layer=LayerNorm32,
65
+ use_checkpoint=False,
66
+ mlp_ratio=4,
67
+ qkv_bias=True,
68
+ drop_path_rate=0.3,
69
+ frozen_stages=cfg.MODEL.BACKBONE.get("FROZEN_STAGES", -1),
70
+ flash_attn=cfg.MODEL.BACKBONE.get("FLASH_ATTN", False),
71
+ )
72
+
73
+
74
+ def vit256(cfg):
75
+ return ViT(
76
+ img_size=(256, 256),
77
+ patch_size=16,
78
+ embed_dim=1280,
79
+ depth=32,
80
+ num_heads=16,
81
+ ratio=1,
82
+ norm_layer=LayerNorm32,
83
+ use_checkpoint=False,
84
+ mlp_ratio=4,
85
+ qkv_bias=True,
86
+ drop_path_rate=0.55,
87
+ frozen_stages=cfg.MODEL.BACKBONE.get("FROZEN_STAGES", -1),
88
+ flash_attn=cfg.MODEL.BACKBONE.get("FLASH_ATTN", False),
89
+ )
90
+
91
+
92
+ def vit512_384(cfg):
93
+ return ViT(
94
+ img_size=(512, 384),
95
+ patch_size=16,
96
+ embed_dim=1280,
97
+ depth=32,
98
+ num_heads=16,
99
+ ratio=1,
100
+ norm_layer=LayerNorm32,
101
+ use_checkpoint=False,
102
+ mlp_ratio=4,
103
+ qkv_bias=True,
104
+ drop_path_rate=0.55,
105
+ frozen_stages=cfg.MODEL.BACKBONE.get("FROZEN_STAGES", -1),
106
+ flash_attn=cfg.MODEL.BACKBONE.get("FLASH_ATTN", False),
107
+ )
108
+
109
+
110
+ def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True):
111
+ """
112
+ Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
113
+ dimension for the original embeddings.
114
+ Args:
115
+ abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
116
+ has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
117
+ h, w (int): target token grid size; ori_h, ori_w (int): original token grid size.
118
+
119
+ Returns:
120
+ Absolute positional embeddings after processing, with shape (1, num_position, C)
121
+ """
122
+ cls_token = None
123
+ B, L, C = abs_pos.shape
124
+ if has_cls_token:
125
+ cls_token = abs_pos[:, 0:1]
126
+ abs_pos = abs_pos[:, 1:]
127
+
128
+ if ori_h != h or ori_w != w:
129
+ new_abs_pos = (
130
+ F.interpolate(
131
+ abs_pos.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2),
132
+ size=(h, w),
133
+ mode="bicubic",
134
+ align_corners=False,
135
+ )
136
+ .permute(0, 2, 3, 1)
137
+ .reshape(B, -1, C)
138
+ )
139
+
140
+ else:
141
+ new_abs_pos = abs_pos
142
+
143
+ if cls_token is not None:
144
+ new_abs_pos = torch.cat([cls_token, new_abs_pos], dim=1)
145
+ return new_abs_pos
146
+
147
+
148
+ class DropPath(nn.Module):
149
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
150
+
151
+ def __init__(self, drop_prob=None):
152
+ super(DropPath, self).__init__()
153
+ self.drop_prob = drop_prob
154
+
155
+ def forward(self, x):
156
+ return drop_path(x, self.drop_prob, self.training)
157
+
158
+ def extra_repr(self):
159
+ return "p={}".format(self.drop_prob)
160
+
161
+
162
+ class Mlp(nn.Module):
163
+ def __init__(
164
+ self,
165
+ in_features,
166
+ hidden_features=None,
167
+ out_features=None,
168
+ act_layer=nn.GELU,
169
+ drop=0.0,
170
+ ):
171
+ super().__init__()
172
+ out_features = out_features or in_features
173
+ hidden_features = hidden_features or in_features
174
+ self.fc1 = nn.Linear(in_features, hidden_features)
175
+ self.act = act_layer()
176
+ self.fc2 = nn.Linear(hidden_features, out_features)
177
+ self.drop = nn.Dropout(drop)
178
+
179
+ def forward(self, x):
180
+ x = self.fc1(x)
181
+ x = self.act(x)
182
+ x = self.fc2(x)
183
+ x = self.drop(x)
184
+ return x
185
+
186
+
187
+ class Attention(nn.Module):
188
+ def __init__(
189
+ self,
190
+ dim,
191
+ num_heads=8,
192
+ qkv_bias=False,
193
+ qk_scale=None,
194
+ attn_drop=0.0,
195
+ proj_drop=0.0,
196
+ attn_head_dim=None,
197
+ ):
198
+ super().__init__()
199
+ self.num_heads = num_heads
200
+ head_dim = dim // num_heads
201
+ self.dim = dim
202
+
203
+ if attn_head_dim is not None:
204
+ head_dim = attn_head_dim
205
+ all_head_dim = head_dim * self.num_heads
206
+
207
+ self.scale = qk_scale or head_dim**-0.5
208
+
209
+ self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias)
210
+
211
+ self.attn_drop = nn.Dropout(attn_drop)
212
+ self.proj = nn.Linear(all_head_dim, dim)
213
+ self.proj_drop = nn.Dropout(proj_drop)
214
+
215
+ def forward(self, x):
216
+ B, N, C = x.shape
217
+ qkv = self.qkv(x)
218
+ qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
219
+ q, k, v = (
220
+ qkv[0],
221
+ qkv[1],
222
+ qkv[2],
223
+ ) # make torchscript happy (cannot use tensor as tuple)
224
+
225
+ q = q * self.scale
226
+ attn = q @ k.transpose(-2, -1)
227
+
228
+ attn = attn.softmax(dim=-1)
229
+ attn = self.attn_drop(attn)
230
+
231
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
232
+ x = self.proj(x)
233
+ x = self.proj_drop(x)
234
+
235
+ return x
236
+
237
+
238
+ class FlashAttention(nn.Module):
239
+ def __init__(
240
+ self,
241
+ dim,
242
+ num_heads=8,
243
+ qkv_bias=False,
244
+ qk_scale=None,
245
+ attn_drop=0.0,
246
+ proj_drop=0.0,
247
+ attn_head_dim=None,
248
+ ):
249
+ super().__init__()
250
+ self.num_heads = num_heads
251
+ head_dim = attn_head_dim or (dim // num_heads)
252
+ self.head_dim = head_dim
253
+ self.dim = dim
254
+ self.qkv = nn.Linear(dim, head_dim * num_heads * 3, bias=qkv_bias)
255
+ self.proj = nn.Linear(head_dim * num_heads, dim)
256
+ self.proj_drop = nn.Dropout(proj_drop)
257
+ self.attn_drop = attn_drop
258
+
259
+ def forward(self, x):
260
+ B, N, C = x.shape # (batch, sequence_length, embedding_dim)
261
+
262
+ qkv = self.qkv(x) # (B, N, 3 * num_heads * head_dim)
263
+ qkv = qkv.view(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
264
+ q, k, v = qkv[0], qkv[1], qkv[2] # each: (B, num_heads, N, head_dim)
265
+
266
+ # FlashAttention expects (B, N, num_heads, head_dim)
267
+ q = q.transpose(1, 2).contiguous()
268
+ k = k.transpose(1, 2).contiguous()
269
+ v = v.transpose(1, 2).contiguous()
270
+
271
+ # FlashAttention kernels require fp16/bf16, so cast fp32 inputs down
272
+ if q.dtype == torch.float32:
273
+ q = q.half()
274
+ k = k.half()
275
+ v = v.half()
276
+
277
+ out = flash_attn_func(
278
+ q, k, v, dropout_p=self.attn_drop, causal=False
279
+ ) # (B, N, num_heads, head_dim)
280
+
281
+ # Merge heads and cast back to the input dtype
282
+ out = out.reshape(B, N, -1)
283
+ out = out.to(x.dtype)
284
+ # breakpoint()
285
+ out = self.proj(out)
286
+ out = self.proj_drop(out)
287
+ return out
288
+
289
+
290
+ class Block(nn.Module):
291
+
292
+ def __init__(
293
+ self,
294
+ dim,
295
+ num_heads,
296
+ mlp_ratio=4.0,
297
+ qkv_bias=False,
298
+ qk_scale=None,
299
+ drop=0.0,
300
+ attn_drop=0.0,
301
+ drop_path=0.0,
302
+ act_layer=nn.GELU,
303
+ norm_layer=nn.LayerNorm,
304
+ attn_head_dim=None,
305
+ flash_attn=False,
306
+ ):
307
+ super().__init__()
308
+
309
+ self.norm1 = norm_layer(dim)
310
+ if flash_attn:
311
+ self.attn = FlashAttention(
312
+ dim,
313
+ num_heads=num_heads,
314
+ qkv_bias=qkv_bias,
315
+ qk_scale=qk_scale,
316
+ attn_drop=attn_drop,
317
+ proj_drop=drop,
318
+ attn_head_dim=attn_head_dim,
319
+ )
320
+ else:
321
+ self.attn = Attention(
322
+ dim,
323
+ num_heads=num_heads,
324
+ qkv_bias=qkv_bias,
325
+ qk_scale=qk_scale,
326
+ attn_drop=attn_drop,
327
+ proj_drop=drop,
328
+ attn_head_dim=attn_head_dim,
329
+ )
330
+
331
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
332
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
333
+ self.norm2 = norm_layer(dim)
334
+ mlp_hidden_dim = int(dim * mlp_ratio)
335
+ self.mlp = Mlp(
336
+ in_features=dim,
337
+ hidden_features=mlp_hidden_dim,
338
+ act_layer=act_layer,
339
+ drop=drop,
340
+ )
341
+
342
+ def forward(self, x):
343
+ x = x + self.drop_path(self.attn(self.norm1(x)))
344
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
345
+ return x
346
+
347
+
348
+ class PatchEmbed(nn.Module):
349
+ """Image to Patch Embedding"""
350
+
351
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1):
352
+ super().__init__()
353
+ img_size = to_2tuple(img_size)
354
+ patch_size = to_2tuple(patch_size)
355
+ num_patches = (
356
+ (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (ratio**2)
357
+ )
358
+ self.patch_shape = (
359
+ int(img_size[0] // patch_size[0] * ratio),
360
+ int(img_size[1] // patch_size[1] * ratio),
361
+ )
362
+ self.origin_patch_shape = (
363
+ int(img_size[0] // patch_size[0]),
364
+ int(img_size[1] // patch_size[1]),
365
+ )
366
+ self.img_size = img_size
367
+ self.patch_size = patch_size
368
+ self.num_patches = num_patches
369
+
370
+ self.proj = nn.Conv2d(
371
+ in_chans,
372
+ embed_dim,
373
+ kernel_size=patch_size,
374
+ stride=(patch_size[0] // ratio),
375
+ padding=4 + 2 * (ratio // 2 - 1),
376
+ )
377
+
378
+ def forward(self, x, **kwargs):
379
+ B, C, H, W = x.shape
380
+ x = self.proj(x)
381
+ Hp, Wp = x.shape[2], x.shape[3]
382
+
383
+ x = x.flatten(2).transpose(1, 2)
384
+ return x, (Hp, Wp)
385
+
386
+
387
+ class PatchEmbedNoPadding(nn.Module):
388
+ """Image to Patch Embedding"""
389
+
390
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1):
391
+ super().__init__()
392
+ img_size = to_2tuple(img_size)
393
+ patch_size = to_2tuple(patch_size)
394
+ num_patches = (
395
+ (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (ratio**2)
396
+ )
397
+ self.patch_shape = (
398
+ int(img_size[0] // patch_size[0] * ratio),
399
+ int(img_size[1] // patch_size[1] * ratio),
400
+ )
401
+ self.origin_patch_shape = (
402
+ int(img_size[0] // patch_size[0]),
403
+ int(img_size[1] // patch_size[1]),
404
+ )
405
+ self.img_size = img_size
406
+ self.patch_size = patch_size
407
+ self.num_patches = num_patches
408
+
409
+ self.proj = nn.Conv2d(
410
+ in_chans,
411
+ embed_dim,
412
+ kernel_size=patch_size,
413
+ stride=(patch_size[0] // ratio),
414
+ padding=0,
415
+ )
416
+
417
+ def forward(self, x, **kwargs):
418
+ B, C, H, W = x.shape
419
+ x = self.proj(x)
420
+ Hp, Wp = x.shape[2], x.shape[3]
421
+
422
+ x = x.flatten(2).transpose(1, 2)
423
+ return x, (Hp, Wp)
424
+
425
+
426
+ class HybridEmbed(nn.Module):
427
+ """CNN Feature Map Embedding
428
+ Extract feature map from CNN, flatten, project to embedding dim.
429
+ """
430
+
431
+ def __init__(
432
+ self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768
433
+ ):
434
+ super().__init__()
435
+ assert isinstance(backbone, nn.Module)
436
+ img_size = to_2tuple(img_size)
437
+ self.img_size = img_size
438
+ self.backbone = backbone
439
+ if feature_size is None:
440
+ with torch.no_grad():
441
+ training = backbone.training
442
+ if training:
443
+ backbone.eval()
444
+ o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[
445
+ -1
446
+ ]
447
+ feature_size = o.shape[-2:]
448
+ feature_dim = o.shape[1]
449
+ backbone.train(training)
450
+ else:
451
+ feature_size = to_2tuple(feature_size)
452
+ feature_dim = self.backbone.feature_info.channels()[-1]
453
+ self.num_patches = feature_size[0] * feature_size[1]
454
+ self.proj = nn.Linear(feature_dim, embed_dim)
455
+
456
+ def forward(self, x):
457
+ x = self.backbone(x)[-1]
458
+ x = x.flatten(2).transpose(1, 2)
459
+ x = self.proj(x)
460
+ return x
461
+
462
+
463
+ class ViT(nn.Module):
464
+
465
+ def __init__(
466
+ self,
467
+ img_size=224,
468
+ patch_size=16,
469
+ in_chans=3,
470
+ num_classes=80,
471
+ embed_dim=768,
472
+ depth=12,
473
+ num_heads=12,
474
+ mlp_ratio=4.0,
475
+ qkv_bias=False,
476
+ qk_scale=None,
477
+ drop_rate=0.0,
478
+ attn_drop_rate=0.0,
479
+ drop_path_rate=0.0,
480
+ hybrid_backbone=None,
481
+ norm_layer=None,
482
+ use_checkpoint=False,
483
+ frozen_stages=-1,
484
+ ratio=1,
485
+ last_norm=True,
486
+ patch_padding="pad",
487
+ freeze_attn=False,
488
+ freeze_ffn=False,
489
+ flash_attn=False,
490
+ no_patch_padding=False,
491
+ ):
492
+ # Protect mutable default arguments
493
+ super(ViT, self).__init__()
494
+ norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
495
+ self.num_classes = num_classes
496
+ self.num_features = self.embed_dim = self.embed_dims = (
497
+ embed_dim # num_features for consistency with other models
498
+ )
499
+ self.frozen_stages = frozen_stages
500
+ self.use_checkpoint = use_checkpoint
501
+ self.patch_padding = patch_padding
502
+ self.freeze_attn = freeze_attn
503
+ self.freeze_ffn = freeze_ffn
504
+ self.depth = depth
505
+
506
+ if hybrid_backbone is not None:
507
+ self.patch_embed = HybridEmbed(
508
+ hybrid_backbone,
509
+ img_size=img_size,
510
+ in_chans=in_chans,
511
+ embed_dim=embed_dim,
512
+ )
513
+ else:
514
+ if no_patch_padding:
515
+ self.patch_embed = PatchEmbedNoPadding(
516
+ img_size=img_size,
517
+ patch_size=patch_size,
518
+ in_chans=in_chans,
519
+ embed_dim=embed_dim,
520
+ ratio=ratio,
521
+ )
522
+ else:
523
+ self.patch_embed = PatchEmbed(
524
+ img_size=img_size,
525
+ patch_size=patch_size,
526
+ in_chans=in_chans,
527
+ embed_dim=embed_dim,
528
+ ratio=ratio,
529
+ )
530
+ num_patches = self.patch_embed.num_patches
531
+ self.patch_size = patch_size
532
+
533
+ # since the pretraining model has class token
534
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
535
+
536
+ dpr = [
537
+ x.item() for x in torch.linspace(0, drop_path_rate, depth)
538
+ ] # stochastic depth decay rule
539
+
540
+ self.blocks = nn.ModuleList(
541
+ [
542
+ Block(
543
+ dim=embed_dim,
544
+ num_heads=num_heads,
545
+ mlp_ratio=mlp_ratio,
546
+ qkv_bias=qkv_bias,
547
+ qk_scale=qk_scale,
548
+ drop=drop_rate,
549
+ attn_drop=attn_drop_rate,
550
+ drop_path=dpr[i],
551
+ norm_layer=norm_layer,
552
+ flash_attn=flash_attn,
553
+ )
554
+ for i in range(depth)
555
+ ]
556
+ )
557
+
558
+ self.last_norm = norm_layer(embed_dim) if last_norm else nn.Identity()
559
+
560
+ if self.pos_embed is not None:
561
+ trunc_normal_(self.pos_embed, std=0.02)
562
+
563
+ self._freeze_stages()
564
+
565
+ def _freeze_stages(self):
566
+ """Freeze parameters."""
567
+ if self.frozen_stages >= 0:
568
+ self.patch_embed.eval()
569
+ for param in self.patch_embed.parameters():
570
+ param.requires_grad = False
571
+
572
+ for i in range(1, self.frozen_stages + 1):
573
+ m = self.blocks[i - 1]
574
+ m.eval()
575
+ for param in m.parameters():
576
+ param.requires_grad = False
577
+
578
+ if self.freeze_attn:
579
+ for i in range(0, self.depth):
580
+ m = self.blocks[i]
581
+ m.attn.eval()
582
+ m.norm1.eval()
583
+ for param in m.attn.parameters():
584
+ param.requires_grad = False
585
+ for param in m.norm1.parameters():
586
+ param.requires_grad = False
587
+
588
+ if self.freeze_ffn:
589
+ self.pos_embed.requires_grad = False
590
+ self.patch_embed.eval()
591
+ for param in self.patch_embed.parameters():
592
+ param.requires_grad = False
593
+ for i in range(0, self.depth):
594
+ m = self.blocks[i]
595
+ m.mlp.eval()
596
+ m.norm2.eval()
597
+ for param in m.mlp.parameters():
598
+ param.requires_grad = False
599
+ for param in m.norm2.parameters():
600
+ param.requires_grad = False
601
+
602
+ def init_weights(self):
603
+ """Initialize the weights in backbone.
604
+ Args:
605
+ pretrained (str, optional): Path to pre-trained weights.
606
+ Defaults to None.
607
+ """
608
+
609
+ def _init_weights(m):
610
+ if isinstance(m, nn.Linear):
611
+ trunc_normal_(m.weight, std=0.02)
612
+ if isinstance(m, nn.Linear) and m.bias is not None:
613
+ nn.init.constant_(m.bias, 0)
614
+ elif isinstance(m, nn.LayerNorm):
615
+ nn.init.constant_(m.bias, 0)
616
+ nn.init.constant_(m.weight, 1.0)
617
+
618
+ self.apply(_init_weights)
619
+
620
+ def get_num_layers(self):
621
+ return len(self.blocks)
622
+
623
+ @torch.jit.ignore
624
+ def no_weight_decay(self):
625
+ return {"pos_embed", "cls_token"}
626
+
627
+ def forward_features(self, x, extra_embed=None):
628
+ B, C, H, W = x.shape
629
+ x, (Hp, Wp) = self.patch_embed(x)
630
+
631
+ if self.pos_embed is not None:
632
+ # fit for multiple GPU training
633
+ # since the first element for pos embed (sin-cos manner) is zero, it will cause no difference
634
+ x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1]
635
+
636
+ if extra_embed is not None:
637
+ x = x + extra_embed.flatten(2).transpose(1, 2).to(x)
638
+
639
+ for blk in self.blocks:
640
+ if self.use_checkpoint:
641
+ x = checkpoint.checkpoint(blk, x)
642
+ else:
643
+ x = blk(x)
644
+
645
+ x = self.last_norm(x)
646
+
647
+ xp = x.permute(0, 2, 1).reshape(B, -1, Hp, Wp).contiguous()
648
+
649
+ return xp
650
+
651
+ def forward(self, x, *args, **kwargs):
652
+ x = self.forward_features(x, *args, **kwargs)
653
+ return x
654
+
655
+ def train(self, mode=True):
656
+ """Convert the model into training mode."""
657
+ super().train(mode)
658
+ self._freeze_stages()
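
A quick sanity check of the patch-grid arithmetic behind the 256x192 factories above (a sketch using the ViT-B sizes; the other variants behave the same):

    import torch
    from sam3d_body.models.backbones.vit import PatchEmbed

    embed = PatchEmbed(img_size=(256, 192), patch_size=16, embed_dim=768, ratio=1)
    tokens, (Hp, Wp) = embed(torch.zeros(1, 3, 256, 192))

    # padding = 4 + 2 * (1 // 2 - 1) = 2 and stride = 16, so
    #   Hp = (256 + 2*2 - 16) // 16 + 1 = 16,  Wp = (192 + 2*2 - 16) // 16 + 1 = 12
    print(tokens.shape)        # torch.Size([1, 192, 768])
    print(embed.num_patches)   # 192 -> pos_embed stores 192 + 1 entries (extra cls slot)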
src/sam3d_body/models/decoders/__init__.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from .keypoint_prompt_sampler import build_keypoint_sampler
4
+ from .prompt_encoder import PromptEncoder
5
+ from .promptable_decoder import PromptableDecoder
6
+
7
+
8
+ def build_decoder(cfg, context_dim=None):
9
+ from .promptable_decoder import PromptableDecoder
10
+
11
+ if cfg.TYPE == "sam":
12
+ return PromptableDecoder(
13
+ dims=cfg.DIM,
14
+ context_dims=context_dim,
15
+ depth=cfg.DEPTH,
16
+ num_heads=cfg.HEADS,
17
+ head_dims=cfg.DIM_HEAD,
18
+ mlp_dims=cfg.MLP_DIM,
19
+ layer_scale_init_value=cfg.LAYER_SCALE_INIT,
20
+ drop_rate=cfg.DROP_RATE,
21
+ attn_drop_rate=cfg.ATTN_DROP_RATE,
22
+ drop_path_rate=cfg.DROP_PATH_RATE,
23
+ ffn_type=cfg.FFN_TYPE,
24
+ enable_twoway=cfg.ENABLE_TWOWAY,
25
+ repeat_pe=cfg.REPEAT_PE,
26
+ frozen=cfg.get("FROZEN", False),
27
+ do_interm_preds=cfg.get("DO_INTERM_PREDS", False),
28
+ do_keypoint_tokens=cfg.get("DO_KEYPOINT_TOKENS", False),
29
+ keypoint_token_update=cfg.get("KEYPOINT_TOKEN_UPDATE", None),
30
+ )
31
+ else:
32
+ raise ValueError("Invalid decoder type: ", cfg.TYPE)
src/sam3d_body/models/decoders/keypoint_prompt_sampler.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import random
4
+ from abc import ABC, abstractmethod
5
+ from typing import Dict, List
6
+
7
+ import torch
8
+
9
+ from omegaconf import DictConfig
10
+ from yacs.config import CfgNode
11
+
12
+
13
+ def build_keypoint_sampler(sampler_cfg, prompt_keypoints, keybody_idx):
14
+ sampler_type = sampler_cfg.get("TYPE", "v1")
15
+ if sampler_type == "v1":
16
+ sampler_cls = KeypointSamplerV1
17
+ else:
18
+ raise ValueError("Invalid sampler type: ", sampler_type)
19
+
20
+ return sampler_cls(sampler_cfg, prompt_keypoints, keybody_idx)
21
+
22
+
23
+ class BaseKeypointSampler(ABC):
24
+ @abstractmethod
25
+ def sample(
26
+ self, gt_keypoints: torch.Tensor, pred_keypoints: torch.Tensor, is_train: bool
27
+ ) -> torch.Tensor:
28
+ pass
29
+
30
+ def _get_worst_keypoint(self, distances, keypoint_list):
31
+ # Set distance to -1 for non-promptable keypoints
32
+ cur_dist = torch.ones_like(distances) * -1
33
+ cur_dist[keypoint_list] = distances[keypoint_list]
34
+ keypoint_idx = int(cur_dist.argmax())
35
+ if cur_dist[keypoint_idx] > self.distance_thresh:
36
+ valid_keypoint = True
37
+ else:
38
+ valid_keypoint = False
39
+ return keypoint_idx, valid_keypoint
40
+
41
+ def _get_random_keypoint(self, distances, keypoint_list):
42
+ candidates = [idx for idx in keypoint_list if distances[idx] > 0]
43
+ if len(candidates):
44
+ keypoint_idx = random.choice(candidates)
45
+ valid_keypoint = True
46
+ else:
47
+ keypoint_idx = None
48
+ valid_keypoint = False
49
+ return keypoint_idx, valid_keypoint
50
+
51
+ def _masked_distance(self, x, y, mask=None):
52
+ """
53
+ Args:
54
+ x, y: [B, K, D]
55
+ mask: [B, K]
56
+ Return:
57
+ distances: [K, B]
58
+ """
59
+ distances = (x - y).pow(2).sum(dim=-1)
60
+ if mask is not None:
61
+ distances[mask] = -1
62
+ return distances.T
63
+
64
+
65
+ class KeypointSamplerV1(BaseKeypointSampler):
66
+ def __init__(
67
+ self,
68
+ sampler_cfg: DictConfig | CfgNode,
69
+ prompt_keypoints: Dict,
70
+ keybody_idx: List,
71
+ ):
72
+ self.prompt_keypoints = prompt_keypoints
73
+ self._keybody_idx = keybody_idx
74
+ self._non_keybody_idx = [
75
+ idx for idx in self.prompt_keypoints if idx not in self._keybody_idx
76
+ ]
77
+
78
+ self.keybody_ratio = sampler_cfg.get("KEYBODY_RATIO", 0.8)
79
+ self.worst_ratio = sampler_cfg.get("WORST_RATIO", 0.8)
80
+ self.negative_ratio = sampler_cfg.get("NEGATIVE_RATIO", 0.0)
81
+ self.dummy_ratio = sampler_cfg.get("DUMMY_RATIO", 0.1)
82
+ self.distance_thresh = sampler_cfg.get("DISTANCE_THRESH", 0.0)
83
+
84
+ def sample(
85
+ self,
86
+ gt_keypoints_2d: torch.Tensor,
87
+ pred_keypoints_2d: torch.Tensor,
88
+ is_train: bool = True,
89
+ force_dummy: bool = False,
90
+ ) -> torch.Tensor:
91
+ # Get the distance between each predicted and gt keypoint
92
+ # Elements will be ignored if (1) the gt has low confidence or
93
+ # (2) both the gt and pred are outside of the image
94
+ mask_1 = gt_keypoints_2d[:, :, -1] < 0.5
95
+ mask_2 = (
96
+ (gt_keypoints_2d[:, :, :2] > 0.5) | (gt_keypoints_2d[:, :, :2] < -0.5)
97
+ ).any(dim=-1)
98
+
99
+ # Elements to be ignored
100
+ if not is_train or torch.rand(1).item() > self.negative_ratio:
101
+ mask = mask_1 | mask_2
102
+ # print_base = "positive"
103
+ else:
104
+ mask_3 = (
105
+ (pred_keypoints_2d[:, :, :2] > 0.5)
106
+ | (pred_keypoints_2d[:, :, :2] < -0.5)
107
+ ).any(dim=-1)
108
+ # To include negative prompts
109
+ mask = mask_1 | (mask_2 & mask_3)
110
+ # print_base = "negative"
111
+
112
+ # Get pairwise distances with shape [K, B]
113
+ distances = self._masked_distance(
114
+ pred_keypoints_2d, gt_keypoints_2d[..., :2], mask
115
+ )
116
+
117
+ batch_size = distances.shape[1]
118
+ keypoints_prompt = []
119
+ for b in range(batch_size):
120
+ # print_str = print_base
121
+
122
+ # Decide to get the worst keypoint or a random keypoint
123
+ if not is_train or torch.rand(1).item() < self.worst_ratio:
124
+ sampler = self._get_worst_keypoint
125
+ # print_str += "_worst"
126
+ else:
127
+ sampler = self._get_random_keypoint
128
+ # print_str += "_random"
129
+
130
+ # Decide to prompt keybody keypoints or non-keybody ones
131
+ if not is_train or torch.rand(1).item() < self.keybody_ratio:
132
+ cur_idx = self._keybody_idx
133
+ alt_idx = self._non_keybody_idx
134
+ # print_str += "_keybody"
135
+ else:
136
+ cur_idx = self._non_keybody_idx
137
+ alt_idx = self._keybody_idx
138
+ # print_str += "_nonkey"
139
+
140
+ # Get a valid or dummy prompt
141
+ if not is_train or torch.rand(1).item() > self.dummy_ratio:
142
+ keypoint_idx, valid_keypoint = sampler(distances[:, b], cur_idx)
143
+
144
+ if not valid_keypoint:
145
+ # Try the alternative keypoints
146
+ keypoint_idx, valid_keypoint = self._get_worst_keypoint(
147
+ distances[:, b], alt_idx
148
+ )
149
+ else:
150
+ valid_keypoint = False
151
+
152
+ if valid_keypoint:
153
+ cur_point = gt_keypoints_2d[b, keypoint_idx].clone()
154
+ if torch.any(cur_point[:2] > 0.5) or torch.any(cur_point[:2] < -0.5):
155
+ # Negative prompt --> indicating the predicted keypoint is incorrect
156
+ cur_point[:2] = pred_keypoints_2d[b, keypoint_idx][:2]
157
+ cur_point = torch.clamp(
158
+ cur_point + 0.5, min=0.0, max=1.0
159
+ ) # shift from [-0.5, 0.5] to [0, 1]
160
+ cur_point[-1] = -1
161
+ # print_str += "_negative"
162
+ else:
163
+ cur_point = torch.clamp(
164
+ cur_point + 0.5, min=0.0, max=1.0
165
+ ) # shift from [-0.5, 0.5] to [0, 1]
166
+ cur_point[-1] = self.prompt_keypoints[
167
+ keypoint_idx
168
+ ] # map to prompt_idx
169
+ # print_str += "_positive"
170
+ else:
171
+ cur_point = torch.zeros(3).to(gt_keypoints_2d)
172
+ cur_point[-1] = -2
173
+ # print_str += "_dummy"
174
+
175
+ if force_dummy:
176
+ cur_point = torch.zeros(3).to(gt_keypoints_2d)
177
+ cur_point[-1] = -2
178
+
179
+ keypoints_prompt.append(cur_point)
180
+ # print(print_str)
181
+
182
+ keypoints_prompt = torch.stack(keypoints_prompt, dim=0).view(batch_size, 1, 3)
183
+ return keypoints_prompt
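
A hedged usage sketch of the sampler; the config values, the identity prompt mapping, and the choice of shoulders/hips (ids 5, 6, 9, 10 in the MHR-70 metadata) as the "key body" subset are illustrative assumptions:

    import torch
    from yacs.config import CfgNode as CN

    from sam3d_body.models.decoders import build_keypoint_sampler

    sampler_cfg = CN({"TYPE": "v1", "WORST_RATIO": 0.8, "DUMMY_RATIO": 0.1})
    prompt_keypoints = {i: i for i in range(70)}   # keypoint index -> prompt label
    keybody_idx = [5, 6, 9, 10]                    # shoulders and hips

    sampler = build_keypoint_sampler(sampler_cfg, prompt_keypoints, keybody_idx)

    gt = torch.rand(4, 70, 3) - 0.5     # [B, K, (x, y, conf)], coords in [-0.5, 0.5]
    gt[..., 2] = 1.0                    # mark every ground-truth keypoint as visible
    pred = torch.rand(4, 70, 2) - 0.5   # [B, K, (x, y)] predictions in the same frame

    prompts = sampler.sample(gt, pred, is_train=False)   # -> [4, 1, 3], one prompt per sample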
src/sam3d_body/models/decoders/prompt_encoder.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from typing import Any, Optional, Tuple
4
+
5
+ import numpy as np
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+ from sam3d_body.models.modules.transformer import LayerNorm2d
11
+
12
+
13
+ class PromptEncoder(nn.Module):
14
+ def __init__(
15
+ self,
16
+ embed_dim: int,
17
+ num_body_joints: int,
18
+ # img_size: Tuple[int, int],
19
+ # patch_resolution: Tuple[int, int],
20
+ frozen: bool = False,
21
+ mask_embed_type: Optional[str] = None,
22
+ ) -> None:
23
+ """
24
+ Encodes prompts for input to SAM's mask decoder.
25
+
26
+ Arguments:
27
+ embed_dim (int): The prompts' embedding dimension
28
+ num_body_joints (int): The number of body joints
29
+ img_size (Tuple): The padded size of the image as input
30
+ to the image encoder, as (H, W).
31
+ patch_resolution (Tuple): image patch size, as (H, W)
32
+ """
33
+ super().__init__()
34
+ self.embed_dim = embed_dim
35
+ self.num_body_joints = num_body_joints
36
+ # self.img_size = img_size
37
+ # self.patch_resolution = patch_resolution
38
+
39
+ # Keypoint prompts
40
+ self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)
41
+ self.point_embeddings = nn.ModuleList(
42
+ [nn.Embedding(1, embed_dim) for _ in range(self.num_body_joints)]
43
+ )
44
+ self.not_a_point_embed = nn.Embedding(1, embed_dim)
45
+ self.invalid_point_embed = nn.Embedding(1, embed_dim)
46
+
47
+ # Mask prompt
48
+ if mask_embed_type in ["v1"]:
49
+ mask_in_chans = 16 # SAM2
50
+ self.mask_downscaling = nn.Sequential(
51
+ nn.Conv2d(1, mask_in_chans // 4, kernel_size=4, stride=4),
52
+ LayerNorm2d(mask_in_chans // 4),
53
+ nn.GELU(),
54
+ nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=4, stride=4),
55
+ LayerNorm2d(mask_in_chans),
56
+ nn.GELU(),
57
+ nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
58
+ )
59
+ elif mask_embed_type in ["v2"]:
60
+ mask_in_chans = 256
61
+ self.mask_downscaling = nn.Sequential(
62
+ nn.Conv2d(1, mask_in_chans // 64, kernel_size=2, stride=2),
63
+ LayerNorm2d(mask_in_chans // 64),
64
+ nn.GELU(),
65
+ nn.Conv2d(
66
+ mask_in_chans // 64,
67
+ mask_in_chans // 16,
68
+ kernel_size=2,
69
+ stride=2,
70
+ ),
71
+ LayerNorm2d(mask_in_chans // 16),
72
+ nn.GELU(),
73
+ nn.Conv2d(
74
+ mask_in_chans // 16, mask_in_chans // 4, kernel_size=2, stride=2
75
+ ),
76
+ LayerNorm2d(mask_in_chans // 4),
77
+ nn.GELU(),
78
+ nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2),
79
+ LayerNorm2d(mask_in_chans),
80
+ nn.GELU(),
81
+ nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
82
+ )
83
+ else:
84
+ assert mask_embed_type is None
85
+
86
+ if mask_embed_type is not None:
87
+ # Zero-initialize the last conv layer as gating
88
+ nn.init.zeros_(self.mask_downscaling[-1].weight)
89
+ nn.init.zeros_(self.mask_downscaling[-1].bias)
90
+
91
+ self.no_mask_embed = nn.Embedding(1, embed_dim)
92
+ nn.init.zeros_(self.no_mask_embed.weight)
93
+
94
+ self.frozen = frozen
95
+ self._freeze_stages()
96
+
97
+ def get_dense_pe(self, size: Tuple[int, int]) -> torch.Tensor:
98
+ """
99
+ Returns the positional encoding used to encode point prompts,
100
+ applied to a dense set of points the shape of the image encoding.
101
+
102
+ Returns:
103
+ torch.Tensor: Positional encoding with shape
104
+ 1x(embed_dim)x(embedding_h)x(embedding_w)
105
+ """
106
+ return self.pe_layer(size).unsqueeze(0)
107
+
108
+ def _embed_keypoints(
109
+ self,
110
+ points: torch.Tensor,
111
+ labels: torch.Tensor,
112
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
113
+ """
114
+ Embeds point prompts.
115
+ Assuming points have been normalized to [0, 1].
116
+
117
+ Output shape [B, N, C], mask shape [B, N]
118
+ """
119
+ assert points.min() >= 0 and points.max() <= 1
120
+ point_embedding = self.pe_layer._pe_encoding(points.to(torch.float))
121
+ point_embedding[labels == -2] = 0.0 # invalid points
122
+ point_embedding[labels == -2] += self.invalid_point_embed.weight
123
+ point_embedding[labels == -1] = 0.0
124
+ point_embedding[labels == -1] += self.not_a_point_embed.weight
125
+ for i in range(self.num_body_joints):
126
+ point_embedding[labels == i] += self.point_embeddings[i].weight
127
+
128
+ point_mask = labels > -2
129
+ return point_embedding, point_mask
130
+
131
+ def _get_batch_size(
132
+ self,
133
+ keypoints: Optional[torch.Tensor],
134
+ boxes: Optional[torch.Tensor],
135
+ masks: Optional[torch.Tensor],
136
+ ) -> int:
137
+ """
138
+ Gets the batch size of the output given the batch size of the input prompts.
139
+ """
140
+ if keypoints is not None:
141
+ return keypoints.shape[0]
142
+ elif boxes is not None:
143
+ return boxes.shape[0]
144
+ elif masks is not None:
145
+ return masks.shape[0]
146
+ else:
147
+ return 1
148
+
149
+ def _get_device(self) -> torch.device:
150
+ return self.point_embeddings[0].weight.device
151
+
152
+ def forward(
153
+ self,
154
+ keypoints: Optional[torch.Tensor],
155
+ boxes: Optional[torch.Tensor] = None,
156
+ masks: Optional[torch.Tensor] = None,
157
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
158
+ """
159
+ Embeds different types of prompts, returning both sparse and dense
160
+ embeddings.
161
+
162
+ Arguments:
163
+ keypoints (torch.Tensor or none): point coordinates and labels to embed.
164
+ boxes (torch.Tensor or none): boxes to embed
165
+ masks (torch.Tensor or none): masks to embed
166
+
167
+ Returns:
168
+ torch.Tensor: sparse embeddings for the points and boxes, with shape
169
+ BxNx(embed_dim), where N is determined by the number of input points
170
+ and boxes.
171
+ torch.Tensor: dense embeddings for the masks, in the shape
172
+ Bx(embed_dim)x(embed_H)x(embed_W)
173
+ """
174
+ bs = self._get_batch_size(keypoints, boxes, masks)
175
+ sparse_embeddings = torch.empty(
176
+ (bs, 0, self.embed_dim), device=self._get_device()
177
+ )
178
+ sparse_masks = torch.empty((bs, 0), device=self._get_device())
179
+ if keypoints is not None:
180
+ coords = keypoints[:, :, :2]
181
+ labels = keypoints[:, :, -1]
182
+ point_embeddings, point_mask = self._embed_keypoints(
183
+ coords, labels
184
+ ) # pad=(boxes is None))
185
+ sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1)
186
+ sparse_masks = torch.cat([sparse_masks, point_mask], dim=1)
187
+
188
+ return sparse_embeddings, sparse_masks
189
+
190
+ def get_mask_embeddings(
191
+ self,
192
+ masks: Optional[torch.Tensor] = None,
193
+ bs: int = 1,
194
+ size: Tuple[int, int] = (16, 16), # [H, W]
195
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
196
+ """Embeds mask inputs."""
197
+ no_mask_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
198
+ bs, -1, size[0], size[1]
199
+ )
200
+ if masks is not None:
201
+ mask_embeddings = self.mask_downscaling(masks)
202
+ else:
203
+ mask_embeddings = no_mask_embeddings
204
+ return mask_embeddings, no_mask_embeddings
205
+
206
+ def _freeze_stages(self):
207
+ """Freeze parameters."""
208
+ if self.frozen:
209
+ for param in self.parameters():
210
+ param.requires_grad = False
211
+
212
+
213
+ class PositionEmbeddingRandom(nn.Module):
214
+ """
215
+ Positional encoding using random spatial frequencies.
216
+ """
217
+
218
+ def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
219
+ super().__init__()
220
+ if scale is None or scale <= 0.0:
221
+ scale = 1.0
222
+ self.register_buffer(
223
+ "positional_encoding_gaussian_matrix",
224
+ scale * torch.randn((2, num_pos_feats)),
225
+ )
226
+
227
+ def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
228
+ """Positionally encode points that are normalized to [0,1]."""
229
+ # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
230
+ coords = 2 * coords - 1
231
+ coords = coords @ self.positional_encoding_gaussian_matrix
232
+ coords = 2 * np.pi * coords
233
+ # outputs d_1 x ... x d_n x C shape
234
+ return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
235
+
236
+ def forward(self, size: Tuple[int, int]) -> torch.Tensor:
237
+ """Generate positional encoding for a grid of the specified size."""
238
+ h, w = size
239
+ device: Any = self.positional_encoding_gaussian_matrix.device
240
+ grid = torch.ones((h, w), device=device, dtype=torch.float32)
241
+ y_embed = grid.cumsum(dim=0) - 0.5
242
+ x_embed = grid.cumsum(dim=1) - 0.5
243
+ y_embed = y_embed / h
244
+ x_embed = x_embed / w
245
+
246
+ pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
247
+ return pe.permute(2, 0, 1) # C x H x W
248
+
249
+ def forward_with_coords(
250
+ self, coords_input: torch.Tensor, image_size: Tuple[int, int]
251
+ ) -> torch.Tensor:
252
+ """Positionally encode points that are not normalized to [0,1]."""
253
+ coords = coords_input.clone()
254
+ coords[:, :, 0] = coords[:, :, 0] / image_size[1]
255
+ coords[:, :, 1] = coords[:, :, 1] / image_size[0]
256
+ return self._pe_encoding(coords.to(torch.float)) # B x N x C
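For reference, a minimal standalone sketch of the random-Fourier-feature encoding that PositionEmbeddingRandom implements above; the feature size and the sample point here are illustrative, not values taken from the config.

import math
import torch

num_pos_feats = 4                                    # the class above defaults to 64
gauss = torch.randn(2, num_pos_feats)                # random spatial frequencies, one row per coordinate

coords = torch.tensor([[0.25, 0.75]])                # a single (x, y) point normalized to [0, 1]
mapped = (2 * coords - 1) @ gauss                    # rescale to [-1, 1], then project onto the frequencies
mapped = 2 * math.pi * mapped
pe = torch.cat([mapped.sin(), mapped.cos()], dim=-1)
print(pe.shape)                                      # torch.Size([1, 8]) == (N, 2 * num_pos_feats)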
src/sam3d_body/models/decoders/promptable_decoder.py ADDED
@@ -0,0 +1,194 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import pickle
4
+ from typing import Dict, Optional
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from ..modules.transformer import build_norm_layer, TransformerDecoderLayer
10
+
11
+
12
+ class PromptableDecoder(nn.Module):
13
+ """Cross-attention-based Transformer decoder with prompt inputs.
14
+
15
+ Args:
16
+ token_dims (int): The dimension of input pose tokens.
17
+ prompt_dims (int): The dimension of input prompt tokens.
18
+ context_dims (int): The dimension of image context features.
19
+ dims (int): The projected dimension of all tokens in the decoder.
20
+ depth (int): The number of layers for Transformer decoder.
21
+ num_heads (int): The number of heads for multi-head attention.
22
+ head_dims (int): The dimension of each head.
23
+ mlp_dims (int): The dimension of hidden layers in MLP.
24
+ layer_scale_init_value (float or torch.Tensor): Init value of layer
25
+ scale. Defaults to 0.
26
+ drop_rate (float): Probability of an element to be zeroed
27
+ after the feed forward layer. Defaults to 0.
28
+ attn_drop_rate (float): The drop out rate for attention output weights.
29
+ Defaults to 0.
30
+ drop_path_rate (float): Stochastic depth rate. Defaults to 0.
31
+ ffn_type (str): Select the type of ffn layers. Defaults to 'origin'.
32
+ act_layer (nn.Module, optional): The activation layer for FFNs.
33
+ Default: nn.GELU
34
+ norm_cfg (dict): Config dict for normalization layer.
35
+ Defaults to ``dict(type='LN')``.
36
+ enable_twoway (bool): Whether to enable two-way Transformer (used in SAM).
37
+ repeat_pe (bool): Whether to re-add PE at each layer (used in SAM)
38
+ """
39
+
40
+ def __init__(
41
+ self,
42
+ dims: int,
43
+ context_dims: int,
44
+ depth: int,
45
+ num_heads: int = 8,
46
+ head_dims: int = 64,
47
+ mlp_dims: int = 1024,
48
+ layer_scale_init_value: float = 0.0,
49
+ drop_rate: float = 0.0,
50
+ attn_drop_rate: float = 0.0,
51
+ drop_path_rate: float = 0.0,
52
+ ffn_type: str = "origin",
53
+ act_layer: nn.Module = nn.GELU,
54
+ norm_cfg: Dict = dict(type="LN", eps=1e-6),
55
+ enable_twoway: bool = False,
56
+ repeat_pe: bool = False,
57
+ frozen: bool = False,
58
+ do_interm_preds: bool = False,
59
+ do_keypoint_tokens: bool = False,
60
+ keypoint_token_update: bool | str = False,
61
+ ):
62
+ super().__init__()
63
+
64
+ self.layers = nn.ModuleList()
65
+ for i in range(depth):
66
+ self.layers.append(
67
+ TransformerDecoderLayer(
68
+ token_dims=dims,
69
+ context_dims=context_dims,
70
+ num_heads=num_heads,
71
+ head_dims=head_dims,
72
+ mlp_dims=mlp_dims,
73
+ layer_scale_init_value=layer_scale_init_value,
74
+ drop_rate=drop_rate,
75
+ attn_drop_rate=attn_drop_rate,
76
+ drop_path_rate=drop_path_rate,
77
+ ffn_type=ffn_type,
78
+ act_layer=act_layer,
79
+ norm_cfg=norm_cfg,
80
+ enable_twoway=enable_twoway,
81
+ repeat_pe=repeat_pe,
82
+ skip_first_pe=(i == 0),
83
+ )
84
+ )
85
+
86
+ self.norm_final = build_norm_layer(norm_cfg, dims)
87
+ self.do_interm_preds = do_interm_preds
88
+ self.do_keypoint_tokens = do_keypoint_tokens
89
+ self.keypoint_token_update = keypoint_token_update
90
+
91
+ self.frozen = frozen
92
+ self._freeze_stages()
93
+
94
+ def forward(
95
+ self,
96
+ token_embedding: torch.Tensor,
97
+ image_embedding: torch.Tensor,
98
+ token_augment: Optional[torch.Tensor] = None,
99
+ image_augment: Optional[torch.Tensor] = None,
100
+ token_mask: Optional[torch.Tensor] = None,
101
+ channel_first: bool = True,
102
+ token_to_pose_output_fn=None,
103
+ keypoint_token_update_fn=None,
104
+ hand_embeddings=None,
105
+ hand_augment=None,
106
+ ):
107
+ """
108
+ Args:
109
+ token_embedding: [B, N, C]
110
+ image_embedding: [B, C, H, W]
111
+ """
112
+ if channel_first:
113
+ image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
114
+ if image_augment is not None:
115
+ image_augment = image_augment.flatten(2).permute(0, 2, 1)
116
+ if hand_embeddings is not None:
117
+ hand_embeddings = hand_embeddings.flatten(2).permute(0, 2, 1)
118
+ hand_augment = hand_augment.flatten(2).permute(0, 2, 1)
119
+ if len(hand_augment) == 1:
120
+ # inflate batch dimension
121
+ assert len(hand_augment.shape) == 3
122
+ hand_augment = hand_augment.repeat(len(hand_embeddings), 1, 1)
123
+
124
+ if self.do_interm_preds:
125
+ assert token_to_pose_output_fn is not None
126
+ all_pose_outputs = []
127
+
128
+ for layer_idx, layer in enumerate(self.layers):
129
+ if hand_embeddings is None:
130
+ token_embedding, image_embedding = layer(
131
+ token_embedding,
132
+ image_embedding,
133
+ token_augment,
134
+ image_augment,
135
+ token_mask,
136
+ )
137
+ else:
138
+ token_embedding, image_embedding = layer(
139
+ token_embedding,
140
+ torch.cat([image_embedding, hand_embeddings], dim=1),
141
+ token_augment,
142
+ torch.cat([image_augment, hand_augment], dim=1),
143
+ token_mask,
144
+ )
145
+ image_embedding = image_embedding[:, : image_augment.shape[1]]
146
+
147
+ if self.do_interm_preds and layer_idx < len(self.layers) - 1:
148
+ curr_pose_output = token_to_pose_output_fn(
149
+ self.norm_final(token_embedding),
150
+ prev_pose_output=(
151
+ all_pose_outputs[-1] if len(all_pose_outputs) > 0 else None
152
+ ),
153
+ layer_idx=layer_idx,
154
+ )
155
+ all_pose_outputs.append(curr_pose_output)
156
+
157
+ if self.keypoint_token_update:
158
+ assert keypoint_token_update_fn is not None
159
+ token_embedding, token_augment, _, _ = keypoint_token_update_fn(
160
+ token_embedding, token_augment, curr_pose_output, layer_idx
161
+ )
162
+
163
+ out = self.norm_final(token_embedding)
164
+
165
+ if self.do_interm_preds:
166
+ curr_pose_output = token_to_pose_output_fn(
167
+ out,
168
+ prev_pose_output=(
169
+ all_pose_outputs[-1] if len(all_pose_outputs) > 0 else None
170
+ ),
171
+ layer_idx=layer_idx,
172
+ )
173
+ all_pose_outputs.append(curr_pose_output)
174
+
175
+ return out, all_pose_outputs
176
+ else:
177
+ return out
178
+
179
+ def _freeze_stages(self):
180
+ """Freeze parameters."""
181
+ if self.frozen:
182
+ for layer in self.layers:
183
+ layer.eval()
184
+ self.norm_final.eval()
185
+ for param in self.parameters():
186
+ param.requires_grad = False
187
+
188
+ def train(self, mode=True):
189
+ """
190
+ Convert the model into training mode.
191
+ (Note: not actually invoked by Lightning during trainer.fit().)
192
+ """
193
+ super().train(mode)
194
+ self._freeze_stages()
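A rough usage sketch of the decoder above, assuming the sam3d_body package is importable and that TransformerDecoderLayer accepts the default None positional augments; all shapes are illustrative.

import torch
from sam3d_body.models.decoders.promptable_decoder import PromptableDecoder

decoder = PromptableDecoder(dims=1024, context_dims=1280, depth=2)
tokens = torch.randn(2, 3, 1024)       # [B, N, C] pose / prompt tokens
feats = torch.randn(2, 1280, 16, 16)   # [B, C, H, W] backbone features
out = decoder(tokens, feats)           # [B, N, 1024] refined tokens (do_interm_preds is False here)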
src/sam3d_body/models/heads/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from ..modules import to_2tuple
4
+ from .camera_head import PerspectiveHead
5
+ from .mhr_head import MHRHead
6
+
7
+
8
+ def build_head(cfg, head_type="mhr", enable_hand_model=False, default_scale_factor=1.0):
9
+ if head_type == "mhr":
10
+ return MHRHead(
11
+ input_dim=cfg.MODEL.DECODER.DIM,
12
+ mlp_depth=cfg.MODEL.MHR_HEAD.get("MLP_DEPTH", 1),
13
+ mhr_model_path=cfg.MODEL.MHR_HEAD.MHR_MODEL_PATH,
14
+ mlp_channel_div_factor=cfg.MODEL.MHR_HEAD.get("MLP_CHANNEL_DIV_FACTOR", 1),
15
+ enable_hand_model=enable_hand_model,
16
+ )
17
+ elif head_type == "perspective":
18
+ return PerspectiveHead(
19
+ input_dim=cfg.MODEL.DECODER.DIM,
20
+ img_size=to_2tuple(cfg.MODEL.IMAGE_SIZE),
21
+ mlp_depth=cfg.MODEL.get("CAMERA_HEAD", dict()).get("MLP_DEPTH", 1),
22
+ mlp_channel_div_factor=cfg.MODEL.get("CAMERA_HEAD", dict()).get(
23
+ "MLP_CHANNEL_DIV_FACTOR", 1
24
+ ),
25
+ default_scale_factor=default_scale_factor,
26
+ )
27
+ else:
28
+ raise ValueError("Invalid head type: ", head_type)
src/sam3d_body/models/heads/camera_head.py ADDED
@@ -0,0 +1,110 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from typing import Optional, Sequence, Tuple
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from sam3d_body.models.modules.geometry_utils import perspective_projection
9
+
10
+ from ..modules import get_intrinsic_matrix, to_2tuple
11
+ from ..modules.transformer import FFN
12
+
13
+
14
+ class PerspectiveHead(nn.Module):
15
+ """
16
+ Predict camera translation (s, tx, ty) and perform full-perspective
17
+ 2D reprojection (CLIFF/CameraHMR setup).
18
+ """
19
+
20
+ def __init__(
21
+ self,
22
+ input_dim: int,
23
+ img_size: Tuple[int, int] | Sequence[int], # model input size (W, H)
24
+ mlp_depth: int = 1,
25
+ drop_ratio: float = 0.0,
26
+ mlp_channel_div_factor: int = 8,
27
+ default_scale_factor: float | int = 1,
28
+ ):
29
+ super().__init__()
30
+
31
+ # Metadata to compute 3D skeleton and 2D reprojection
32
+ self.img_size = to_2tuple(img_size)
33
+ self.ncam = 3 # (s, tx, ty)
34
+ self.default_scale_factor = default_scale_factor
35
+
36
+ self.proj = FFN(
37
+ embed_dims=input_dim,
38
+ feedforward_channels=input_dim // mlp_channel_div_factor,
39
+ output_dims=self.ncam,
40
+ num_fcs=mlp_depth,
41
+ ffn_drop=drop_ratio,
42
+ add_identity=False,
43
+ )
44
+
45
+ def forward(
46
+ self,
47
+ x: torch.Tensor,
48
+ init_estimate: Optional[torch.Tensor] = None,
49
+ ):
50
+ """
51
+ Args:
52
+ x: pose token with shape [B, C], usually C=DECODER.DIM
53
+ init_estimate: [B, self.ncam]
54
+ """
55
+ pred_cam = self.proj(x)
56
+ if init_estimate is not None:
57
+ pred_cam = pred_cam + init_estimate
58
+
59
+ return pred_cam
60
+
61
+ def perspective_projection(
62
+ self,
63
+ points_3d: torch.Tensor,
64
+ pred_cam: torch.Tensor,
65
+ bbox_center: torch.Tensor,
66
+ bbox_size: torch.Tensor,
67
+ img_size: torch.Tensor,
68
+ cam_int: torch.Tensor,
69
+ use_intrin_center: bool = False,
70
+ ):
71
+ """
72
+ Args:
73
+ bbox_center / img_size: shape [N, 2], in original image space (w, h)
74
+ bbox_size: shape [N,], in original image space
75
+ cam_int: shape [N, 3, 3]
76
+ """
77
+ batch_size = points_3d.shape[0]
78
+ pred_cam = pred_cam.clone()
79
+ pred_cam[..., [0, 2]] *= -1 # Camera system difference
80
+
81
+ # Compute camera translation: (scale, x, y) --> (x, y, depth)
82
+ # depth ~= f / s
83
+ # Note that f is in the NDC space (see Zolly section 3.1)
84
+ s, tx, ty = pred_cam[:, 0], pred_cam[:, 1], pred_cam[:, 2]
85
+ bs = bbox_size * s * self.default_scale_factor + 1e-8
86
+ focal_length = cam_int[:, 0, 0]
87
+ tz = 2 * focal_length / bs
88
+
89
+ if not use_intrin_center:
90
+ cx = 2 * (bbox_center[:, 0] - (img_size[:, 0] / 2)) / bs
91
+ cy = 2 * (bbox_center[:, 1] - (img_size[:, 1] / 2)) / bs
92
+ else:
93
+ cx = 2 * (bbox_center[:, 0] - (cam_int[:, 0, 2])) / bs
94
+ cy = 2 * (bbox_center[:, 1] - (cam_int[:, 1, 2])) / bs
95
+
96
+ pred_cam_t = torch.stack([tx + cx, ty + cy, tz], dim=-1)
97
+
98
+ # Compute camera translation
99
+ j3d_cam = points_3d + pred_cam_t.unsqueeze(1)
100
+
101
+ # Projection to the image plane.
102
+ # Note that the projection output is in *original* image space now.
103
+ j2d = perspective_projection(j3d_cam, cam_int)
104
+
105
+ return {
106
+ "pred_keypoints_2d": j2d.reshape(batch_size, -1, 2),
107
+ "pred_cam_t": pred_cam_t,
108
+ "focal_length": focal_length,
109
+ "pred_keypoints_2d_depth": j3d_cam.reshape(batch_size, -1, 3)[:, :, 2],
110
+ }
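A small numeric sketch of the depth recovery used in perspective_projection above (tz = 2 * f / (s * b)); the focal length, box size, and scale below are made-up numbers for illustration.

import torch

focal_length = torch.tensor([1500.0])   # cam_int[:, 0, 0], in pixels
bbox_size = torch.tensor([300.0])       # crop size in the original image, in pixels
s = torch.tensor([0.9])                 # predicted scale component of (s, tx, ty)

bs = bbox_size * s                      # effective box scale (default_scale_factor = 1)
tz = 2 * focal_length / bs              # depth of the person along the camera axis
print(tz)                               # tensor([11.1111])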
src/sam3d_body/models/heads/mhr_head.py ADDED
@@ -0,0 +1,369 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import os
4
+ import warnings
5
+ from typing import Optional
6
+
7
+ import roma
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ from ..modules import rot6d_to_rotmat
12
+ from ..modules.mhr_utils import (
13
+ compact_cont_to_model_params_body,
14
+ compact_cont_to_model_params_hand,
15
+ compact_model_params_to_cont_body,
16
+ mhr_param_hand_mask,
17
+ )
18
+
19
+ from ..modules.transformer import FFN
20
+
21
+ MOMENTUM_ENABLED = os.environ.get("MOMENTUM_ENABLED") is None
22
+ try:
23
+ if MOMENTUM_ENABLED:
24
+ from mhr.mhr import MHR
25
+
26
+ MOMENTUM_ENABLED = True
27
+ warnings.warn("Momentum is enabled")
28
+ else:
29
+ warnings.warn("Momentum is not enabled")
30
+ raise ImportError
31
+ except Exception:
32
+ MOMENTUM_ENABLED = False
33
+ warnings.warn("Momentum is not enabled")
34
+
35
+
36
+ class MHRHead(nn.Module):
37
+
38
+ def __init__(
39
+ self,
40
+ input_dim: int,
41
+ mlp_depth: int = 1,
42
+ mhr_model_path: str = "",
43
+ extra_joint_regressor: str = "",
44
+ ffn_zero_bias: bool = True,
45
+ mlp_channel_div_factor: int = 8,
46
+ enable_hand_model=False,
47
+ ):
48
+ super().__init__()
49
+
50
+ self.num_shape_comps = 45
51
+ self.num_scale_comps = 28
52
+ self.num_hand_comps = 54
53
+ self.num_face_comps = 72
54
+ self.enable_hand_model = enable_hand_model
55
+
56
+ self.body_cont_dim = 260
57
+ self.npose = (
58
+ 6 # Global Rotation
59
+ + self.body_cont_dim # then body
60
+ + self.num_shape_comps
61
+ + self.num_scale_comps
62
+ + self.num_hand_comps * 2
63
+ + self.num_face_comps
64
+ )
65
+
66
+ self.proj = FFN(
67
+ embed_dims=input_dim,
68
+ feedforward_channels=input_dim // mlp_channel_div_factor,
69
+ output_dims=self.npose,
70
+ num_fcs=mlp_depth,
71
+ ffn_drop=0.0,
72
+ add_identity=False,
73
+ )
74
+
75
+ if ffn_zero_bias:
76
+ torch.nn.init.zeros_(self.proj.layers[-2].bias)
77
+
78
+ # MHR Parameters
79
+ self.model_data_dir = mhr_model_path
80
+ self.num_hand_scale_comps = self.num_scale_comps - 18
81
+ self.num_hand_pose_comps = self.num_hand_comps
82
+
83
+ # Buffers to be filled in by model state dict
84
+ self.joint_rotation = nn.Parameter(torch.zeros(127, 3, 3), requires_grad=False)
85
+ self.scale_mean = nn.Parameter(torch.zeros(68), requires_grad=False)
86
+ self.scale_comps = nn.Parameter(torch.zeros(28, 68), requires_grad=False)
87
+ self.faces = nn.Parameter(torch.zeros(36874, 3).long(), requires_grad=False)
88
+ self.hand_pose_mean = nn.Parameter(torch.zeros(54), requires_grad=False)
89
+ self.hand_pose_comps = nn.Parameter(torch.eye(54), requires_grad=False)
90
+ self.hand_joint_idxs_left = nn.Parameter(
91
+ torch.zeros(27).long(), requires_grad=False
92
+ )
93
+ self.hand_joint_idxs_right = nn.Parameter(
94
+ torch.zeros(27).long(), requires_grad=False
95
+ )
96
+ self.keypoint_mapping = nn.Parameter(
97
+ torch.zeros(308, 18439 + 127), requires_grad=False
98
+ )
99
+ # Some special buffers for the hand-version
100
+ self.right_wrist_coords = nn.Parameter(torch.zeros(3), requires_grad=False)
101
+ self.root_coords = nn.Parameter(torch.zeros(3), requires_grad=False)
102
+ self.local_to_world_wrist = nn.Parameter(torch.zeros(3, 3), requires_grad=False)
103
+ self.nonhand_param_idxs = nn.Parameter(
104
+ torch.zeros(145).long(), requires_grad=False
105
+ )
106
+
107
+ # Load MHR itself
108
+ if MOMENTUM_ENABLED:
109
+ self.mhr = MHR.from_files(
110
+ device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
111
+ lod=1,
112
+ )
113
+ else:
114
+ self.mhr = torch.jit.load(
115
+ mhr_model_path,
116
+ map_location=("cuda" if torch.cuda.is_available() else "cpu"),
117
+ )
118
+
119
+ for param in self.mhr.parameters():
120
+ param.requires_grad = False
121
+
122
+ def get_zero_pose_init(self, factor=1.0):
123
+ # Initialize pose token with zero-initialized learnable params
124
+ # Note: bias/initial value should be zero-pose in cont, not all-zeros
125
+ weights = torch.zeros(1, self.npose)
126
+ weights[:, : 6 + self.body_cont_dim] = torch.cat(
127
+ [
128
+ torch.FloatTensor([1, 0, 0, 0, 1, 0]),
129
+ compact_model_params_to_cont_body(torch.zeros(1, 133)).squeeze()
130
+ * factor,
131
+ ],
132
+ dim=0,
133
+ )
134
+ return weights
135
+
136
+ def replace_hands_in_pose(self, full_pose_params, hand_pose_params):
137
+ assert full_pose_params.shape[1] == 136
138
+
139
+ # This drops in the hand poses from hand_pose_params (PCA 6D) into full_pose_params.
140
+ # Split into left and right hands
141
+ left_hand_params, right_hand_params = torch.split(
142
+ hand_pose_params,
143
+ [self.num_hand_pose_comps, self.num_hand_pose_comps],
144
+ dim=1,
145
+ )
146
+
147
+ # Change from cont to model params
148
+ left_hand_params_model_params = compact_cont_to_model_params_hand(
149
+ self.hand_pose_mean
150
+ + torch.einsum("da,ab->db", left_hand_params, self.hand_pose_comps)
151
+ )
152
+ right_hand_params_model_params = compact_cont_to_model_params_hand(
153
+ self.hand_pose_mean
154
+ + torch.einsum("da,ab->db", right_hand_params, self.hand_pose_comps)
155
+ )
156
+
157
+ # Drop it in
158
+ full_pose_params[:, self.hand_joint_idxs_left] = left_hand_params_model_params
159
+ full_pose_params[:, self.hand_joint_idxs_right] = right_hand_params_model_params
160
+
161
+ return full_pose_params # B x 207
162
+
163
+ def mhr_forward(
164
+ self,
165
+ global_trans,
166
+ global_rot,
167
+ body_pose_params,
168
+ hand_pose_params,
169
+ scale_params,
170
+ shape_params,
171
+ expr_params=None,
172
+ return_keypoints=False,
173
+ do_pcblend=True,
174
+ return_joint_coords=False,
175
+ return_model_params=False,
176
+ return_joint_rotations=False,
177
+ scale_offsets=None,
178
+ vertex_offsets=None,
179
+ ):
180
+
181
+ if self.enable_hand_model:
182
+ # Transfer wrist-centric predictions to the body.
183
+ global_rot_ori = global_rot.clone()
184
+ global_trans_ori = global_trans.clone()
185
+ global_rot = roma.rotmat_to_euler(
186
+ "xyz",
187
+ roma.euler_to_rotmat("xyz", global_rot_ori) @ self.local_to_world_wrist,
188
+ )
189
+ global_trans = (
190
+ -(
191
+ roma.euler_to_rotmat("xyz", global_rot)
192
+ @ (self.right_wrist_coords - self.root_coords)
193
+ + self.root_coords
194
+ )
195
+ + global_trans_ori
196
+ )
197
+
198
+ body_pose_params = body_pose_params[..., :130]
199
+
200
+ # Convert from scale and shape params to actual scales and vertices
201
+ ## Add singleton batches in case...
202
+ if len(scale_params.shape) == 1:
203
+ scale_params = scale_params[None]
204
+ if len(shape_params.shape) == 1:
205
+ shape_params = shape_params[None]
206
+ ## Convert scale...
207
+ scales = self.scale_mean[None, :] + scale_params @ self.scale_comps
208
+ if scale_offsets is not None:
209
+ scales = scales + scale_offsets
210
+
211
+ # Now, figure out the pose.
212
+ ## 10 here is because it's more stable to optimize global translation in meters.
213
+ full_pose_params = torch.cat(
214
+ [global_trans * 10, global_rot, body_pose_params], dim=1
215
+ ) # B x 127
216
+ ## Put in hands
217
+ if hand_pose_params is not None:
218
+ full_pose_params = self.replace_hands_in_pose(
219
+ full_pose_params, hand_pose_params
220
+ )
221
+ model_params = torch.cat([full_pose_params, scales], dim=1)
222
+
223
+ if self.enable_hand_model:
224
+ # Zero out non-hand parameters
225
+ model_params[:, self.nonhand_param_idxs] = 0
226
+
227
+ curr_skinned_verts, curr_skel_state = self.mhr(
228
+ shape_params, model_params, expr_params
229
+ )
230
+ curr_joint_coords, curr_joint_quats, _ = torch.split(
231
+ curr_skel_state, [3, 4, 1], dim=2
232
+ )
233
+ curr_skinned_verts = curr_skinned_verts / 100
234
+ curr_joint_coords = curr_joint_coords / 100
235
+ curr_joint_rots = roma.unitquat_to_rotmat(curr_joint_quats)
236
+
237
+ # Prepare returns
238
+ to_return = [curr_skinned_verts]
239
+ if return_keypoints:
240
+ # Get sapiens 308 keypoints
241
+ model_vert_joints = torch.cat(
242
+ [curr_skinned_verts, curr_joint_coords], dim=1
243
+ ) # B x (num_verts + 127) x 3
244
+ model_keypoints_pred = (
245
+ (
246
+ self.keypoint_mapping
247
+ @ model_vert_joints.permute(1, 0, 2).flatten(1, 2)
248
+ )
249
+ .reshape(-1, model_vert_joints.shape[0], 3)
250
+ .permute(1, 0, 2)
251
+ )
252
+
253
+ if self.enable_hand_model:
254
+ # Zero out everything except for the right hand
255
+ model_keypoints_pred[:, :21] = 0
256
+ model_keypoints_pred[:, 42:] = 0
257
+
258
+ to_return = to_return + [model_keypoints_pred]
259
+ if return_joint_coords:
260
+ to_return = to_return + [curr_joint_coords]
261
+ if return_model_params:
262
+ to_return = to_return + [model_params]
263
+ if return_joint_rotations:
264
+ to_return = to_return + [curr_joint_rots]
265
+
266
+ if isinstance(to_return, list) and len(to_return) == 1:
267
+ return to_return[0]
268
+ else:
269
+ return tuple(to_return)
270
+
271
+ def forward(
272
+ self,
273
+ x: torch.Tensor,
274
+ init_estimate: Optional[torch.Tensor] = None,
275
+ do_pcblend=True,
276
+ slim_keypoints=False,
277
+ ):
278
+ """
279
+ Args:
280
+ x: pose token with shape [B, C], usually C=DECODER.DIM
281
+ init_estimate: [B, self.npose]
282
+ """
283
+ batch_size = x.shape[0]
284
+ pred = self.proj(x)
285
+ if init_estimate is not None:
286
+ pred = pred + init_estimate
287
+
288
+ # From pred, we want to pull out individual predictions.
289
+
290
+ ## First, get globals
291
+ ### Global rotation is first 6.
292
+ count = 6
293
+ global_rot_6d = pred[:, :count]
294
+ global_rot_rotmat = rot6d_to_rotmat(global_rot_6d) # B x 3 x 3
295
+ global_rot_euler = roma.rotmat_to_euler("ZYX", global_rot_rotmat) # B x 3
296
+ global_trans = torch.zeros_like(global_rot_euler)
297
+
298
+ ## Next, get body pose.
299
+ ### Hold onto raw, continuous version for iterative correction.
300
+ pred_pose_cont = pred[:, count : count + self.body_cont_dim]
301
+ count += self.body_cont_dim
302
+ ### Convert to eulers (and trans)
303
+ pred_pose_euler = compact_cont_to_model_params_body(pred_pose_cont)
304
+ ### Zero-out hands
305
+ pred_pose_euler[:, mhr_param_hand_mask] = 0
306
+ ### Zero-out jaw
307
+ pred_pose_euler[:, -3:] = 0
308
+
309
+ ## Get remaining parameters
310
+ pred_shape = pred[:, count : count + self.num_shape_comps]
311
+ count += self.num_shape_comps
312
+ pred_scale = pred[:, count : count + self.num_scale_comps]
313
+ count += self.num_scale_comps
314
+ pred_hand = pred[:, count : count + self.num_hand_comps * 2]
315
+ count += self.num_hand_comps * 2
316
+ pred_face = pred[:, count : count + self.num_face_comps] * 0
317
+ count += self.num_face_comps
318
+
319
+ # Run everything through mhr
320
+ output = self.mhr_forward(
321
+ global_trans=global_trans,
322
+ global_rot=global_rot_euler,
323
+ body_pose_params=pred_pose_euler,
324
+ hand_pose_params=pred_hand,
325
+ scale_params=pred_scale,
326
+ shape_params=pred_shape,
327
+ expr_params=pred_face,
328
+ do_pcblend=do_pcblend,
329
+ return_keypoints=True,
330
+ return_joint_coords=True,
331
+ return_model_params=True,
332
+ return_joint_rotations=True,
333
+ )
334
+
335
+ # Some existing code to get joints and fix camera system
336
+ verts, j3d, jcoords, mhr_model_params, joint_global_rots = output
337
+ j3d = j3d[:, :70] # 308 --> 70 keypoints
338
+
339
+ if verts is not None:
340
+ verts[..., [1, 2]] *= -1 # Camera system difference
341
+ j3d[..., [1, 2]] *= -1 # Camera system difference
342
+ if jcoords is not None:
343
+ jcoords[..., [1, 2]] *= -1
344
+
345
+ # Prep outputs
346
+ output = {
347
+ "pred_pose_raw": torch.cat(
348
+ [global_rot_6d, pred_pose_cont], dim=1
349
+ ), # Both global rot and continuous pose
350
+ "pred_pose_rotmat": None, # This normally used for mhr pose param rotmat supervision.
351
+ "global_rot": global_rot_euler,
352
+ "body_pose": pred_pose_euler, # Unused during training
353
+ "shape": pred_shape,
354
+ "scale": pred_scale,
355
+ "hand": pred_hand,
356
+ "face": pred_face,
357
+ "pred_keypoints_3d": j3d.reshape(batch_size, -1, 3),
358
+ "pred_vertices": (
359
+ verts.reshape(batch_size, -1, 3) if verts is not None else None
360
+ ),
361
+ "pred_joint_coords": (
362
+ jcoords.reshape(batch_size, -1, 3) if jcoords is not None else None
363
+ ),
364
+ "faces": self.faces.cpu().numpy(),
365
+ "joint_global_rots": joint_global_rots,
366
+ "mhr_model_params": mhr_model_params,
367
+ }
368
+
369
+ return output
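The flat MHRHead prediction is sliced in a fixed order; a small bookkeeping sketch using the component sizes from the constructor above (6 + 260 + 45 + 28 + 2*54 + 72 = 519).

sizes = [
    ("global_rot_6d", 6),
    ("body_pose_cont", 260),
    ("shape", 45),
    ("scale", 28),
    ("hands", 54 * 2),
    ("face", 72),
]
offset = 0
for name, size in sizes:
    print(f"{name}: pred[:, {offset}:{offset + size}]")
    offset += size
print("npose =", offset)  # 519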
src/sam3d_body/models/meta_arch/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from .sam3d_body import SAM3DBody
src/sam3d_body/models/meta_arch/base_lightning_module.py ADDED
@@ -0,0 +1,48 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import numpy as np
4
+ import pytorch_lightning as pl
5
+ from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
6
+
7
+
8
+ class BaseLightningModule(pl.LightningModule):
9
+ def _log_metric(self, name, value, step=None):
10
+ for logger in self.trainer.loggers:
11
+ if isinstance(logger, WandbLogger):
12
+ if step is not None:
13
+ logger.experiment.log({name: value, "step": step})
14
+ else:
15
+ logger.experiment.log({name: value})
16
+ elif isinstance(logger, TensorBoardLogger):
17
+ logger.experiment.add_scalar(name, value, step)
18
+ else:
19
+ raise ValueError(f"Unsupported logger: {logger}")
20
+
21
+ def _log_image(self, name, img_tensor, dataformats="CHW", step_count=None):
22
+ """Log image tensor to both W&B and TensorBoard."""
23
+ step = step_count if step_count is not None else self.global_step
24
+ for logger in self.trainer.loggers:
25
+ if isinstance(logger, WandbLogger):
26
+ import wandb
27
+
28
+ img = img_tensor
29
+ if dataformats.upper() == "CHW":
30
+ # If in PyTorch format (C,H,W), convert to (H,W,C) for wandb
31
+ img = img_tensor.permute(1, 2, 0).cpu().numpy()
32
+ logger.experiment.log({name: wandb.Image(img), "step": step})
33
+ elif isinstance(logger, TensorBoardLogger):
34
+ logger.experiment.add_image(
35
+ name, img_tensor, step, dataformats=dataformats
36
+ )
37
+ else:
38
+ raise ValueError(f"Unsupported logger: {logger}")
39
+
40
+ def _log_hist(self, name, array, step_count=None):
41
+ for logger in self.trainer.loggers:
42
+ if isinstance(logger, WandbLogger):
43
+ import wandb
44
+
45
+ value = wandb.Histogram(
46
+ np_histogram=(array, np.arange(array.shape[0] + 1)),
47
+ )
48
+ logger.experiment.log({name: value, "step": step_count})
src/sam3d_body/models/meta_arch/base_model.py ADDED
@@ -0,0 +1,162 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ """Define an abstract base model with a consistent input / processing / output format."""
4
+
5
+ from abc import abstractmethod
6
+ from functools import partial
7
+
8
+ import torch
9
+ from yacs.config import CfgNode
10
+
11
+ from ..optim.fp16_utils import convert_module_to_f16, convert_to_fp16_safe
12
+ from .base_lightning_module import BaseLightningModule
13
+
14
+
15
+ class BaseModel(BaseLightningModule):
16
+ def __init__(self, cfg: CfgNode | None, **kwargs):
17
+ super().__init__()
18
+
19
+ # Save hyperparameters
20
+ self.save_hyperparameters(logger=False)
21
+ self.cfg = cfg
22
+
23
+ self._initialze_model(**kwargs)
24
+
25
+ # Initialize attributes for image-based batch format
26
+ self._max_num_person = None
27
+ self._person_valid = None
28
+
29
+ @abstractmethod
30
+ def _initialze_model(self, **kwargs) -> None:
31
+ pass
32
+
33
+ def data_preprocess(
34
+ self,
35
+ inputs: torch.Tensor,
36
+ crop_width: bool = False,
37
+ is_full: bool = False, # whether for full_branch
38
+ crop_hand: int = 0,
39
+ ) -> torch.Tensor:
40
+ image_mean = self.image_mean if not is_full else self.full_image_mean
41
+ image_std = self.image_std if not is_full else self.full_image_std
42
+
43
+ if inputs.max() > 1 and image_mean.max() <= 1.0:
44
+ inputs = inputs / 255.0
45
+ elif inputs.max() <= 1.0 and image_mean.max() > 1:
46
+ inputs = inputs * 255.0
47
+ batch_inputs = (inputs - image_mean) / image_std
48
+
49
+ if crop_width:
50
+ if crop_hand > 0:
51
+ batch_inputs = batch_inputs[:, :, :, crop_hand:-crop_hand]
52
+ elif self.cfg.MODEL.BACKBONE.TYPE in [
53
+ "vit_hmr",
54
+ "vit",
55
+ ]:
56
+ # ViT backbone assumes a different aspect ratio as input size
57
+ batch_inputs = batch_inputs[:, :, :, 32:-32]
58
+ elif self.cfg.MODEL.BACKBONE.TYPE in [
59
+ "vit_hmr_512_384",
60
+ ]:
61
+ batch_inputs = batch_inputs[:, :, :, 64:-64]
62
+ else:
63
+ raise Exception
64
+
65
+ return batch_inputs
66
+
67
+ def _initialize_batch(self, batch: dict) -> None:
68
+ # Check whether the input batch is with format
69
+ # [batch_size, num_person, ...]
70
+ if batch["img"].dim() == 5:
71
+ self._batch_size, self._max_num_person = batch["img"].shape[:2]
72
+ self._person_valid = self._flatten_person(batch["person_valid"]) > 0
73
+ else:
74
+ self._batch_size = batch["img"].shape[0]
75
+ self._max_num_person = 0
76
+ self._person_valid = None
77
+
78
+ def _flatten_person(self, x: torch.Tensor) -> torch.Tensor:
79
+ assert self._max_num_person is not None, "No max_num_person initialized"
80
+
81
+ if self._max_num_person:
82
+ # Merge person crops to batch dimension
83
+ shape = x.shape
84
+ x = x.view(self._batch_size * self._max_num_person, *shape[2:])
85
+ return x
86
+
87
+ def _unflatten_person(self, x: torch.Tensor) -> torch.Tensor:
88
+ shape = x.shape
89
+ if self._max_num_person:
90
+ x = x.view(self._batch_size, self._max_num_person, *shape[1:])
91
+ return x
92
+
93
+ def _get_valid(self, x: torch.Tensor) -> torch.Tensor:
94
+ assert self._max_num_person is not None, "No max_num_person initialized"
95
+
96
+ if self._person_valid is not None:
97
+ x = x[self._person_valid]
98
+ return x
99
+
100
+ def _full_to_crop(self, batch: dict, pred_keypoints_2d: torch.Tensor) -> torch.Tensor:
101
+ """Convert full-image keypoint coordinates to crop space and normalize to [-0.5, 0.5]."""
102
+ pred_keypoints_2d_cropped = torch.cat(
103
+ [pred_keypoints_2d, torch.ones_like(pred_keypoints_2d[:, :, [-1]])], dim=-1
104
+ )
105
+ affine_trans = self._flatten_person(batch["affine_trans"]).to(pred_keypoints_2d_cropped)
106
+ img_size = self._flatten_person(batch["img_size"]).unsqueeze(1)
107
+ pred_keypoints_2d_cropped = pred_keypoints_2d_cropped @ affine_trans.mT
108
+ pred_keypoints_2d_cropped = pred_keypoints_2d_cropped[..., :2] / img_size - 0.5
109
+
110
+ return pred_keypoints_2d_cropped
111
+
112
+ def _cam_full_to_crop(
113
+ self, batch: dict, pred_cam_t: torch.Tensor, focal_length: torch.Tensor = None
114
+ ) -> torch.Tensor:
115
+ """Revert the camera translation from full to crop image space"""
116
+ num_person = batch["img"].shape[1]
117
+ cam_int = self._flatten_person(batch["cam_int"].unsqueeze(1).expand(-1, num_person, -1, -1).contiguous())
118
+ bbox_center = self._flatten_person(batch["bbox_center"])
119
+ bbox_size = self._flatten_person(batch["bbox_scale"])[:, 0]
120
+ input_size = self._flatten_person(batch["img_size"])[:, 0]
121
+
122
+ tx, ty, tz = pred_cam_t[:, 0], pred_cam_t[:, 1], pred_cam_t[:, 2]
123
+ if focal_length is None:
124
+ focal_length = cam_int[:, 0, 0]
125
+ bs = 2 * focal_length / (tz + 1e-8)
126
+
127
+ cx = 2 * (bbox_center[:, 0] - (cam_int[:, 0, 2])) / bs
128
+ cy = 2 * (bbox_center[:, 1] - (cam_int[:, 1, 2])) / bs
129
+
130
+ crop_cam_t = torch.stack([tx - cx, ty - cy, tz * bbox_size / input_size], dim=-1)
131
+ return crop_cam_t
132
+
133
+ def convert_to_fp16(self) -> torch.dtype:
134
+ """
135
+ Convert the torso of the model to float16.
136
+ """
137
+ fp16_type = torch.float16 if self.cfg.TRAIN.get("FP16_TYPE", "float16") == "float16" else torch.bfloat16
138
+
139
+ if hasattr(self, "backbone"):
140
+ self._set_fp16(self.backbone, fp16_type)
141
+ if hasattr(self, "full_encoder"):
142
+ self._set_fp16(self.full_encoder, fp16_type)
143
+
144
+ if hasattr(self.backbone, "lhand_pos_embed"):
145
+ self.backbone.lhand_pos_embed.data = self.backbone.lhand_pos_embed.data.to(fp16_type)
146
+
147
+ if hasattr(self.backbone, "rhand_pos_embed"):
148
+ self.backbone.rhand_pos_embed.data = self.backbone.rhand_pos_embed.data.to(fp16_type)
149
+
150
+ return fp16_type
151
+
152
+ def _set_fp16(self, module, fp16_type):
153
+ if hasattr(module, "pos_embed"):
154
+ module.apply(partial(convert_module_to_f16, dtype=fp16_type))
155
+ module.pos_embed.data = module.pos_embed.data.to(fp16_type)
156
+ elif hasattr(module.encoder, "rope_embed"):
157
+ # DINOv3
158
+ module.encoder.apply(partial(convert_to_fp16_safe, dtype=fp16_type))
159
+ module.encoder.rope_embed = module.encoder.rope_embed.to(fp16_type)
160
+ else:
161
+ # DINOv2
162
+ module.encoder.pos_embed.data = module.encoder.pos_embed.data.to(fp16_type)
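A minimal sketch of the full-image to crop mapping performed by _full_to_crop above, with a toy 2x3 affine; the crop size, translation, and keypoint are invented for illustration.

import torch

kp_full = torch.tensor([[[400.0, 300.0]]])      # [B, N, 2] keypoints in the original image
affine = torch.tensor([[[1.0, 0.0, -350.0],
                        [0.0, 1.0, -250.0]]])   # [B, 2, 3] full-image -> crop transform
crop_size = torch.tensor([[[256.0, 256.0]]])    # [B, 1, 2] crop resolution

kp_h = torch.cat([kp_full, torch.ones_like(kp_full[..., :1])], dim=-1)  # homogeneous coordinates
kp_crop = kp_h @ affine.mT                       # [B, N, 2] pixels in the crop -> (50, 50)
kp_norm = kp_crop / crop_size - 0.5              # normalized to roughly [-0.5, 0.5]
print(kp_norm)                                   # tensor([[[-0.3047, -0.3047]]])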
src/sam3d_body/models/meta_arch/sam3d_body.py ADDED
@@ -0,0 +1,1728 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from collections.abc import Sequence
4
+ from dataclasses import dataclass
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+ import roma
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+ from sam3d_body.data.utils.prepare_batch import prepare_batch
14
+ from sam3d_body.models.decoders.prompt_encoder import PositionEmbeddingRandom
15
+ from sam3d_body.models.modules.mhr_utils import (
16
+ fix_wrist_euler,
17
+ rotation_angle_difference,
18
+ )
19
+ from sam3d_body.utils import recursive_to
20
+ from sam3d_body.utils.logging import get_pylogger
21
+
22
+ from ..backbones import create_backbone
23
+ from ..decoders import PromptEncoder, build_decoder, build_keypoint_sampler
24
+ from ..heads import build_head
25
+ from ..modules.camera_embed import CameraEncoder
26
+ from ..modules.transformer import FFN, MLP
27
+ from .base_model import BaseModel
28
+
29
+ logger = get_pylogger(__name__)
30
+
31
+
32
+ # fmt: off
33
+ PROMPT_KEYPOINTS = { # keypoint_idx: prompt_idx
34
+ "mhr70": {
35
+ i: i for i in range(70)
36
+ }, # all 70 keypoints are supported for prompting
37
+ }
38
+ KEY_BODY = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 41, 62] # key body joints for prompting
39
+ KEY_RIGHT_HAND = list(range(21, 42))
40
+ # fmt: on
41
+
42
+
43
+ @dataclass
44
+ class BodyPredContainer:
45
+ """Structured container for main body + optional hand inference outputs."""
46
+
47
+ pose_output: dict[str, Any]
48
+ batch_lhand: dict[str, Any] | None = None
49
+ batch_rhand: dict[str, Any] | None = None
50
+ lhand_output: dict[str, Any] | None = None
51
+ rhand_output: dict[str, Any] | None = None
52
+
53
+
54
+ class SAM3DBody(BaseModel):
55
+ pelvis_idx = [9, 10] # left_hip, right_hip
56
+
57
+ def _initialze_model(self):
58
+ self.register_buffer("image_mean", torch.tensor(self.cfg.MODEL.IMAGE_MEAN).view(-1, 1, 1), False)
59
+ self.register_buffer("image_std", torch.tensor(self.cfg.MODEL.IMAGE_STD).view(-1, 1, 1), False)
60
+
61
+ # Create backbone feature extractor for human crops
62
+ self.backbone = create_backbone(self.cfg.MODEL.BACKBONE.TYPE, self.cfg)
63
+
64
+ # Create header for pose estimation output
65
+ self.head_pose = build_head(self.cfg, self.cfg.MODEL.PERSON_HEAD.POSE_TYPE)
66
+ self.head_pose.hand_pose_comps_ori = nn.Parameter(self.head_pose.hand_pose_comps.clone(), requires_grad=False)
67
+ self.head_pose.hand_pose_comps.data = torch.eye(54).to(self.head_pose.hand_pose_comps.data).float()
68
+
69
+ # Initialize pose token with learnable params
70
+ # Note: bias/initial value should be zero-pose in cont, not all-zeros
71
+ self.init_pose = nn.Embedding(1, self.head_pose.npose)
72
+
73
+ # Define header for hand pose estimation
74
+ self.head_pose_hand = build_head(self.cfg, self.cfg.MODEL.PERSON_HEAD.POSE_TYPE, enable_hand_model=True)
75
+ self.head_pose_hand.hand_pose_comps_ori = nn.Parameter(
76
+ self.head_pose_hand.hand_pose_comps.clone(), requires_grad=False
77
+ )
78
+ self.head_pose_hand.hand_pose_comps.data = torch.eye(54).to(self.head_pose_hand.hand_pose_comps.data).float()
79
+ self.init_pose_hand = nn.Embedding(1, self.head_pose_hand.npose)
80
+
81
+ self.head_camera = build_head(self.cfg, self.cfg.MODEL.PERSON_HEAD.CAMERA_TYPE)
82
+ self.init_camera = nn.Embedding(1, self.head_camera.ncam)
83
+ nn.init.zeros_(self.init_camera.weight)
84
+
85
+ self.head_camera_hand = build_head(
86
+ self.cfg,
87
+ self.cfg.MODEL.PERSON_HEAD.CAMERA_TYPE,
88
+ default_scale_factor=self.cfg.MODEL.CAMERA_HEAD.get("DEFAULT_SCALE_FACTOR_HAND", 1.0),
89
+ )
90
+ self.init_camera_hand = nn.Embedding(1, self.head_camera_hand.ncam)
91
+ nn.init.zeros_(self.init_camera_hand.weight)
92
+
93
+ self.camera_type = "perspective"
94
+
95
+ # Support conditioned information for decoder
96
+ cond_dim = 3
97
+ init_dim = self.head_pose.npose + self.head_camera.ncam + cond_dim
98
+ self.init_to_token_mhr = nn.Linear(init_dim, self.cfg.MODEL.DECODER.DIM)
99
+ self.prev_to_token_mhr = nn.Linear(init_dim - cond_dim, self.cfg.MODEL.DECODER.DIM)
100
+ self.init_to_token_mhr_hand = nn.Linear(init_dim, self.cfg.MODEL.DECODER.DIM)
101
+ self.prev_to_token_mhr_hand = nn.Linear(init_dim - cond_dim, self.cfg.MODEL.DECODER.DIM)
102
+
103
+ # Create prompt encoder
104
+ self.max_num_clicks = 0
105
+ if self.cfg.MODEL.PROMPT_ENCODER.ENABLE:
106
+ self.max_num_clicks = self.cfg.MODEL.PROMPT_ENCODER.MAX_NUM_CLICKS
107
+ self.prompt_keypoints = PROMPT_KEYPOINTS[self.cfg.MODEL.PROMPT_ENCODER.PROMPT_KEYPOINTS]
108
+
109
+ self.prompt_encoder = PromptEncoder(
110
+ embed_dim=self.backbone.embed_dims, # need to match backbone dims for PE
111
+ num_body_joints=len(set(self.prompt_keypoints.values())),
112
+ frozen=self.cfg.MODEL.PROMPT_ENCODER.get("frozen", False),
113
+ mask_embed_type=self.cfg.MODEL.PROMPT_ENCODER.get("MASK_EMBED_TYPE", None),
114
+ )
115
+ self.prompt_to_token = nn.Linear(self.backbone.embed_dims, self.cfg.MODEL.DECODER.DIM)
116
+
117
+ self.keypoint_prompt_sampler = build_keypoint_sampler(
118
+ self.cfg.MODEL.PROMPT_ENCODER.get("KEYPOINT_SAMPLER", {}),
119
+ prompt_keypoints=self.prompt_keypoints,
120
+ keybody_idx=(
121
+ KEY_BODY if not self.cfg.MODEL.PROMPT_ENCODER.get("SAMPLE_HAND", False) else KEY_RIGHT_HAND
122
+ ),
123
+ )
124
+ # To keep track of prompting history
125
+ self.prompt_hist = np.zeros(
126
+ (len(set(self.prompt_keypoints.values())) + 2, self.max_num_clicks),
127
+ dtype=np.float32,
128
+ )
129
+
130
+ if self.cfg.MODEL.DECODER.FROZEN:
131
+ for param in self.prompt_to_token.parameters():
132
+ param.requires_grad = False
133
+
134
+ # Create promptable decoder
135
+ self.decoder = build_decoder(self.cfg.MODEL.DECODER, context_dim=self.backbone.embed_dims)
136
+ # shared config for the two decoders
137
+ self.decoder_hand = build_decoder(self.cfg.MODEL.DECODER, context_dim=self.backbone.embed_dims)
138
+ self.hand_pe_layer = PositionEmbeddingRandom(self.backbone.embed_dims // 2)
139
+
140
+ # Manually convert the torso of the model to fp16.
141
+ if self.cfg.TRAIN.USE_FP16:
142
+ self.convert_to_fp16()
143
+ if self.cfg.TRAIN.get("FP16_TYPE", "float16") == "float16":
144
+ self.backbone_dtype = torch.float16
145
+ else:
146
+ self.backbone_dtype = torch.bfloat16
147
+ else:
148
+ self.backbone_dtype = torch.float32
149
+
150
+ self.ray_cond_emb = CameraEncoder(
151
+ self.backbone.embed_dim,
152
+ self.backbone.patch_size,
153
+ )
154
+ self.ray_cond_emb_hand = CameraEncoder(
155
+ self.backbone.embed_dim,
156
+ self.backbone.patch_size,
157
+ )
158
+
159
+ self.keypoint_embedding_idxs = list(range(70))
160
+ self.keypoint_embedding = nn.Embedding(len(self.keypoint_embedding_idxs), self.cfg.MODEL.DECODER.DIM)
161
+ self.keypoint_embedding_idxs_hand = list(range(70))
162
+ self.keypoint_embedding_hand = nn.Embedding(len(self.keypoint_embedding_idxs_hand), self.cfg.MODEL.DECODER.DIM)
163
+
164
+ if self.cfg.MODEL.DECODER.get("DO_HAND_DETECT_TOKENS", False):
165
+ self.hand_box_embedding = nn.Embedding(2, self.cfg.MODEL.DECODER.DIM) # for two hands
166
+ # decide whether a left or right hand is present in the image
167
+ self.hand_cls_embed = nn.Linear(self.cfg.MODEL.DECODER.DIM, 2)
168
+ self.bbox_embed = MLP(self.cfg.MODEL.DECODER.DIM, self.cfg.MODEL.DECODER.DIM, 4, 3)
169
+
170
+ self.keypoint_posemb_linear = FFN(
171
+ embed_dims=2,
172
+ feedforward_channels=self.cfg.MODEL.DECODER.DIM,
173
+ output_dims=self.cfg.MODEL.DECODER.DIM,
174
+ num_fcs=2,
175
+ add_identity=False,
176
+ )
177
+ self.keypoint_posemb_linear_hand = FFN(
178
+ embed_dims=2,
179
+ feedforward_channels=self.cfg.MODEL.DECODER.DIM,
180
+ output_dims=self.cfg.MODEL.DECODER.DIM,
181
+ num_fcs=2,
182
+ add_identity=False,
183
+ )
184
+ self.keypoint_feat_linear = nn.Linear(self.backbone.embed_dims, self.cfg.MODEL.DECODER.DIM)
185
+ self.keypoint_feat_linear_hand = nn.Linear(self.backbone.embed_dims, self.cfg.MODEL.DECODER.DIM)
186
+
187
+ # Do all KPS
188
+ self.keypoint3d_embedding_idxs = list(range(70))
189
+ self.keypoint3d_embedding = nn.Embedding(len(self.keypoint3d_embedding_idxs), self.cfg.MODEL.DECODER.DIM)
190
+
191
+ # Assume always do full body for the hand decoder
192
+ self.keypoint3d_embedding_idxs_hand = list(range(70))
193
+ self.keypoint3d_embedding_hand = nn.Embedding(
194
+ len(self.keypoint3d_embedding_idxs_hand), self.cfg.MODEL.DECODER.DIM
195
+ )
196
+
197
+ self.keypoint3d_posemb_linear = FFN(
198
+ embed_dims=3,
199
+ feedforward_channels=self.cfg.MODEL.DECODER.DIM,
200
+ output_dims=self.cfg.MODEL.DECODER.DIM,
201
+ num_fcs=2,
202
+ add_identity=False,
203
+ )
204
+ self.keypoint3d_posemb_linear_hand = FFN(
205
+ embed_dims=3,
206
+ feedforward_channels=self.cfg.MODEL.DECODER.DIM,
207
+ output_dims=self.cfg.MODEL.DECODER.DIM,
208
+ num_fcs=2,
209
+ add_identity=False,
210
+ )
211
+
212
+ def _get_decoder_condition(self, batch: dict) -> torch.Tensor | None:
213
+ num_person = batch["img"].shape[1]
214
+
215
+ if self.cfg.MODEL.DECODER.CONDITION_TYPE == "cliff":
216
+ # CLIFF-style condition info (cx/f, cy/f, b/f)
217
+ cx, cy = torch.chunk(self._flatten_person(batch["bbox_center"]), chunks=2, dim=-1)
218
+ img_w, img_h = torch.chunk(self._flatten_person(batch["ori_img_size"]), chunks=2, dim=-1)
219
+ b = self._flatten_person(batch["bbox_scale"])[:, [0]]
220
+
221
+ focal_length = self._flatten_person(
222
+ batch["cam_int"].unsqueeze(1).expand(-1, num_person, -1, -1).contiguous()
223
+ )[:, 0, 0]
224
+ if not self.cfg.MODEL.DECODER.get("USE_INTRIN_CENTER", False):
225
+ condition_info = torch.cat([cx - img_w / 2.0, cy - img_h / 2.0, b], dim=-1)
226
+ else:
227
+ full_img_cxy = self._flatten_person(
228
+ batch["cam_int"].unsqueeze(1).expand(-1, num_person, -1, -1).contiguous()
229
+ )[:, [0, 1], [2, 2]]
230
+ condition_info = torch.cat([cx - full_img_cxy[:, [0]], cy - full_img_cxy[:, [1]], b], dim=-1)
231
+ condition_info[:, :2] = condition_info[:, :2] / focal_length.unsqueeze(-1) # [-1, 1]
232
+ condition_info[:, 2] = condition_info[:, 2] / focal_length # [-1, 1]
233
+ elif self.cfg.MODEL.DECODER.CONDITION_TYPE == "none":
234
+ return None
235
+ else:
236
+ raise NotImplementedError
237
+
238
+ return condition_info.type(batch["img"].dtype)
239
+
240
+ def forward_decoder(
241
+ self,
242
+ image_embeddings: torch.Tensor,
243
+ init_estimate: torch.Tensor | None = None,
244
+ keypoints: torch.Tensor | None = None,
245
+ prev_estimate: torch.Tensor | None = None,
246
+ condition_info: torch.Tensor | None = None,
247
+ batch=None,
248
+ ):
249
+ """
250
+ Args:
251
+ image_embeddings: image features from the backbone, shape (B, C, H, W)
252
+ init_estimate: initial estimate to be refined on, shape (B, 1, C)
253
+ keypoints: optional prompt input, shape (B, N, 3),
254
+ 3 for coordinates (x,y) + label.
255
+ (x, y) should be normalized to range [0, 1].
256
+ label==-1 indicates incorrect points,
257
+ label==-2 indicates invalid points
258
+ prev_estimate: optional prompt input, shape (B, 1, C),
259
+ previous estimate for pose refinement.
260
+ condition_info: optional condition information that is concatenated with
261
+ the input tokens, shape (B, c)
262
+ """
263
+ batch_size = image_embeddings.shape[0]
264
+
265
+ # Initial estimation for residual prediction.
266
+ if init_estimate is None:
267
+ init_pose = self.init_pose.weight.expand(batch_size, -1).unsqueeze(dim=1)
268
+ if hasattr(self, "init_camera"):
269
+ init_camera = self.init_camera.weight.expand(batch_size, -1).unsqueeze(dim=1)
270
+
271
+ init_estimate = (
272
+ init_pose if not hasattr(self, "init_camera") else torch.cat([init_pose, init_camera], dim=-1)
273
+ ) # This is basically pose & camera translation at the end. B x 1 x (404 + 3)
274
+
275
+ init_input = (
276
+ torch.cat([condition_info.view(batch_size, 1, -1), init_estimate], dim=-1)
277
+ if condition_info is not None
278
+ else init_estimate
279
+ ) # B x 1 x 410 (this is with the CLIFF condition)
280
+ token_embeddings = self.init_to_token_mhr(init_input).view(batch_size, 1, -1) # B x 1 x 1024 (linear layered)
281
+
282
+ num_pose_token = token_embeddings.shape[1]
283
+ assert num_pose_token == 1
284
+
285
+ image_augment, token_augment, token_mask = None, None, None
286
+ if hasattr(self, "prompt_encoder") and keypoints is not None:
287
+ if prev_estimate is None:
288
+ # Use initial embedding if no previous embedding
289
+ prev_estimate = init_estimate
290
+ # Previous estimate w/o the CLIFF condition.
291
+ prev_embeddings = self.prev_to_token_mhr(prev_estimate).view(
292
+ batch_size, 1, -1
293
+ ) # 407 -> B x 1 x 1024; linear layer-ed
294
+
295
+ if self.cfg.MODEL.BACKBONE.TYPE in [
296
+ "vit_hmr",
297
+ "vit",
298
+ "vit_b",
299
+ "vit_l",
300
+ ]:
301
+ # ViT backbone assumes a different aspect ratio as input size
302
+ image_augment = self.prompt_encoder.get_dense_pe((16, 16))[:, :, :, 2:-2]
303
+ elif self.cfg.MODEL.BACKBONE.TYPE in [
304
+ "vit_hmr_512_384",
305
+ ]:
306
+ # ViT backbone assumes a different aspect ratio as input size
307
+ image_augment = self.prompt_encoder.get_dense_pe((32, 32))[:, :, :, 4:-4]
308
+ else:
309
+ image_augment = self.prompt_encoder.get_dense_pe(image_embeddings.shape[-2:]) # (1, C, H, W)
310
+
311
+ image_embeddings = self.ray_cond_emb(image_embeddings, batch["ray_cond"])
312
+
313
+ # To start, keypoints is all [0, 0, -2]. The points get sent into self.pe_layer._pe_encoding,
314
+ # the labels determine the embedding weight (a special one for -2 and -1, then one per joint).
315
+ prompt_embeddings, prompt_mask = self.prompt_encoder(keypoints=keypoints) # B x 1 x 1280
316
+ prompt_embeddings = self.prompt_to_token(prompt_embeddings) # Linear layered: B x 1 x 1024
317
+
318
+ # Concatenate pose tokens and prompt embeddings as decoder input
319
+ token_embeddings = torch.cat(
320
+ [
321
+ token_embeddings,
322
+ prev_embeddings,
323
+ prompt_embeddings,
324
+ ],
325
+ dim=1,
326
+ )
327
+
328
+ token_augment = torch.zeros_like(token_embeddings)
329
+ token_augment[:, [num_pose_token]] = prev_embeddings
330
+ token_augment[:, (num_pose_token + 1) :] = prompt_embeddings
331
+ token_mask = None
332
+
333
+ if self.cfg.MODEL.DECODER.get("DO_HAND_DETECT_TOKENS", False):
334
+ # Put in a token for each hand
335
+ hand_det_emb_start_idx = token_embeddings.shape[1]
336
+ token_embeddings = torch.cat(
337
+ [
338
+ token_embeddings,
339
+ self.hand_box_embedding.weight[None, :, :].repeat(batch_size, 1, 1),
340
+ ],
341
+ dim=1,
342
+ ) # B x 5 + 70 x 1024
343
+ # No positional embeddings
344
+ token_augment = torch.cat(
345
+ [
346
+ token_augment,
347
+ torch.zeros_like(token_embeddings[:, token_augment.shape[1] :, :]),
348
+ ],
349
+ dim=1,
350
+ ) # B x 5 + 70 x 1024
351
+
352
+ assert self.cfg.MODEL.DECODER.get("DO_KEYPOINT_TOKENS", False)
353
+ # Put in a token for each keypoint
354
+ kps_emb_start_idx = token_embeddings.shape[1]
355
+ token_embeddings = torch.cat(
356
+ [
357
+ token_embeddings,
358
+ self.keypoint_embedding.weight[None, :, :].repeat(batch_size, 1, 1),
359
+ ],
360
+ dim=1,
361
+ ) # B x 3 + 70 x 1024
362
+ # No positional embeddings
363
+ token_augment = torch.cat(
364
+ [
365
+ token_augment,
366
+ torch.zeros_like(token_embeddings[:, token_augment.shape[1] :, :]),
367
+ ],
368
+ dim=1,
369
+ ) # B x 3 + 70 x 1024
370
+ if self.cfg.MODEL.DECODER.get("DO_KEYPOINT3D_TOKENS", False):
371
+ # Put in a token for each keypoint
372
+ kps3d_emb_start_idx = token_embeddings.shape[1]
373
+ token_embeddings = torch.cat(
374
+ [
375
+ token_embeddings,
376
+ self.keypoint3d_embedding.weight[None, :, :].repeat(batch_size, 1, 1),
377
+ ],
378
+ dim=1,
379
+ ) # B x 3 + 70 + 70 x 1024
380
+ # No positional embeddings
381
+ token_augment = torch.cat(
382
+ [
383
+ token_augment,
384
+ torch.zeros_like(token_embeddings[:, token_augment.shape[1] :, :]),
385
+ ],
386
+ dim=1,
387
+ ) # B x 3 + 70 + 70 x 1024
388
+
389
+ # We're doing intermediate model predictions
390
+ def token_to_pose_output_fn(tokens, prev_pose_output, layer_idx):
391
+ # Get the pose token
392
+ pose_token = tokens[:, 0]
393
+
394
+ prev_pose = init_pose.view(batch_size, -1)
395
+ prev_camera = init_camera.view(batch_size, -1)
396
+
397
+ # Get pose outputs
398
+ pose_output = self.head_pose(pose_token, prev_pose)
399
+ # Get Camera Translation
400
+ if hasattr(self, "head_camera"):
401
+ pred_cam = self.head_camera(pose_token, prev_camera)
402
+ pose_output["pred_cam"] = pred_cam
403
+ # Run camera projection
404
+ pose_output = self.camera_project(pose_output, batch)
405
+
406
+ # Get 2D KPS in crop
407
+ pose_output["pred_keypoints_2d_cropped"] = self._full_to_crop(
408
+ batch, pose_output["pred_keypoints_2d"], self.body_batch_idx
409
+ )
410
+
411
+ return pose_output
412
+
413
+ kp_token_update_fn = self.keypoint_token_update_fn
414
+
415
+ # Now for 3D
416
+ kp3d_token_update_fn = self.keypoint3d_token_update_fn
417
+
418
+ # Combine the 2D and 3D functionse
419
+ def keypoint_token_update_fn_comb(*args):
420
+ if kp_token_update_fn is not None:
421
+ args = kp_token_update_fn(kps_emb_start_idx, image_embeddings, *args)
422
+ if kp3d_token_update_fn is not None:
423
+ args = kp3d_token_update_fn(kps3d_emb_start_idx, *args)
424
+ return args
425
+
426
+ pose_token, pose_output = self.decoder(
427
+ token_embeddings,
428
+ image_embeddings,
429
+ token_augment,
430
+ image_augment,
431
+ token_mask,
432
+ token_to_pose_output_fn=token_to_pose_output_fn,
433
+ keypoint_token_update_fn=keypoint_token_update_fn_comb,
434
+ )
435
+
436
+ if self.cfg.MODEL.DECODER.get("DO_HAND_DETECT_TOKENS", False):
437
+ return (
438
+ pose_token[:, hand_det_emb_start_idx : hand_det_emb_start_idx + 2],
439
+ pose_output,
440
+ )
441
+ else:
442
+ return pose_token, pose_output
443
+
444
+ def forward_decoder_hand(
445
+ self,
446
+ image_embeddings: torch.Tensor,
447
+ init_estimate: torch.Tensor | None = None,
448
+ keypoints: torch.Tensor | None = None,
449
+ prev_estimate: torch.Tensor | None = None,
450
+ condition_info: torch.Tensor | None = None,
451
+ batch=None,
452
+ ):
453
+ """
454
+ Args:
455
+ image_embeddings: image features from the backbone, shape (B, C, H, W)
456
+ init_estimate: initial estimate to be refined on, shape (B, 1, C)
457
+ keypoints: optional prompt input, shape (B, N, 3),
458
+ 3 for coordinates (x,y) + label.
459
+ (x, y) should be normalized to range [0, 1].
460
+ label==-1 indicates incorrect points,
461
+ label==-2 indicates invalid points
462
+ prev_estimate: optional prompt input, shape (B, 1, C),
463
+ previous estimate for pose refinement.
464
+ condition_info: optional condition information that is concatenated with
465
+ the input tokens, shape (B, c)
466
+ """
467
+ batch_size = image_embeddings.shape[0]
468
+
469
+ # Initial estimation for residual prediction.
470
+ if init_estimate is None:
471
+ init_pose = self.init_pose_hand.weight.expand(batch_size, -1).unsqueeze(dim=1)
472
+ if hasattr(self, "init_camera_hand"):
473
+ init_camera = self.init_camera_hand.weight.expand(batch_size, -1).unsqueeze(dim=1)
474
+
475
+ init_estimate = (
476
+ init_pose if not hasattr(self, "init_camera_hand") else torch.cat([init_pose, init_camera], dim=-1)
477
+ ) # This is basically pose & camera translation at the end. B x 1 x (404 + 3)
478
+
479
+ init_input = (
480
+ torch.cat([condition_info.view(batch_size, 1, -1), init_estimate], dim=-1)
481
+ if condition_info is not None
482
+ else init_estimate
483
+ ) # B x 1 x 410 (this is with the CLIFF condition)
484
+ token_embeddings = self.init_to_token_mhr_hand(init_input).view(
485
+ batch_size, 1, -1
486
+ ) # B x 1 x 1024 (linear layered)
487
+ num_pose_token = token_embeddings.shape[1]
488
+
489
+ image_augment, token_augment, token_mask = None, None, None
490
+ if hasattr(self, "prompt_encoder") and keypoints is not None:
491
+ if prev_estimate is None:
492
+ # Use initial embedding if no previous embedding
493
+ prev_estimate = init_estimate
494
+ # Previous estimate w/o the CLIFF condition.
495
+ prev_embeddings = self.prev_to_token_mhr_hand(prev_estimate).view(
496
+ batch_size, 1, -1
497
+ ) # 407 -> B x 1 x 1024; linear layer-ed
498
+
499
+ if self.cfg.MODEL.BACKBONE.TYPE in [
500
+ "vit_hmr",
501
+ "vit",
502
+ "vit_b",
503
+ "vit_l",
504
+ ]:
505
+ # ViT backbone assumes a different aspect ratio as input size
506
+ image_augment = self.hand_pe_layer((16, 16)).unsqueeze(0)[:, :, :, 2:-2]
507
+ elif self.cfg.MODEL.BACKBONE.TYPE in [
508
+ "vit_hmr_512_384",
509
+ ]:
510
+ # ViT backbone assumes a different aspect ratio as input size
511
+ image_augment = self.hand_pe_layer((32, 32)).unsqueeze(0)[:, :, :, 4:-4]
512
+ else:
513
+ image_augment = self.hand_pe_layer(image_embeddings.shape[-2:]).unsqueeze(0) # (1, C, H, W)
514
+
515
+ image_embeddings = self.ray_cond_emb_hand(image_embeddings, batch["ray_cond_hand"])
516
+
517
+ # To start, keypoints is all [0, 0, -2]. The points get sent into self.pe_layer._pe_encoding,
518
+ # the labels determine the embedding weight (special ones for -2 and -1, then one per joint).
519
+ prompt_embeddings, prompt_mask = self.prompt_encoder(keypoints=keypoints) # B x 1 x 1280
520
+ prompt_embeddings = self.prompt_to_token(prompt_embeddings) # Linear layered: B x 1 x 1024
521
+
522
+ # Concatenate pose tokens and prompt embeddings as decoder input
523
+ token_embeddings = torch.cat(
524
+ [
525
+ token_embeddings,
526
+ prev_embeddings,
527
+ prompt_embeddings,
528
+ ],
529
+ dim=1,
530
+ )
531
+
532
+ token_augment = torch.zeros_like(token_embeddings)
533
+ token_augment[:, [num_pose_token]] = prev_embeddings
534
+ token_augment[:, (num_pose_token + 1) :] = prompt_embeddings
535
+ token_mask = None
536
+
537
+ if self.cfg.MODEL.DECODER.get("DO_HAND_DETECT_TOKENS", False):
538
+ # Put in a token for each hand
539
+ hand_det_emb_start_idx = token_embeddings.shape[1]
540
+ token_embeddings = torch.cat(
541
+ [
542
+ token_embeddings,
543
+ self.hand_box_embedding.weight[None, :, :].repeat(batch_size, 1, 1),
544
+ ],
545
+ dim=1,
546
+ ) # B x 5 + 70 x 1024
547
+ # No positional embeddings
548
+ token_augment = torch.cat(
549
+ [
550
+ token_augment,
551
+ torch.zeros_like(token_embeddings[:, token_augment.shape[1] :, :]),
552
+ ],
553
+ dim=1,
554
+ ) # B x 5 + 70 x 1024
555
+
556
+ assert self.cfg.MODEL.DECODER.get("DO_KEYPOINT_TOKENS", False)
557
+ # Put in a token for each keypoint
558
+ kps_emb_start_idx = token_embeddings.shape[1]
559
+ token_embeddings = torch.cat(
560
+ [
561
+ token_embeddings,
562
+ self.keypoint_embedding_hand.weight[None, :, :].repeat(batch_size, 1, 1),
563
+ ],
564
+ dim=1,
565
+ ) # B x 3 + 70 x 1024
566
+ # No positional embeddings
567
+ token_augment = torch.cat(
568
+ [
569
+ token_augment,
570
+ torch.zeros_like(token_embeddings[:, token_augment.shape[1] :, :]),
571
+ ],
572
+ dim=1,
573
+ ) # B x 3 + 70 x 1024
574
+
575
+ if self.cfg.MODEL.DECODER.get("DO_KEYPOINT3D_TOKENS", False):
576
+ # Put in a token for each keypoint
577
+ kps3d_emb_start_idx = token_embeddings.shape[1]
578
+ token_embeddings = torch.cat(
579
+ [
580
+ token_embeddings,
581
+ self.keypoint3d_embedding_hand.weight[None, :, :].repeat(batch_size, 1, 1),
582
+ ],
583
+ dim=1,
584
+ ) # B x 3 + 70 + 70 x 1024
585
+ # No positional embeddings
586
+ token_augment = torch.cat(
587
+ [
588
+ token_augment,
589
+ torch.zeros_like(token_embeddings[:, token_augment.shape[1] :, :]),
590
+ ],
591
+ dim=1,
592
+ ) # B x 3 + 70 + 70 x 1024
593
+
594
+ # We're doing intermediate model predictions
595
+ def token_to_pose_output_fn(tokens, prev_pose_output, layer_idx):
596
+ # Get the pose token
597
+ pose_token = tokens[:, 0]
598
+
599
+ prev_pose = init_pose.view(batch_size, -1)
600
+ prev_camera = init_camera.view(batch_size, -1)
601
+
602
+ # Get pose outputs
603
+ pose_output = self.head_pose_hand(pose_token, prev_pose)
604
+
605
+ # Get Camera Translation
606
+ if hasattr(self, "head_camera_hand"):
607
+ pred_cam = self.head_camera_hand(pose_token, prev_camera)
608
+ pose_output["pred_cam"] = pred_cam
609
+ # Run camera projection
610
+ pose_output = self.camera_project_hand(pose_output, batch)
611
+
612
+ # Get 2D KPS in crop
613
+ pose_output["pred_keypoints_2d_cropped"] = self._full_to_crop(
614
+ batch, pose_output["pred_keypoints_2d"], self.hand_batch_idx
615
+ )
616
+
617
+ return pose_output
618
+
619
+ kp_token_update_fn = self.keypoint_token_update_fn_hand
620
+
621
+ # Now for 3D
622
+ kp3d_token_update_fn = self.keypoint3d_token_update_fn_hand
623
+
624
+ # Combine the 2D and 3D functions
625
+ def keypoint_token_update_fn_comb(*args):
626
+ if kp_token_update_fn is not None:
627
+ args = kp_token_update_fn(kps_emb_start_idx, image_embeddings, *args)
628
+ if kp3d_token_update_fn is not None:
629
+ args = kp3d_token_update_fn(kps3d_emb_start_idx, *args)
630
+ return args
631
+
632
+ pose_token, pose_output = self.decoder_hand(
633
+ token_embeddings,
634
+ image_embeddings,
635
+ token_augment,
636
+ image_augment,
637
+ token_mask,
638
+ token_to_pose_output_fn=token_to_pose_output_fn,
639
+ keypoint_token_update_fn=keypoint_token_update_fn_comb,
640
+ )
641
+
642
+ if self.cfg.MODEL.DECODER.get("DO_HAND_DETECT_TOKENS", False):
643
+ return (
644
+ pose_token[:, hand_det_emb_start_idx : hand_det_emb_start_idx + 2],
645
+ pose_output,
646
+ )
647
+ else:
648
+ return pose_token, pose_output
649
+
650
+ @torch.no_grad()
651
+ def _get_keypoint_prompt(self, batch, pred_keypoints_2d, force_dummy=False):
652
+ if self.camera_type == "perspective":
653
+ pred_keypoints_2d = self._full_to_crop(batch, pred_keypoints_2d)
654
+
655
+ gt_keypoints_2d = self._flatten_person(batch["keypoints_2d"]).clone()
656
+
657
+ keypoint_prompt = self.keypoint_prompt_sampler.sample(
658
+ gt_keypoints_2d,
659
+ pred_keypoints_2d,
660
+ is_train=self.training,
661
+ force_dummy=force_dummy,
662
+ )
663
+ return keypoint_prompt
664
+
665
+ def _get_mask_prompt(self, batch, image_embeddings):
666
+ x_mask = self._flatten_person(batch["mask"])
667
+ mask_embeddings, no_mask_embeddings = self.prompt_encoder.get_mask_embeddings(
668
+ x_mask, image_embeddings.shape[0], image_embeddings.shape[2:]
669
+ )
670
+ if self.cfg.MODEL.BACKBONE.TYPE in [
671
+ "vit_hmr",
672
+ "vit",
673
+ ]:
674
+ # ViT backbone assumes a different aspect ratio as input size
675
+ mask_embeddings = mask_embeddings[:, :, :, 2:-2]
676
+ elif self.cfg.MODEL.BACKBONE.TYPE in [
677
+ "vit_hmr_512_384",
678
+ ]:
679
+ # for x2 resolution
680
+ mask_embeddings = mask_embeddings[:, :, :, 4:-4]
681
+
682
+ mask_score = self._flatten_person(batch["mask_score"]).view(-1, 1, 1, 1)
683
+ mask_embeddings = torch.where(
684
+ mask_score > 0,
685
+ mask_score * mask_embeddings.to(image_embeddings),
686
+ no_mask_embeddings.to(image_embeddings),
687
+ )
688
+ return mask_embeddings
689
+
690
+ def _one_prompt_iter(self, batch, output, prev_prompt, full_output):
691
+ image_embeddings = output["image_embeddings"]
692
+ condition_info = output["condition_info"]
693
+
694
+ if "mhr" in output and output["mhr"] is not None:
695
+ pose_output = output["mhr"] # body-only output
696
+ # Use previous estimate as initialization
697
+ prev_estimate = torch.cat(
698
+ [
699
+ pose_output["pred_pose_raw"].detach(), # (B, 6)
700
+ pose_output["shape"].detach(),
701
+ pose_output["scale"].detach(),
702
+ pose_output["hand"].detach(),
703
+ pose_output["face"].detach(),
704
+ ],
705
+ dim=1,
706
+ ).unsqueeze(dim=1)
707
+ if hasattr(self, "init_camera"):
708
+ prev_estimate = torch.cat(
709
+ [prev_estimate, pose_output["pred_cam"].detach().unsqueeze(1)],
710
+ dim=-1,
711
+ )
712
+ prev_shape = prev_estimate.shape[1:]
713
+
714
+ pred_keypoints_2d = output["mhr"]["pred_keypoints_2d"].detach().clone()
715
+ kpt_shape = pred_keypoints_2d.shape[1:]
716
+
717
+ if "mhr_hand" in output and output["mhr_hand"] is not None:
718
+ pose_output_hand = output["mhr_hand"]
719
+ # Use previous estimate as initialization
720
+ prev_estimate_hand = torch.cat(
721
+ [
722
+ pose_output_hand["pred_pose_raw"].detach(), # (B, 6)
723
+ pose_output_hand["shape"].detach(),
724
+ pose_output_hand["scale"].detach(),
725
+ pose_output_hand["hand"].detach(),
726
+ pose_output_hand["face"].detach(),
727
+ ],
728
+ dim=1,
729
+ ).unsqueeze(dim=1)
730
+ if hasattr(self, "init_camera_hand"):
731
+ prev_estimate_hand = torch.cat(
732
+ [
733
+ prev_estimate_hand,
734
+ pose_output_hand["pred_cam"].detach().unsqueeze(1),
735
+ ],
736
+ dim=-1,
737
+ )
738
+ prev_shape = prev_estimate_hand.shape[1:]
739
+
740
+ pred_keypoints_2d_hand = output["mhr_hand"]["pred_keypoints_2d"].detach().clone()
741
+ kpt_shape = pred_keypoints_2d_hand.shape[1:]
742
+
743
+ all_prev_estimate = torch.zeros((image_embeddings.shape[0], *prev_shape), device=image_embeddings.device)
744
+ if "mhr" in output and output["mhr"] is not None:
745
+ all_prev_estimate[self.body_batch_idx] = prev_estimate
746
+ if "mhr_hand" in output and output["mhr_hand"] is not None:
747
+ all_prev_estimate[self.hand_batch_idx] = prev_estimate_hand
748
+
749
+ # Get keypoint prompts
750
+ all_pred_keypoints_2d = torch.zeros((image_embeddings.shape[0], *kpt_shape), device=image_embeddings.device)
751
+ if "mhr" in output and output["mhr"] is not None:
752
+ all_pred_keypoints_2d[self.body_batch_idx] = pred_keypoints_2d
753
+ if "mhr_hand" in output and output["mhr_hand"] is not None:
754
+ all_pred_keypoints_2d[self.hand_batch_idx] = pred_keypoints_2d_hand
755
+
756
+ keypoint_prompt = self._get_keypoint_prompt(batch, all_pred_keypoints_2d)
757
+ cur_keypoint_prompt = (
758
+ torch.cat(prev_prompt + [keypoint_prompt], dim=1) if len(prev_prompt) else keypoint_prompt
759
+ ) # [B, 1, 3]
760
+
761
+ pose_output, pose_output_hand = None, None
762
+ if len(self.body_batch_idx):
763
+ tokens_output, pose_output = self.forward_decoder(
764
+ image_embeddings[self.body_batch_idx],
765
+ init_estimate=None, # not recurring previous estimate
766
+ keypoints=cur_keypoint_prompt[self.body_batch_idx],
767
+ prev_estimate=all_prev_estimate[self.body_batch_idx],
768
+ condition_info=condition_info[self.body_batch_idx],
769
+ batch=batch,
770
+ full_output=None,
771
+ )
772
+ pose_output = pose_output[-1]
773
+
774
+ # Update prediction output
775
+ output.update(
776
+ {
777
+ "mhr": pose_output,
778
+ "mhr_hand": pose_output_hand,
779
+ }
780
+ )
781
+
782
+ return output, keypoint_prompt
783
+
784
+ def _full_to_crop(
785
+ self,
786
+ batch: dict,
787
+ pred_keypoints_2d: torch.Tensor,
788
+ batch_idx: torch.Tensor | Sequence[int] | None = None,
789
+ ) -> torch.Tensor:
790
+ """Convert full-image keypoints coordinates to crop and normalize to [-0.5. 0.5]"""
791
+ pred_keypoints_2d_cropped = torch.cat(
792
+ [pred_keypoints_2d, torch.ones_like(pred_keypoints_2d[:, :, [-1]])], dim=-1
793
+ )
794
+ if batch_idx is not None:
795
+ affine_trans = self._flatten_person(batch["affine_trans"])[batch_idx].to(pred_keypoints_2d_cropped)
796
+ img_size = self._flatten_person(batch["img_size"])[batch_idx].unsqueeze(1)
797
+ else:
798
+ affine_trans = self._flatten_person(batch["affine_trans"]).to(pred_keypoints_2d_cropped)
799
+ img_size = self._flatten_person(batch["img_size"]).unsqueeze(1)
800
+ pred_keypoints_2d_cropped = pred_keypoints_2d_cropped @ affine_trans.mT
801
+ pred_keypoints_2d_cropped = pred_keypoints_2d_cropped[..., :2] / img_size - 0.5
802
+
803
+ return pred_keypoints_2d_cropped
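+ # Illustrative sketch (annotation with hypothetical numbers): for a single full-image
+ # point p = (x, y) and a 2x3 crop affine A, the mapping above is A @ [x, y, 1]^T
+ # followed by division by the crop size and a -0.5 shift, e.g.
+ # >>> A = torch.tensor([[0.5, 0.0, -32.0], [0.0, 0.5, -96.0]])
+ # >>> p = torch.tensor([320.0, 320.0, 1.0])
+ # >>> (A @ p) / 256 - 0.5  # -> tensor([0.0000, -0.2500])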
804
+
805
+ def camera_project(self, pose_output: dict, batch: dict) -> dict:
806
+ """
807
+ Project 3D keypoints to 2D using the camera parameters.
808
+ Args:
809
+ pose_output (Dict): Dictionary containing the pose output.
810
+ batch (Dict): Dictionary containing the batch data.
811
+ Returns:
812
+ Dict: Dictionary containing the projected 2D keypoints.
813
+ """
814
+ if hasattr(self, "head_camera"):
815
+ head_camera = self.head_camera
816
+ pred_cam = pose_output["pred_cam"]
817
+ else:
818
+ raise AssertionError("head_camera is not defined")
819
+
820
+ cam_out = head_camera.perspective_projection(
821
+ pose_output["pred_keypoints_3d"],
822
+ pred_cam,
823
+ self._flatten_person(batch["bbox_center"])[self.body_batch_idx],
824
+ self._flatten_person(batch["bbox_scale"])[self.body_batch_idx, 0],
825
+ self._flatten_person(batch["ori_img_size"])[self.body_batch_idx],
826
+ self._flatten_person(batch["cam_int"].unsqueeze(1).expand(-1, batch["img"].shape[1], -1, -1).contiguous())[
827
+ self.body_batch_idx
828
+ ],
829
+ use_intrin_center=self.cfg.MODEL.DECODER.get("USE_INTRIN_CENTER", False),
830
+ )
831
+
832
+ if pose_output.get("pred_vertices") is not None:
833
+ cam_out_vertices = head_camera.perspective_projection(
834
+ pose_output["pred_vertices"],
835
+ pred_cam,
836
+ self._flatten_person(batch["bbox_center"])[self.body_batch_idx],
837
+ self._flatten_person(batch["bbox_scale"])[self.body_batch_idx, 0],
838
+ self._flatten_person(batch["ori_img_size"])[self.body_batch_idx],
839
+ self._flatten_person(
840
+ batch["cam_int"].unsqueeze(1).expand(-1, batch["img"].shape[1], -1, -1).contiguous()
841
+ )[self.body_batch_idx],
842
+ use_intrin_center=self.cfg.MODEL.DECODER.get("USE_INTRIN_CENTER", False),
843
+ )
844
+ pose_output["pred_keypoints_2d_verts"] = cam_out_vertices["pred_keypoints_2d"]
845
+
846
+ pose_output.update(cam_out)
847
+
848
+ return pose_output
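+ # Annotation (illustrative, simple pinhole model): after adding the predicted camera
+ # translation, a 3D point (X, Y, Z) projects as u = fx * X / Z + cx, v = fy * Y / Z + cy;
+ # the exact crop/full-image conventions are implemented by head_camera.perspective_projection.
+ # >>> X, Y, Z, fx, fy, cx, cy = 0.1, -0.2, 2.0, 1000.0, 1000.0, 320.0, 240.0
+ # >>> (fx * X / Z + cx, fy * Y / Z + cy)  # -> (370.0, 140.0)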
849
+
850
+ def camera_project_hand(self, pose_output: dict, batch: dict) -> dict:
851
+ """
852
+ Project 3D keypoints to 2D using the camera parameters.
853
+ Args:
854
+ pose_output (Dict): Dictionary containing the pose output.
855
+ batch (Dict): Dictionary containing the batch data.
856
+ Returns:
857
+ Dict: Dictionary containing the projected 2D keypoints.
858
+ """
859
+ if hasattr(self, "head_camera_hand"):
860
+ head_camera = self.head_camera_hand
861
+ pred_cam = pose_output["pred_cam"]
862
+ else:
863
+ raise AssertionError("head_camera_hand is not defined")
864
+
865
+ cam_out = head_camera.perspective_projection(
866
+ pose_output["pred_keypoints_3d"],
867
+ pred_cam,
868
+ self._flatten_person(batch["bbox_center"])[self.hand_batch_idx],
869
+ self._flatten_person(batch["bbox_scale"])[self.hand_batch_idx, 0],
870
+ self._flatten_person(batch["ori_img_size"])[self.hand_batch_idx],
871
+ self._flatten_person(batch["cam_int"].unsqueeze(1).expand(-1, batch["img"].shape[1], -1, -1).contiguous())[
872
+ self.hand_batch_idx
873
+ ],
874
+ use_intrin_center=self.cfg.MODEL.DECODER.get("USE_INTRIN_CENTER", False),
875
+ )
876
+
877
+ if pose_output.get("pred_vertices") is not None:
878
+ cam_out_vertices = head_camera.perspective_projection(
879
+ pose_output["pred_vertices"],
880
+ pred_cam,
881
+ self._flatten_person(batch["bbox_center"])[self.hand_batch_idx],
882
+ self._flatten_person(batch["bbox_scale"])[self.hand_batch_idx, 0],
883
+ self._flatten_person(batch["ori_img_size"])[self.hand_batch_idx],
884
+ self._flatten_person(
885
+ batch["cam_int"].unsqueeze(1).expand(-1, batch["img"].shape[1], -1, -1).contiguous()
886
+ )[self.hand_batch_idx],
887
+ use_intrin_center=self.cfg.MODEL.DECODER.get("USE_INTRIN_CENTER", False),
888
+ )
889
+ pose_output["pred_keypoints_2d_verts"] = cam_out_vertices["pred_keypoints_2d"]
890
+
891
+ pose_output.update(cam_out)
892
+
893
+ return pose_output
894
+
895
+ def get_ray_condition(self, batch):
896
+ B, N, _, H, W = batch["img"].shape
897
+ meshgrid_xy = (
898
+ torch.stack(torch.meshgrid(torch.arange(H), torch.arange(W), indexing="xy"), dim=2)[None, None, :, :, :]
899
+ .repeat(B, N, 1, 1, 1)
900
+ .cuda()
901
+ ) # B x N x H x W x 2
902
+ meshgrid_xy = meshgrid_xy / batch["affine_trans"][:, :, None, None, [0, 1], [0, 1]]
903
+ meshgrid_xy = (
904
+ meshgrid_xy
905
+ - batch["affine_trans"][:, :, None, None, [0, 1], [2, 2]]
906
+ / batch["affine_trans"][:, :, None, None, [0, 1], [0, 1]]
907
+ )
908
+
909
+ # Subtract out center & normalize to be rays
910
+ meshgrid_xy = meshgrid_xy - batch["cam_int"][:, None, None, None, [0, 1], [2, 2]]
911
+ meshgrid_xy = meshgrid_xy / batch["cam_int"][:, None, None, None, [0, 1], [0, 1]]
912
+
913
+ return meshgrid_xy.permute(0, 1, 4, 2, 3).to(batch["img"].dtype) # This is B x num_person x 2 x H x W
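+ # Annotation (illustrative): per crop pixel (u, v), the code above first undoes the
+ # full-to-crop affine, u_full = (u - t_x) / s_x, and then converts to a normalized ray
+ # direction with the intrinsics, r_x = (u_full - cx) / fx (and likewise for y), e.g.
+ # >>> u_full, cx, fx = 960.0, 640.0, 1000.0
+ # >>> (u_full - cx) / fx  # -> 0.32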
914
+
915
+ def forward_pose_branch(self, batch: dict) -> dict:
916
+ """Run a forward pass for the crop-image (pose) branch."""
917
+ batch_size, num_person = batch["img"].shape[:2]
918
+
919
+ # Forward backbone encoder
920
+ x = self.data_preprocess(
921
+ self._flatten_person(batch["img"]),
922
+ crop_width=(
923
+ self.cfg.MODEL.BACKBONE.TYPE
924
+ in [
925
+ "vit_hmr",
926
+ "vit",
927
+ "vit_b",
928
+ "vit_l",
929
+ "vit_hmr_512_384",
930
+ ]
931
+ ),
932
+ )
933
+
934
+ # Optionally get ray conditioning
935
+ ray_cond = self.get_ray_condition(batch) # This is B x num_person x 2 x H x W
936
+ ray_cond = self._flatten_person(ray_cond)
937
+ if self.cfg.MODEL.BACKBONE.TYPE in [
938
+ "vit_hmr",
939
+ "vit",
940
+ "vit_b",
941
+ "vit_l",
942
+ ]:
943
+ ray_cond = ray_cond[:, :, :, 32:-32]
944
+ elif self.cfg.MODEL.BACKBONE.TYPE in [
945
+ "vit_hmr_512_384",
946
+ ]:
947
+ ray_cond = ray_cond[:, :, :, 64:-64]
948
+
949
+ if len(self.body_batch_idx):
950
+ batch["ray_cond"] = ray_cond[self.body_batch_idx].clone()
951
+ if len(self.hand_batch_idx):
952
+ batch["ray_cond_hand"] = ray_cond[self.hand_batch_idx].clone()
953
+ ray_cond = None
954
+
955
+ image_embeddings = self.backbone(x.type(self.backbone_dtype), extra_embed=ray_cond) # (B, C, H, W)
956
+
957
+ if isinstance(image_embeddings, tuple):
958
+ image_embeddings = image_embeddings[-1]
959
+ image_embeddings = image_embeddings.type(x.dtype)
960
+
961
+ # Mask condition if available
962
+ if self.cfg.MODEL.PROMPT_ENCODER.get("MASK_EMBED_TYPE", None) is not None:
963
+ # v1: non-iterative mask conditioning
964
+ if self.cfg.MODEL.PROMPT_ENCODER.get("MASK_PROMPT", "v1") == "v1":
965
+ mask_embeddings = self._get_mask_prompt(batch, image_embeddings)
966
+ image_embeddings = image_embeddings + mask_embeddings
967
+ else:
968
+ raise NotImplementedError
969
+
970
+ # Prepare input for promptable decoder
971
+ condition_info = self._get_decoder_condition(batch)
972
+
973
+ # Initial estimate with a dummy prompt
974
+ keypoints_prompt = torch.zeros((batch_size * num_person, 1, 3)).to(batch["img"])
975
+ keypoints_prompt[:, :, -1] = -2
976
+
977
+ # Forward promptable decoder to get updated pose tokens and regression output
978
+ pose_output, pose_output_hand = None, None
979
+ if len(self.body_batch_idx):
980
+ tokens_output, pose_output = self.forward_decoder(
981
+ image_embeddings[self.body_batch_idx],
982
+ init_estimate=None,
983
+ keypoints=keypoints_prompt[self.body_batch_idx],
984
+ prev_estimate=None,
985
+ condition_info=condition_info[self.body_batch_idx],
986
+ batch=batch,
987
+ )
988
+ pose_output = pose_output[-1]
989
+ if len(self.hand_batch_idx):
990
+ tokens_output_hand, pose_output_hand = self.forward_decoder_hand(
991
+ image_embeddings[self.hand_batch_idx],
992
+ init_estimate=None,
993
+ keypoints=keypoints_prompt[self.hand_batch_idx],
994
+ prev_estimate=None,
995
+ condition_info=condition_info[self.hand_batch_idx],
996
+ batch=batch,
997
+ )
998
+ pose_output_hand = pose_output_hand[-1]
999
+
1000
+ output = {
1001
+ # "pose_token": pose_token,
1002
+ "mhr": pose_output, # mhr prediction output
1003
+ "mhr_hand": pose_output_hand, # mhr prediction output
1004
+ "condition_info": condition_info,
1005
+ "image_embeddings": image_embeddings,
1006
+ }
1007
+
1008
+ if self.cfg.MODEL.DECODER.get("DO_HAND_DETECT_TOKENS", False):
1009
+ if len(self.body_batch_idx):
1010
+ output_hand_box_tokens = tokens_output
1011
+ hand_coords = self.bbox_embed(output_hand_box_tokens).sigmoid() # x1, y1, w, h for body samples, 0 ~ 1
1012
+ hand_logits = self.hand_cls_embed(output_hand_box_tokens)
1013
+
1014
+ output["mhr"]["hand_box"] = hand_coords
1015
+ output["mhr"]["hand_logits"] = hand_logits
1016
+
1017
+ if len(self.hand_batch_idx):
1018
+ output_hand_box_tokens_hand_batch = tokens_output_hand
1019
+
1020
+ hand_coords_hand_batch = self.bbox_embed(
1021
+ output_hand_box_tokens_hand_batch
1022
+ ).sigmoid() # x1, y1, w, h for hand samples
1023
+ hand_logits_hand_batch = self.hand_cls_embed(output_hand_box_tokens_hand_batch)
1024
+
1025
+ output["mhr_hand"]["hand_box"] = hand_coords_hand_batch
1026
+ output["mhr_hand"]["hand_logits"] = hand_logits_hand_batch
1027
+
1028
+ return output
1029
+
1030
+ def forward_step(self, batch: dict, decoder_type: str = "body") -> dict:
1031
+ batch_size, num_person = batch["img"].shape[:2]
1032
+
1033
+ if decoder_type == "body":
1034
+ self.hand_batch_idx = []
1035
+ self.body_batch_idx = list(range(batch_size * num_person))
1036
+ elif decoder_type == "hand":
1037
+ self.hand_batch_idx = list(range(batch_size * num_person))
1038
+ self.body_batch_idx = []
1039
+ else:
1040
+ ValueError("Invalid decoder type: ", decoder_type)
1041
+
1042
+ # Crop-image (pose) branch
1043
+ pose_output = self.forward_pose_branch(batch)
1044
+
1045
+ return pose_output
1046
+
1047
+ def run_inference(
1048
+ self,
1049
+ img,
1050
+ batch: dict,
1051
+ inference_type: str = "full",
1052
+ transform_hand: Any = None,
1053
+ thresh_wrist_angle=1.4,
1054
+ ):
1055
+ """
1056
+ Run 3DB inference (optionally with hand detector).
1057
+
1058
+ inference_type:
1059
+ - full: full-body inference with both body and hand decoders
1060
+ - body: inference with body decoder only (still full-body output)
1061
+ - hand: inference with hand decoder only (only hand output)
1062
+ """
1063
+
1064
+ height, width = img.shape[:2]
1065
+ cam_int = batch["cam_int"].clone()
1066
+
1067
+ if inference_type == "body":
1068
+ pose_output = self.forward_step(batch, decoder_type="body")
1069
+ return BodyPredContainer(pose_output=pose_output)
1070
+ elif inference_type == "hand":
1071
+ pose_output = self.forward_step(batch, decoder_type="hand")
1072
+ return BodyPredContainer(pose_output=pose_output)
1073
+ elif inference_type != "full":
1074
+ raise ValueError("Invalid inference type: ", inference_type)
1075
+
1076
+ # Step 1. For full-body inference, we first inference with the body decoder.
1077
+ pose_output = self.forward_step(batch, decoder_type="body")
1078
+ left_xyxy, right_xyxy = self._get_hand_box(pose_output, batch)
1079
+ ori_local_wrist_rotmat = roma.euler_to_rotmat(
1080
+ "XZY",
1081
+ pose_output["mhr"]["body_pose"][:, [41, 43, 42, 31, 33, 32]].unflatten(1, (2, 3)),
1082
+ )
1083
+
1084
+ # Step 2. Re-run with each hand
1085
+ ## Left... Flip image & box
1086
+ flipped_img = img[:, ::-1]
1087
+ tmp = left_xyxy.copy()
1088
+ left_xyxy[:, 0] = width - tmp[:, 2] - 1
1089
+ left_xyxy[:, 2] = width - tmp[:, 0] - 1
1090
+
1091
+ batch_lhand = prepare_batch(flipped_img, transform_hand, left_xyxy, cam_int=cam_int.clone())
1092
+ batch_lhand = recursive_to(batch_lhand, "cuda")
1093
+ lhand_output = self.forward_step(batch_lhand, decoder_type="hand")
1094
+
1095
+ # Unflip output
1096
+ ## Flip scale
1097
+ ### Get MHR values
1098
+ scale_r_hands_mean = self.head_pose.scale_mean[8].item()
1099
+ scale_l_hands_mean = self.head_pose.scale_mean[9].item()
1100
+ scale_r_hands_std = self.head_pose.scale_comps[8, 8].item()
1101
+ scale_l_hands_std = self.head_pose.scale_comps[9, 9].item()
1102
+ ### Apply
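+ # Annotation: de-standardize the (flipped) right-hand scale with the right-hand
+ # mean/std, then re-standardize it with the left-hand statistics, i.e.
+ # s_left = ((mean_r + std_r * s_right) - mean_l) / std_l.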
1103
+ lhand_output["mhr_hand"]["scale"][:, 9] = (
1104
+ (scale_r_hands_mean + scale_r_hands_std * lhand_output["mhr_hand"]["scale"][:, 8]) - scale_l_hands_mean
1105
+ ) / scale_l_hands_std
1106
+ ## Get the right hand global rotation, flip it, put it in as left.
1107
+ lhand_output["mhr_hand"]["joint_global_rots"][:, 78] = lhand_output["mhr_hand"]["joint_global_rots"][
1108
+ :, 42
1109
+ ].clone()
1110
+ lhand_output["mhr_hand"]["joint_global_rots"][:, 78, [1, 2], :] *= -1
1111
+ ### Flip hand pose
1112
+ lhand_output["mhr_hand"]["hand"][:, :54] = lhand_output["mhr_hand"]["hand"][:, 54:]
1113
+ ### Unflip box
1114
+ batch_lhand["bbox_center"][:, :, 0] = width - batch_lhand["bbox_center"][:, :, 0] - 1
1115
+
1116
+ ## Right...
1117
+ batch_rhand = prepare_batch(img, transform_hand, right_xyxy, cam_int=cam_int.clone())
1118
+ batch_rhand = recursive_to(batch_rhand, "cuda")
1119
+ rhand_output = self.forward_step(batch_rhand, decoder_type="hand")
1120
+
1121
+ # Step 3. replace hand pose estimation from the body decoder.
1122
+ ## CRITERIA 1: LOCAL WRIST POSE DIFFERENCE
1123
+ joint_rotations = pose_output["mhr"]["joint_global_rots"]
1124
+ ### Get lowarm
1125
+ lowarm_joint_idxs = torch.LongTensor([76, 40]).cuda() # left, right
1126
+ lowarm_joint_rotations = joint_rotations[:, lowarm_joint_idxs] # B x 2 x 3 x 3
1127
+ ### Get zero-wrist pose
1128
+ wrist_twist_joint_idxs = torch.LongTensor([77, 41]).cuda() # left, right
1129
+ wrist_zero_rot_pose = lowarm_joint_rotations @ self.head_pose.joint_rotation[wrist_twist_joint_idxs]
1130
+ ### Get globals from left & right
1131
+ left_joint_global_rots = lhand_output["mhr_hand"]["joint_global_rots"]
1132
+ right_joint_global_rots = rhand_output["mhr_hand"]["joint_global_rots"]
1133
+ pred_global_wrist_rotmat = torch.stack(
1134
+ [
1135
+ left_joint_global_rots[:, 78],
1136
+ right_joint_global_rots[:, 42],
1137
+ ],
1138
+ dim=1,
1139
+ )
1140
+ ### Get the local poses that lead to the wrist being pred_global_wrist_rotmat
1141
+ fused_local_wrist_rotmat = torch.einsum("kabc,kabd->kadc", pred_global_wrist_rotmat, wrist_zero_rot_pose)
1142
+ angle_difference = rotation_angle_difference(ori_local_wrist_rotmat, fused_local_wrist_rotmat) # per-hand angle between the B x 2 x 3 x 3 rotation matrices
1143
+ angle_difference_valid_mask = angle_difference < thresh_wrist_angle
1144
+
1145
+ ## CRITERIA 2: hand box size
1146
+ hand_box_size_thresh = 64
1147
+ hand_box_size_valid_mask = torch.stack(
1148
+ [
1149
+ (batch_lhand["bbox_scale"].flatten(0, 1) > hand_box_size_thresh).all(dim=1),
1150
+ (batch_rhand["bbox_scale"].flatten(0, 1) > hand_box_size_thresh).all(dim=1),
1151
+ ],
1152
+ dim=1,
1153
+ )
1154
+
1155
+ ## CRITERIA 3: all hand 2D KPS (including wrist) inside of box.
1156
+ hand_kps2d_thresh = 0.5
1157
+ hand_kps2d_valid_mask = torch.stack(
1158
+ [
1159
+ lhand_output["mhr_hand"]["pred_keypoints_2d_cropped"].abs().amax(dim=(1, 2)) < hand_kps2d_thresh,
1160
+ rhand_output["mhr_hand"]["pred_keypoints_2d_cropped"].abs().amax(dim=(1, 2)) < hand_kps2d_thresh,
1161
+ ],
1162
+ dim=1,
1163
+ )
1164
+
1165
+ ## CRITERIA 4: 2D wrist distance.
1166
+ hand_wrist_kps2d_thresh = 0.25
1167
+ kps_right_wrist_idx = 41
1168
+ kps_left_wrist_idx = 62
1169
+ right_kps_full = rhand_output["mhr_hand"]["pred_keypoints_2d"][:, [kps_right_wrist_idx]].clone()
1170
+ left_kps_full = lhand_output["mhr_hand"]["pred_keypoints_2d"][:, [kps_right_wrist_idx]].clone()
1171
+ left_kps_full[:, :, 0] = width - left_kps_full[:, :, 0] - 1 # Flip left hand
1172
+ body_right_kps_full = pose_output["mhr"]["pred_keypoints_2d"][:, [kps_right_wrist_idx]].clone()
1173
+ body_left_kps_full = pose_output["mhr"]["pred_keypoints_2d"][:, [kps_left_wrist_idx]].clone()
1174
+ right_kps_dist = (right_kps_full - body_right_kps_full).flatten(0, 1).norm(dim=-1) / batch_lhand[
1175
+ "bbox_scale"
1176
+ ].flatten(0, 1)[:, 0]
1177
+ left_kps_dist = (left_kps_full - body_left_kps_full).flatten(0, 1).norm(dim=-1) / batch_rhand[
1178
+ "bbox_scale"
1179
+ ].flatten(0, 1)[:, 0]
1180
+ hand_wrist_kps2d_valid_mask = torch.stack(
1181
+ [
1182
+ left_kps_dist < hand_wrist_kps2d_thresh,
1183
+ right_kps_dist < hand_wrist_kps2d_thresh,
1184
+ ],
1185
+ dim=1,
1186
+ )
1187
+ ## Left-right
1188
+ hand_valid_mask = (
1189
+ angle_difference_valid_mask & hand_box_size_valid_mask & hand_kps2d_valid_mask & hand_wrist_kps2d_valid_mask
1190
+ )
1191
+
1192
+ # Keypoint prompting with the body decoder.
1193
+ # We use the wrist location from the hand decoder and the elbow location
1194
+ # from the body decoder as prompts to get an updated body pose estimation.
1195
+ batch_size, num_person = batch["img"].shape[:2]
1196
+ self.hand_batch_idx = []
1197
+ self.body_batch_idx = list(range(batch_size * num_person))
1198
+
1199
+ ## Get right & left wrist keypoints from crops; full image. Each are B x 1 x 2
1200
+ kps_right_wrist_idx = 41
1201
+ kps_left_wrist_idx = 62
1202
+ right_kps_full = rhand_output["mhr_hand"]["pred_keypoints_2d"][:, [kps_right_wrist_idx]].clone()
1203
+ left_kps_full = lhand_output["mhr_hand"]["pred_keypoints_2d"][:, [kps_right_wrist_idx]].clone()
1204
+ left_kps_full[:, :, 0] = width - left_kps_full[:, :, 0] - 1 # Flip left hand
1205
+
1206
+ # Next, get them to crop-normalized space.
1207
+ right_kps_crop = self._full_to_crop(batch, right_kps_full)
1208
+ left_kps_crop = self._full_to_crop(batch, left_kps_full)
1209
+
1210
+ # Get right & left elbow keypoints from crops; full image. Each are B x 1 x 2
1211
+ kps_right_elbow_idx = 8
1212
+ kps_left_elbow_idx = 7
1213
+ right_kps_elbow_full = pose_output["mhr"]["pred_keypoints_2d"][:, [kps_right_elbow_idx]].clone()
1214
+ left_kps_elbow_full = pose_output["mhr"]["pred_keypoints_2d"][:, [kps_left_elbow_idx]].clone()
1215
+
1216
+ # Next, get them to crop-normalized space.
1217
+ right_kps_elbow_crop = self._full_to_crop(batch, right_kps_elbow_full)
1218
+ left_kps_elbow_crop = self._full_to_crop(batch, left_kps_elbow_full)
1219
+
1220
+ # Assemble them into keypoint prompts
1221
+ keypoint_prompt = torch.cat(
1222
+ [right_kps_crop, left_kps_crop, right_kps_elbow_crop, left_kps_elbow_crop],
1223
+ dim=1,
1224
+ )
1225
+ keypoint_prompt = torch.cat([keypoint_prompt, keypoint_prompt[..., [-1]]], dim=-1)
1226
+ keypoint_prompt[:, 0, -1] = kps_right_wrist_idx
1227
+ keypoint_prompt[:, 1, -1] = kps_left_wrist_idx
1228
+ keypoint_prompt[:, 2, -1] = kps_right_elbow_idx
1229
+ keypoint_prompt[:, 3, -1] = kps_left_elbow_idx
1230
+
1231
+ if keypoint_prompt.shape[0] > 1:
1232
+ # Replace invalid keypoints to dummy prompts
1233
+ invalid_prompt = (
1234
+ (keypoint_prompt[..., 0] < -0.5)
1235
+ | (keypoint_prompt[..., 0] > 0.5)
1236
+ | (keypoint_prompt[..., 1] < -0.5)
1237
+ | (keypoint_prompt[..., 1] > 0.5)
1238
+ | (~hand_valid_mask[..., [1, 0, 1, 0]])
1239
+ ).unsqueeze(-1)
1240
+ dummy_prompt = torch.zeros((1, 1, 3)).to(keypoint_prompt)
1241
+ dummy_prompt[:, :, -1] = -2
1242
+ keypoint_prompt[:, :, :2] = torch.clamp(
1243
+ keypoint_prompt[:, :, :2] + 0.5, min=0.0, max=1.0
1244
+ ) # [-0.5, 0.5] --> [0, 1]
1245
+ keypoint_prompt = torch.where(invalid_prompt, dummy_prompt, keypoint_prompt)
1246
+ else:
1247
+ # Only keep valid keypoints
1248
+ valid_keypoint = (
1249
+ torch.all(
1250
+ (keypoint_prompt[:, :, :2] > -0.5) & (keypoint_prompt[:, :, :2] < 0.5),
1251
+ dim=2,
1252
+ )
1253
+ & hand_valid_mask[..., [1, 0, 1, 0]]
1254
+ ).squeeze()
1255
+ keypoint_prompt = keypoint_prompt[:, valid_keypoint]
1256
+ keypoint_prompt[:, :, :2] = torch.clamp(
1257
+ keypoint_prompt[:, :, :2] + 0.5, min=0.0, max=1.0
1258
+ ) # [-0.5, 0.5] --> [0, 1]
1259
+
1260
+ if keypoint_prompt.numel() != 0:
1261
+ pose_output, _ = self.run_keypoint_prompt(batch, pose_output, keypoint_prompt)
1262
+
1263
+ ##############################################################################
1264
+
1265
+ # Drop in hand pose
1266
+ left_hand_pose_params = lhand_output["mhr_hand"]["hand"][:, :54]
1267
+ right_hand_pose_params = rhand_output["mhr_hand"]["hand"][:, 54:]
1268
+ updated_hand_pose = torch.cat([left_hand_pose_params, right_hand_pose_params], dim=1)
1269
+
1270
+ # Drop in hand scales
1271
+ updated_scale = pose_output["mhr"]["scale"].clone()
1272
+ updated_scale[:, 9] = lhand_output["mhr_hand"]["scale"][:, 9]
1273
+ updated_scale[:, 8] = rhand_output["mhr_hand"]["scale"][:, 8]
1274
+ updated_scale[:, 18:] = (
1275
+ lhand_output["mhr_hand"]["scale"][:, 18:] + rhand_output["mhr_hand"]["scale"][:, 18:]
1276
+ ) / 2
1277
+
1278
+ # Update hand shape
1279
+ updated_shape = pose_output["mhr"]["shape"].clone()
1280
+ updated_shape[:, 40:] = (
1281
+ lhand_output["mhr_hand"]["shape"][:, 40:] + rhand_output["mhr_hand"]["shape"][:, 40:]
1282
+ ) / 2
1283
+
1284
+ ############################ Doing IK ############################
1285
+
1286
+ # First, forward just FK
1287
+ joint_rotations = self.head_pose.mhr_forward(
1288
+ global_trans=pose_output["mhr"]["global_rot"] * 0,
1289
+ global_rot=pose_output["mhr"]["global_rot"],
1290
+ body_pose_params=pose_output["mhr"]["body_pose"],
1291
+ hand_pose_params=updated_hand_pose,
1292
+ scale_params=updated_scale,
1293
+ shape_params=updated_shape,
1294
+ expr_params=pose_output["mhr"]["face"],
1295
+ return_joint_rotations=True,
1296
+ )[1]
1297
+
1298
+ # Get lowarm
1299
+ lowarm_joint_idxs = torch.LongTensor([76, 40]).cuda() # left, right
1300
+ lowarm_joint_rotations = joint_rotations[:, lowarm_joint_idxs] # B x 2 x 3 x 3
1301
+
1302
+ # Get zero-wrist pose
1303
+ wrist_twist_joint_idxs = torch.LongTensor([77, 41]).cuda() # left, right
1304
+ wrist_zero_rot_pose = lowarm_joint_rotations @ self.head_pose.joint_rotation[wrist_twist_joint_idxs]
1305
+
1306
+ # Get globals from left & right
1307
+ left_joint_global_rots = lhand_output["mhr_hand"]["joint_global_rots"]
1308
+ right_joint_global_rots = rhand_output["mhr_hand"]["joint_global_rots"]
1309
+ pred_global_wrist_rotmat = torch.stack(
1310
+ [
1311
+ left_joint_global_rots[:, 78],
1312
+ right_joint_global_rots[:, 42],
1313
+ ],
1314
+ dim=1,
1315
+ )
1316
+
1317
+ # Now we want to get the local poses that lead to the wrist being pred_global_wrist_rotmat
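+ # Annotation: with R_global = R_zero @ R_local, the local wrist rotation is
+ # R_local = R_zero^T @ R_global; the einsum below computes exactly that, batched
+ # over (batch, hand).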
1318
+ fused_local_wrist_rotmat = torch.einsum("kabc,kabd->kadc", pred_global_wrist_rotmat, wrist_zero_rot_pose)
1319
+ wrist_xzy = fix_wrist_euler(roma.rotmat_to_euler("XZY", fused_local_wrist_rotmat))
1320
+
1321
+ # Put it in.
1322
+ angle_difference = rotation_angle_difference(ori_local_wrist_rotmat, fused_local_wrist_rotmat) # per-hand angle between the B x 2 x 3 x 3 rotation matrices
1323
+ valid_angle = angle_difference < thresh_wrist_angle
1324
+ valid_angle = valid_angle & hand_valid_mask
1325
+ valid_angle = valid_angle.unsqueeze(-1)
1326
+
1327
+ body_pose = pose_output["mhr"]["body_pose"][:, [41, 43, 42, 31, 33, 32]].unflatten(1, (2, 3))
1328
+ updated_body_pose = torch.where(valid_angle, wrist_xzy, body_pose)
1329
+ pose_output["mhr"]["body_pose"][:, [41, 43, 42, 31, 33, 32]] = updated_body_pose.flatten(1, 2)
1330
+
1331
+ hand_pose = pose_output["mhr"]["hand"].unflatten(1, (2, 54))
1332
+ pose_output["mhr"]["hand"] = torch.where(
1333
+ valid_angle, updated_hand_pose.unflatten(1, (2, 54)), hand_pose
1334
+ ).flatten(1, 2)
1335
+
1336
+ hand_scale = torch.stack(
1337
+ [pose_output["mhr"]["scale"][:, 9], pose_output["mhr"]["scale"][:, 8]],
1338
+ dim=1,
1339
+ )
1340
+ updated_hand_scale = torch.stack([updated_scale[:, 9], updated_scale[:, 8]], dim=1)
1341
+ masked_hand_scale = torch.where(valid_angle.squeeze(-1), updated_hand_scale, hand_scale)
1342
+ pose_output["mhr"]["scale"][:, 9] = masked_hand_scale[:, 0]
1343
+ pose_output["mhr"]["scale"][:, 8] = masked_hand_scale[:, 1]
1344
+
1345
+ # Replace shared shape and scale
1346
+ pose_output["mhr"]["scale"][:, 18:] = torch.where(
1347
+ valid_angle.squeeze(-1).sum(dim=1, keepdim=True) > 0,
1348
+ (
1349
+ lhand_output["mhr_hand"]["scale"][:, 18:] * valid_angle.squeeze(-1)[:, [0]]
1350
+ + rhand_output["mhr_hand"]["scale"][:, 18:] * valid_angle.squeeze(-1)[:, [1]]
1351
+ )
1352
+ / (valid_angle.squeeze(-1).sum(dim=1, keepdim=True) + 1e-8),
1353
+ pose_output["mhr"]["scale"][:, 18:],
1354
+ )
1355
+ pose_output["mhr"]["shape"][:, 40:] = torch.where(
1356
+ valid_angle.squeeze(-1).sum(dim=1, keepdim=True) > 0,
1357
+ (
1358
+ lhand_output["mhr_hand"]["shape"][:, 40:] * valid_angle.squeeze(-1)[:, [0]]
1359
+ + rhand_output["mhr_hand"]["shape"][:, 40:] * valid_angle.squeeze(-1)[:, [1]]
1360
+ )
1361
+ / (valid_angle.squeeze(-1).sum(dim=1, keepdim=True) + 1e-8),
1362
+ pose_output["mhr"]["shape"][:, 40:],
1363
+ )
1364
+
1365
+ ########################################################
1366
+
1367
+ # Re-run forward
1368
+ with torch.no_grad():
1369
+ verts, j3d, jcoords, mhr_model_params, joint_global_rots = self.head_pose.mhr_forward(
1370
+ global_trans=pose_output["mhr"]["global_rot"] * 0,
1371
+ global_rot=pose_output["mhr"]["global_rot"],
1372
+ body_pose_params=pose_output["mhr"]["body_pose"],
1373
+ hand_pose_params=pose_output["mhr"]["hand"],
1374
+ scale_params=pose_output["mhr"]["scale"],
1375
+ shape_params=pose_output["mhr"]["shape"],
1376
+ expr_params=pose_output["mhr"]["face"],
1377
+ return_keypoints=True,
1378
+ return_joint_coords=True,
1379
+ return_model_params=True,
1380
+ return_joint_rotations=True,
1381
+ )
1382
+ j3d = j3d[:, :70] # 308 --> 70 keypoints
1383
+ verts[..., [1, 2]] *= -1 # Camera system difference
1384
+ j3d[..., [1, 2]] *= -1 # Camera system difference
1385
+ jcoords[..., [1, 2]] *= -1
1386
+ pose_output["mhr"]["pred_keypoints_3d"] = j3d
1387
+ pose_output["mhr"]["pred_vertices"] = verts
1388
+ pose_output["mhr"]["pred_joint_coords"] = jcoords
1389
+ pose_output["mhr"]["pred_pose_raw"][...] = 0 # pred_pose_raw is not valid anymore
1390
+ pose_output["mhr"]["mhr_model_params"] = mhr_model_params
1391
+
1392
+ ########################################################
1393
+ # Project to 2D
1394
+ pred_keypoints_3d_proj = pose_output["mhr"]["pred_keypoints_3d"] + pose_output["mhr"]["pred_cam_t"][:, None, :]
1395
+ pred_keypoints_3d_proj[:, :, [0, 1]] *= pose_output["mhr"]["focal_length"][:, None, None]
1396
+ pred_keypoints_3d_proj[:, :, [0, 1]] = (
1397
+ pred_keypoints_3d_proj[:, :, [0, 1]]
1398
+ + torch.FloatTensor([width / 2, height / 2]).to(pred_keypoints_3d_proj)[None, None, :]
1399
+ * pred_keypoints_3d_proj[:, :, [2]]
1400
+ )
1401
+ pred_keypoints_3d_proj[:, :, :2] = pred_keypoints_3d_proj[:, :, :2] / pred_keypoints_3d_proj[:, :, [2]]
1402
+ pose_output["mhr"]["pred_keypoints_2d"] = pred_keypoints_3d_proj[:, :, :2]
1403
+
1404
+ return BodyPredContainer(
1405
+ pose_output=pose_output,
1406
+ batch_lhand=batch_lhand,
1407
+ batch_rhand=batch_rhand,
1408
+ lhand_output=lhand_output,
1409
+ rhand_output=rhand_output,
1410
+ )
1411
+
1412
+ def run_keypoint_prompt(self, batch, output, keypoint_prompt):
1413
+ image_embeddings = output["image_embeddings"]
1414
+ condition_info = output["condition_info"]
1415
+ pose_output = output["mhr"] # body-only output
1416
+ # Use previous estimate as initialization
1417
+ prev_estimate = torch.cat(
1418
+ [
1419
+ pose_output["pred_pose_raw"].detach(), # (B, 6)
1420
+ pose_output["shape"].detach(),
1421
+ pose_output["scale"].detach(),
1422
+ pose_output["hand"].detach(),
1423
+ pose_output["face"].detach(),
1424
+ ],
1425
+ dim=1,
1426
+ ).unsqueeze(dim=1)
1427
+ if hasattr(self, "init_camera"):
1428
+ prev_estimate = torch.cat(
1429
+ [prev_estimate, pose_output["pred_cam"].detach().unsqueeze(1)],
1430
+ dim=-1,
1431
+ )
1432
+
1433
+ tokens_output, pose_output = self.forward_decoder(
1434
+ image_embeddings,
1435
+ init_estimate=None, # not recurring previous estimate
1436
+ keypoints=keypoint_prompt,
1437
+ prev_estimate=prev_estimate,
1438
+ condition_info=condition_info,
1439
+ batch=batch,
1440
+ )
1441
+ pose_output = pose_output[-1]
1442
+
1443
+ output.update({"mhr": pose_output})
1444
+ return output, keypoint_prompt
1445
+
1446
+ def _get_hand_box(self, pose_output, batch):
1447
+ """Get hand bbox from the hand detector"""
1448
+ pred_left_hand_box = pose_output["mhr"]["hand_box"][:, 0].detach().cpu().numpy() * self.cfg.MODEL.IMAGE_SIZE[0]
1449
+ pred_right_hand_box = pose_output["mhr"]["hand_box"][:, 1].detach().cpu().numpy() * self.cfg.MODEL.IMAGE_SIZE[0]
1450
+
1451
+ # Change boxes into squares
1452
+ batch["left_center"] = pred_left_hand_box[:, :2]
1453
+ batch["left_scale"] = pred_left_hand_box[:, 2:].max(axis=1, keepdims=True).repeat(2, axis=1)
1454
+ batch["right_center"] = pred_right_hand_box[:, :2]
1455
+ batch["right_scale"] = pred_right_hand_box[:, 2:].max(axis=1, keepdims=True).repeat(2, axis=1)
1456
+
1457
+ # Crop to full. batch["affine_trans"] is full-to-crop, right application
1458
+ batch["left_scale"] = batch["left_scale"] / batch["affine_trans"][0, :, 0, 0].cpu().numpy()[:, None]
1459
+ batch["right_scale"] = batch["right_scale"] / batch["affine_trans"][0, :, 0, 0].cpu().numpy()[:, None]
1460
+ batch["left_center"] = (
1461
+ batch["left_center"] - batch["affine_trans"][0, :, [0, 1], [2, 2]].cpu().numpy()
1462
+ ) / batch["affine_trans"][0, :, 0, 0].cpu().numpy()[:, None]
1463
+ batch["right_center"] = (
1464
+ batch["right_center"] - batch["affine_trans"][0, :, [0, 1], [2, 2]].cpu().numpy()
1465
+ ) / batch["affine_trans"][0, :, 0, 0].cpu().numpy()[:, None]
1466
+
1467
+ left_xyxy = np.concatenate(
1468
+ [
1469
+ (batch["left_center"][:, 0] - batch["left_scale"][:, 0] * 1 / 2).reshape(-1, 1),
1470
+ (batch["left_center"][:, 1] - batch["left_scale"][:, 1] * 1 / 2).reshape(-1, 1),
1471
+ (batch["left_center"][:, 0] + batch["left_scale"][:, 0] * 1 / 2).reshape(-1, 1),
1472
+ (batch["left_center"][:, 1] + batch["left_scale"][:, 1] * 1 / 2).reshape(-1, 1),
1473
+ ],
1474
+ axis=1,
1475
+ )
1476
+ right_xyxy = np.concatenate(
1477
+ [
1478
+ (batch["right_center"][:, 0] - batch["right_scale"][:, 0] * 1 / 2).reshape(-1, 1),
1479
+ (batch["right_center"][:, 1] - batch["right_scale"][:, 1] * 1 / 2).reshape(-1, 1),
1480
+ (batch["right_center"][:, 0] + batch["right_scale"][:, 0] * 1 / 2).reshape(-1, 1),
1481
+ (batch["right_center"][:, 1] + batch["right_scale"][:, 1] * 1 / 2).reshape(-1, 1),
1482
+ ],
1483
+ axis=1,
1484
+ )
1485
+
1486
+ return left_xyxy, right_xyxy
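+ # Annotation: since affine_trans maps full-image to crop coordinates
+ # (p_crop = s * p_full + t), the inversion above is p_full = (p_crop - t) / s,
+ # applied to both the box centers and the (isotropic) box scales.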
1487
+
1488
+ def keypoint_token_update_fn(
1489
+ self,
1490
+ kps_emb_start_idx,
1491
+ image_embeddings,
1492
+ token_embeddings,
1493
+ token_augment,
1494
+ pose_output,
1495
+ layer_idx,
1496
+ ):
1497
+ # It's already after the last layer, we're done.
1498
+ if layer_idx == len(self.decoder.layers) - 1:
1499
+ return token_embeddings, token_augment, pose_output, layer_idx
1500
+
1501
+ # Clone
1502
+ token_embeddings = token_embeddings.clone()
1503
+ token_augment = token_augment.clone()
1504
+
1505
+ num_keypoints = self.keypoint_embedding.weight.shape[0]
1506
+
1507
+ # Get current 2D KPS predictions
1508
+ pred_keypoints_2d_cropped = pose_output["pred_keypoints_2d_cropped"].clone() # These are -0.5 ~ 0.5
1509
+ pred_keypoints_2d_depth = pose_output["pred_keypoints_2d_depth"].clone()
1510
+
1511
+ pred_keypoints_2d_cropped = pred_keypoints_2d_cropped[:, self.keypoint_embedding_idxs]
1512
+ pred_keypoints_2d_depth = pred_keypoints_2d_depth[:, self.keypoint_embedding_idxs]
1513
+
1514
+ # Get 2D KPS to be 0 ~ 1
1515
+ pred_keypoints_2d_cropped_01 = pred_keypoints_2d_cropped + 0.5
1516
+
1517
+ # Get a mask of those that are 1) beyond image boundaries or 2) behind the camera
1518
+ invalid_mask = (
1519
+ (pred_keypoints_2d_cropped_01[:, :, 0] < 0)
1520
+ | (pred_keypoints_2d_cropped_01[:, :, 0] > 1)
1521
+ | (pred_keypoints_2d_cropped_01[:, :, 1] < 0)
1522
+ | (pred_keypoints_2d_cropped_01[:, :, 1] > 1)
1523
+ | (pred_keypoints_2d_depth[:, :] < 1e-5)
1524
+ )
1525
+
1526
+ # Run them through the prompt encoder's pos emb function
1527
+ token_augment[:, kps_emb_start_idx : kps_emb_start_idx + num_keypoints, :] = self.keypoint_posemb_linear(
1528
+ pred_keypoints_2d_cropped
1529
+ ) * (~invalid_mask[:, :, None])
1530
+
1531
+ # Also maybe update token_embeddings with the grid sampled 2D feature.
1532
+ # Remember that pred_keypoints_2d_cropped are -0.5 ~ 0.5. We want -1 ~ 1
1533
+ # Sample points...
1534
+ ## Get sampling points
1535
+ pred_keypoints_2d_cropped_sample_points = pred_keypoints_2d_cropped * 2
1536
+ if self.cfg.MODEL.BACKBONE.TYPE in [
1537
+ "vit_hmr",
1538
+ "vit",
1539
+ "vit_b",
1540
+ "vit_l",
1541
+ "vit_hmr_512_384",
1542
+ ]:
1543
+ # Need to go from 256 x 256 coords to 256 x 192 (HW) because image_embeddings is 16x12
1544
+ # I.e., for x, a coordinate spanning -1 ~ 1 over the 256-wide square crop becomes -16/12 ~ 16/12 over the 192-wide feature grid (border points intentionally overflow and are zero-padded)
1545
+ pred_keypoints_2d_cropped_sample_points[:, :, 0] = (
1546
+ pred_keypoints_2d_cropped_sample_points[:, :, 0] / 12 * 16
1547
+ )
1548
+
1549
+ # Version 2 is projecting & bilinear sampling
1550
+ pred_keypoints_2d_cropped_feats = (
1551
+ F.grid_sample(
1552
+ image_embeddings,
1553
+ pred_keypoints_2d_cropped_sample_points[:, :, None, :], # -1 ~ 1, xy
1554
+ mode="bilinear",
1555
+ padding_mode="zeros",
1556
+ align_corners=False,
1557
+ )
1558
+ .squeeze(3)
1559
+ .permute(0, 2, 1)
1560
+ ) # B x kps x C
1561
+ # Zero out invalid locations...
1562
+ pred_keypoints_2d_cropped_feats = pred_keypoints_2d_cropped_feats * (~invalid_mask[:, :, None])
1563
+ # This is ADDING
1564
+ token_embeddings = token_embeddings.clone()
1565
+ token_embeddings[
1566
+ :,
1567
+ kps_emb_start_idx : kps_emb_start_idx + num_keypoints,
1568
+ :,
1569
+ ] += self.keypoint_feat_linear(pred_keypoints_2d_cropped_feats)
1570
+
1571
+ return token_embeddings, token_augment, pose_output, layer_idx
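+ # Annotation (illustrative sketch of the feature sampling above, hypothetical shapes):
+ # >>> feats = torch.randn(2, 1024, 16, 12)    # B x C x H x W image embeddings
+ # >>> pts = torch.rand(2, 70, 2) - 0.5        # 2D keypoints in [-0.5, 0.5], xy order
+ # >>> grid = (pts * 2)[:, :, None, :]         # B x K x 1 x 2 in [-1, 1]
+ # >>> out = F.grid_sample(feats, grid, mode="bilinear", padding_mode="zeros", align_corners=False)
+ # >>> out.squeeze(3).permute(0, 2, 1).shape   # -> torch.Size([2, 70, 1024])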
1572
+
1573
+ def keypoint3d_token_update_fn(
1574
+ self,
1575
+ kps3d_emb_start_idx,
1576
+ token_embeddings,
1577
+ token_augment,
1578
+ pose_output,
1579
+ layer_idx,
1580
+ ):
1581
+ # It's already after the last layer, we're done.
1582
+ if layer_idx == len(self.decoder.layers) - 1:
1583
+ return token_embeddings, token_augment, pose_output, layer_idx
1584
+
1585
+ num_keypoints3d = self.keypoint3d_embedding.weight.shape[0]
1586
+
1587
+ # Get current 3D kps predictions
1588
+ pred_keypoints_3d = pose_output["pred_keypoints_3d"].clone()
1589
+
1590
+ # Now, pelvis normalize
1591
+ pred_keypoints_3d = (
1592
+ pred_keypoints_3d
1593
+ - (pred_keypoints_3d[:, [self.pelvis_idx[0]], :] + pred_keypoints_3d[:, [self.pelvis_idx[1]], :]) / 2
1594
+ )
1595
+
1596
+ # Get the kps we care about, _after_ pelvis norm (just in case idxs shift)
1597
+ pred_keypoints_3d = pred_keypoints_3d[:, self.keypoint3d_embedding_idxs]
1598
+
1599
+ # Run through embedding MLP & put in
1600
+ token_augment = token_augment.clone()
1601
+ token_augment[
1602
+ :,
1603
+ kps3d_emb_start_idx : kps3d_emb_start_idx + num_keypoints3d,
1604
+ :,
1605
+ ] = self.keypoint3d_posemb_linear(pred_keypoints_3d)
1606
+
1607
+ return token_embeddings, token_augment, pose_output, layer_idx
1608
+
1609
+ def keypoint_token_update_fn_hand(
1610
+ self,
1611
+ kps_emb_start_idx,
1612
+ image_embeddings,
1613
+ token_embeddings,
1614
+ token_augment,
1615
+ pose_output,
1616
+ layer_idx,
1617
+ ):
1618
+ # It's already after the last layer, we're done.
1619
+ if layer_idx == len(self.decoder_hand.layers) - 1:
1620
+ return token_embeddings, token_augment, pose_output, layer_idx
1621
+
1622
+ # Clone
1623
+ token_embeddings = token_embeddings.clone()
1624
+ token_augment = token_augment.clone()
1625
+
1626
+ num_keypoints = self.keypoint_embedding_hand.weight.shape[0]
1627
+
1628
+ # Get current 2D KPS predictions
1629
+ pred_keypoints_2d_cropped = pose_output["pred_keypoints_2d_cropped"].clone() # These are -0.5 ~ 0.5
1630
+ pred_keypoints_2d_depth = pose_output["pred_keypoints_2d_depth"].clone()
1631
+
1632
+ pred_keypoints_2d_cropped = pred_keypoints_2d_cropped[:, self.keypoint_embedding_idxs_hand]
1633
+ pred_keypoints_2d_depth = pred_keypoints_2d_depth[:, self.keypoint_embedding_idxs_hand]
1634
+
1635
+ # Get 2D KPS to be 0 ~ 1
1636
+ pred_keypoints_2d_cropped_01 = pred_keypoints_2d_cropped + 0.5
1637
+
1638
+ # Get a mask of those that are 1) beyond image boundaries or 2) behind the camera
1639
+ invalid_mask = (
1640
+ (pred_keypoints_2d_cropped_01[:, :, 0] < 0)
1641
+ | (pred_keypoints_2d_cropped_01[:, :, 0] > 1)
1642
+ | (pred_keypoints_2d_cropped_01[:, :, 1] < 0)
1643
+ | (pred_keypoints_2d_cropped_01[:, :, 1] > 1)
1644
+ | (pred_keypoints_2d_depth[:, :] < 1e-5)
1645
+ )
1646
+
1647
+ # Run them through the prompt encoder's pos emb function
1648
+ token_augment[:, kps_emb_start_idx : kps_emb_start_idx + num_keypoints, :] = self.keypoint_posemb_linear_hand(
1649
+ pred_keypoints_2d_cropped
1650
+ ) * (~invalid_mask[:, :, None])
1651
+
1652
+ # Also maybe update token_embeddings with the grid sampled 2D feature.
1653
+ # Remember that pred_keypoints_2d_cropped are -0.5 ~ 0.5. We want -1 ~ 1
1654
+ # Sample points...
1655
+ ## Get sampling points
1656
+ pred_keypoints_2d_cropped_sample_points = pred_keypoints_2d_cropped * 2
1657
+ if self.cfg.MODEL.BACKBONE.TYPE in [
1658
+ "vit_hmr",
1659
+ "vit",
1660
+ "vit_b",
1661
+ "vit_l",
1662
+ "vit_hmr_512_384",
1663
+ ]:
1664
+ # Need to go from 256 x 256 coords to 256 x 192 (HW) because image_embeddings is 16x12
1665
+ # I.e., for x, a coordinate spanning -1 ~ 1 over the 256-wide square crop becomes -16/12 ~ 16/12 over the 192-wide feature grid (border points intentionally overflow and are zero-padded)
1666
+ pred_keypoints_2d_cropped_sample_points[:, :, 0] = (
1667
+ pred_keypoints_2d_cropped_sample_points[:, :, 0] / 12 * 16
1668
+ )
1669
+
1670
+ # Version 2 is projecting & bilinear sampling
1671
+ pred_keypoints_2d_cropped_feats = (
1672
+ F.grid_sample(
1673
+ image_embeddings,
1674
+ pred_keypoints_2d_cropped_sample_points[:, :, None, :], # -1 ~ 1, xy
1675
+ mode="bilinear",
1676
+ padding_mode="zeros",
1677
+ align_corners=False,
1678
+ )
1679
+ .squeeze(3)
1680
+ .permute(0, 2, 1)
1681
+ ) # B x kps x C
1682
+ # Zero out invalid locations...
1683
+ pred_keypoints_2d_cropped_feats = pred_keypoints_2d_cropped_feats * (~invalid_mask[:, :, None])
1684
+ # This is ADDING
1685
+ token_embeddings = token_embeddings.clone()
1686
+ token_embeddings[
1687
+ :,
1688
+ kps_emb_start_idx : kps_emb_start_idx + num_keypoints,
1689
+ :,
1690
+ ] += self.keypoint_feat_linear_hand(pred_keypoints_2d_cropped_feats)
1691
+
1692
+ return token_embeddings, token_augment, pose_output, layer_idx
1693
+
1694
+ def keypoint3d_token_update_fn_hand(
1695
+ self,
1696
+ kps3d_emb_start_idx,
1697
+ token_embeddings,
1698
+ token_augment,
1699
+ pose_output,
1700
+ layer_idx,
1701
+ ):
1702
+ # It's already after the last layer, we're done.
1703
+ if layer_idx == len(self.decoder_hand.layers) - 1:
1704
+ return token_embeddings, token_augment, pose_output, layer_idx
1705
+
1706
+ num_keypoints3d = self.keypoint3d_embedding_hand.weight.shape[0]
1707
+
1708
+ # Get current 3D kps predictions
1709
+ pred_keypoints_3d = pose_output["pred_keypoints_3d"].clone()
1710
+
1711
+ # Now, pelvis normalize
1712
+ pred_keypoints_3d = (
1713
+ pred_keypoints_3d
1714
+ - (pred_keypoints_3d[:, [self.pelvis_idx[0]], :] + pred_keypoints_3d[:, [self.pelvis_idx[1]], :]) / 2
1715
+ )
1716
+
1717
+ # Get the kps we care about, _after_ pelvis norm (just in case idxs shift)
1718
+ pred_keypoints_3d = pred_keypoints_3d[:, self.keypoint3d_embedding_idxs_hand]
1719
+
1720
+ # Run through embedding MLP & put in
1721
+ token_augment = token_augment.clone()
1722
+ token_augment[
1723
+ :,
1724
+ kps3d_emb_start_idx : kps3d_emb_start_idx + num_keypoints3d,
1725
+ :,
1726
+ ] = self.keypoint3d_posemb_linear_hand(pred_keypoints_3d)
1727
+
1728
+ return token_embeddings, token_augment, pose_output, layer_idx
src/sam3d_body/models/modules/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from .geometry_utils import (
4
+ aa_to_rotmat,
5
+ cam_crop_to_full,
6
+ focal_length_normalization,
7
+ get_focalLength_from_fieldOfView,
8
+ get_intrinsic_matrix,
9
+ inverse_perspective_projection,
10
+ log_depth,
11
+ perspective_projection,
12
+ rot6d_to_rotmat,
13
+ transform_points,
14
+ undo_focal_length_normalization,
15
+ undo_log_depth,
16
+ )
17
+
18
+ from .misc import to_2tuple, to_3tuple, to_4tuple, to_ntuple
src/sam3d_body/models/modules/camera_embed.py ADDED
@@ -0,0 +1,111 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import einops
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+
8
+ from sam3d_body.models.modules.transformer import LayerNorm2d
9
+ from torch import nn
10
+
11
+
12
+ class CameraEncoder(nn.Module):
13
+ def __init__(self, embed_dim, patch_size=14):
14
+ super().__init__()
15
+ self.patch_size = patch_size
16
+ self.embed_dim = embed_dim
17
+ self.camera = FourierPositionEncoding(n=3, num_bands=16, max_resolution=64)
18
+
19
+ self.conv = nn.Conv2d(embed_dim + 99, embed_dim, kernel_size=1, bias=False)
20
+ self.norm = LayerNorm2d(embed_dim)
21
+
22
+ def forward(self, img_embeddings, rays):
23
+ B, D, _h, _w = img_embeddings.shape
24
+
25
+ with torch.no_grad():
26
+ scale = 1 / self.patch_size
27
+ rays = F.interpolate(
28
+ rays,
29
+ scale_factor=(scale, scale),
30
+ mode="bilinear",
31
+ align_corners=False,
32
+ antialias=True,
33
+ )
34
+ rays = rays.permute(0, 2, 3, 1).contiguous() # [b, h, w, 2]
35
+ rays = torch.cat([rays, torch.ones_like(rays[..., :1])], dim=-1)
36
+ rays_embeddings = self.camera(
37
+ pos=rays.reshape(B, -1, 3)
38
+ ) # (bs, N, 99): rays fourier embedding
39
+ rays_embeddings = einops.rearrange(
40
+ rays_embeddings, "b (h w) c -> b c h w", h=_h, w=_w
41
+ ).contiguous()
42
+
43
+ z = torch.concat([img_embeddings, rays_embeddings], dim=1)
44
+ z = self.norm(self.conv(z))
45
+
46
+ return z
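+ # Annotation (illustrative usage, hypothetical sizes): with patch_size=14, a 224x224
+ # ray map is pooled to the 16x16 token grid, Fourier-encoded to 99 channels,
+ # concatenated with the image tokens, and projected back to embed_dim:
+ # >>> enc = CameraEncoder(embed_dim=1024, patch_size=14)
+ # >>> z = enc(torch.randn(1, 1024, 16, 16), torch.randn(1, 2, 224, 224))
+ # >>> z.shape  # -> torch.Size([1, 1024, 16, 16])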
47
+
48
+
49
+ class FourierPositionEncoding(nn.Module):
50
+ def __init__(self, n, num_bands, max_resolution):
51
+ """
52
+ Module that generates Fourier positional encodings - no learning involved
53
+ """
54
+ super().__init__()
55
+
56
+ self.num_bands = num_bands
57
+ self.max_resolution = [max_resolution] * n
58
+
59
+ @property
60
+ def channels(self):
61
+ """
62
+ Return the output dimension
63
+ """
64
+ num_dims = len(self.max_resolution)
65
+ encoding_size = self.num_bands * num_dims
66
+ encoding_size *= 2 # sin-cos
67
+ encoding_size += num_dims # concat
68
+
69
+ return encoding_size
70
+
71
+ def forward(self, pos):
72
+ """
73
+ Forward pass that takes rays as input and generates Fourier positional encodings
74
+ """
75
+ fourier_pos_enc = _generate_fourier_features(
76
+ pos, num_bands=self.num_bands, max_resolution=self.max_resolution
77
+ )
78
+ return fourier_pos_enc
79
+
80
+
81
+ def _generate_fourier_features(pos, num_bands, max_resolution):
82
+ """Generate fourier features from a given set of positions and frequencies"""
83
+ b, n = pos.shape[:2]
84
+ device = pos.device
85
+
86
+ # Linear frequency sampling
87
+ min_freq = 1.0
88
+ freq_bands = torch.stack(
89
+ [
90
+ torch.linspace(start=min_freq, end=res / 2, steps=num_bands, device=device)
91
+ for res in max_resolution
92
+ ],
93
+ dim=0,
94
+ )
95
+
96
+ # Stacking
97
+ per_pos_features = torch.stack(
98
+ [pos[i, :, :][:, :, None] * freq_bands[None, :, :] for i in range(b)], 0
99
+ )
100
+ per_pos_features = per_pos_features.reshape(b, n, -1)
101
+
102
+ # Sin-Cos
103
+ per_pos_features = torch.cat(
104
+ [torch.sin(np.pi * per_pos_features), torch.cos(np.pi * per_pos_features)],
105
+ dim=-1,
106
+ )
107
+
108
+ # Concat with initial pos
109
+ per_pos_features = torch.cat([pos, per_pos_features], dim=-1)
110
+
111
+ return per_pos_features
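For orientation, here is a minimal usage sketch for `FourierPositionEncoding` (not part of the commit; the import path is assumed from the repo's `src/` layout). It shows where the 99 extra channels consumed by `CameraEncoder`'s 1x1 conv come from: 3 ray coordinates x 16 bands x 2 (sin/cos), plus the 3 raw coordinates.

```python
# Illustrative sketch, not part of the commit; import path assumed from the src/ layout.
import torch

from sam3d_body.models.modules.camera_embed import FourierPositionEncoding

enc = FourierPositionEncoding(n=3, num_bands=16, max_resolution=64)
print(enc.channels)                # 3 * 16 * 2 + 3 = 99, matching the embed_dim + 99 conv above

rays = torch.randn(2, 37 * 37, 3)  # (B, N, 3) per-token ray coordinates
emb = enc(rays)
print(emb.shape)                   # torch.Size([2, 1369, 99])
```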
src/sam3d_body/models/modules/drop_path.py ADDED
@@ -0,0 +1,42 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
+ def drop_path(
8
+ x: torch.Tensor, drop_prob: float = 0.0, training: bool = False
9
+ ) -> torch.Tensor:
10
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of
11
+ residual blocks).
12
+
13
+ We follow the implementation
14
+ https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
15
+ """
16
+ if not training:
17
+ return x
18
+ keep_prob = 1 - drop_prob
19
+ # handle tensors with different dimensions, not just 4D tensors.
20
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1)
21
+ random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
22
+ output = x.div(keep_prob) * random_tensor.floor()
23
+ return output
24
+
25
+
26
+ class DropPath(nn.Module):
27
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of
28
+ residual blocks).
29
+
30
+ We follow the implementation
31
+ https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
32
+
33
+ Args:
34
+ drop_prob (float): Probability of the path to be zeroed. Default: 0.1
35
+ """
36
+
37
+ def __init__(self, drop_prob: float = 0.1):
38
+ super().__init__()
39
+ self.drop_prob = drop_prob
40
+
41
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
42
+ return drop_path(x, self.drop_prob, self.training)
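A quick sketch of the stochastic-depth behaviour (not part of the commit): during training whole samples are zeroed and the survivors are rescaled by 1/keep_prob, so the expected value is preserved; in eval mode the module is the identity.

```python
# Illustrative sketch, not part of the commit; import path assumed from the src/ layout.
import torch

from sam3d_body.models.modules.drop_path import DropPath

dp = DropPath(drop_prob=0.5)
x = torch.ones(8, 4, 16)

dp.eval()
assert torch.equal(dp(x), x)  # identity at inference time

dp.train()
y = dp(x)                     # each sample is either all zeros or scaled by 1 / 0.5 = 2
print(y[:, 0, 0])
```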
src/sam3d_body/models/modules/geometry_utils.py ADDED
@@ -0,0 +1,304 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from typing import Optional
4
+
5
+ import cv2
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch.nn import functional as F
10
+ from jaxtyping import Float
11
+
12
+
13
+ def cam_crop_to_full(cam_bbox, box_center, box_size, img_size, focal_length=5000.0):
14
+ # Convert cam_bbox to full image
15
+ img_w, img_h = img_size[:, 0], img_size[:, 1]
16
+ cx, cy, b = box_center[:, 0], box_center[:, 1], box_size
17
+ w_2, h_2 = img_w / 2.0, img_h / 2.0
18
+ bs = b * cam_bbox[:, 0] + 1e-9
19
+ if type(focal_length) is float:
20
+ focal_length = torch.ones_like(cam_bbox[:, 0]) * focal_length
21
+ tz = 2 * focal_length / bs
22
+ tx = (2 * (cx - w_2) / bs) + cam_bbox[:, 1]
23
+ ty = (2 * (cy - h_2) / bs) + cam_bbox[:, 2]
24
+ full_cam = torch.stack([tx, ty, tz], dim=-1)
25
+ return full_cam
26
+
27
+
28
+ def aa_to_rotmat(theta: torch.Tensor):
29
+ """
30
+ Convert axis-angle representation to rotation matrix.
31
+ Works by first converting it to a quaternion.
32
+ Args:
33
+ theta (torch.Tensor): Tensor of shape (B, 3) containing axis-angle representations.
34
+ Returns:
35
+ torch.Tensor: Corresponding rotation matrices with shape (B, 3, 3).
36
+
37
+ Alternatives:
38
+ import roma
39
+ y = roma.rotvec_to_rotmat(x)
40
+ """
41
+ norm = torch.norm(theta + 1e-8, p=2, dim=1)
42
+ angle = torch.unsqueeze(norm, -1)
43
+ normalized = torch.div(theta, angle)
44
+ angle = angle * 0.5
45
+ v_cos = torch.cos(angle)
46
+ v_sin = torch.sin(angle)
47
+ quat = torch.cat([v_cos, v_sin * normalized], dim=1)
48
+ return _quat_to_rotmat(quat)
49
+
50
+
51
+ def _quat_to_rotmat(quat: torch.Tensor) -> torch.Tensor:
52
+ """
53
+ Convert quaternion representation to rotation matrix.
54
+ Args:
55
+ quat (torch.Tensor) of shape (B, 4); 4 <===> (w, x, y, z).
56
+ Returns:
57
+ torch.Tensor: Corresponding rotation matrices with shape (B, 3, 3).
58
+ """
59
+ norm_quat = quat
60
+ norm_quat = norm_quat / norm_quat.norm(p=2, dim=1, keepdim=True)
61
+ w, x, y, z = norm_quat[:, 0], norm_quat[:, 1], norm_quat[:, 2], norm_quat[:, 3]
62
+
63
+ B = quat.size(0)
64
+
65
+ w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2)
66
+ wx, wy, wz = w * x, w * y, w * z
67
+ xy, xz, yz = x * y, x * z, y * z
68
+
69
+ rotMat = torch.stack(
70
+ [
71
+ w2 + x2 - y2 - z2,
72
+ 2 * xy - 2 * wz,
73
+ 2 * wy + 2 * xz,
74
+ 2 * wz + 2 * xy,
75
+ w2 - x2 + y2 - z2,
76
+ 2 * yz - 2 * wx,
77
+ 2 * xz - 2 * wy,
78
+ 2 * wx + 2 * yz,
79
+ w2 - x2 - y2 + z2,
80
+ ],
81
+ dim=1,
82
+ ).view(B, 3, 3)
83
+ return rotMat
84
+
85
+
86
+ def rot6d_to_rotmat(x: torch.Tensor) -> torch.Tensor:
87
+ """
88
+ Convert 6D rotation representation to 3x3 rotation matrix.
89
+ Based on Zhou et al., "On the Continuity of Rotation Representations in Neural Networks", CVPR 2019
90
+ Args:
91
+ x (torch.Tensor): (B,6) Batch of 6-D rotation representations.
92
+ Returns:
93
+ torch.Tensor: Batch of corresponding rotation matrices with shape (B,3,3).
94
+
95
+ Alternatives:
96
+ import roma
97
+ x = x.reshape(-1,2,3).permute(0, 2, 1).contiguous()
98
+ y = roma.special_gramschmidt(x)
99
+ """
100
+ x = x.reshape(-1, 2, 3).permute(0, 2, 1).contiguous()
101
+ a1 = x[:, :, 0]
102
+ a2 = x[:, :, 1]
103
+ b1 = F.normalize(a1)
104
+ b2 = F.normalize(a2 - torch.einsum("bi,bi->b", b1, a2).unsqueeze(-1) * b1)
105
+ b3 = torch.linalg.cross(b1, b2)
106
+ return torch.stack((b1, b2, b3), dim=-1)
107
+
108
+
109
+ def rotmat_to_rot6d(x: torch.Tensor) -> torch.Tensor:
110
+ """
111
+ Converts rotation matrices to 6D rotation representation by Zhou et al. [1]
112
+ by dropping the last row. Note that 6D representation is not unique.
113
+ Args:
114
+ x: batch of rotation matrices of size (B, 3, 3)
115
+
116
+ Returns:
117
+ 6D rotation representation, of size (B, 6)
118
+
119
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
120
+ On the Continuity of Rotation Representations in Neural Networks.
121
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
122
+ Retrieved from http://arxiv.org/abs/1812.07035
123
+ """
124
+ batch_dim = x.size()[:-2]
125
+ return x[..., :2, :].clone().reshape(batch_dim + (6,))
126
+
127
+
128
+ def rot_aa(aa: Float[np.ndarray, "3"], rot: float) -> Float[np.ndarray, "3"]:
129
+ """
130
+ Rotate axis angle parameters.
131
+ Args:
132
+ aa (np.array): Axis-angle vector of shape (3,).
133
+ rot (float): Rotation angle in degrees.
134
+ Returns:
135
+ np.array: Rotated axis-angle vector.
136
+ """
137
+ # pose parameters
138
+ R: Float[np.ndarray, "3 3"] = np.array(
139
+ [
140
+ [np.cos(np.deg2rad(-rot)), -np.sin(np.deg2rad(-rot)), 0],
141
+ [np.sin(np.deg2rad(-rot)), np.cos(np.deg2rad(-rot)), 0],
142
+ [0, 0, 1],
143
+ ],
144
+ dtype=np.float64,
145
+ )
146
+ # find the rotation of the body in camera frame
147
+ per_rdg: Float[np.ndarray, "3 3"]
148
+ per_rdg, _ = cv2.Rodrigues(aa)
149
+ # apply the global rotation to the global orientation
150
+ resrot: Float[np.ndarray, "3 3"]
151
+ resrot, _ = cv2.Rodrigues(np.dot(R, per_rdg))
152
+ aa_vec: Float[np.ndarray, "3"] = (resrot.T)[0]
153
+ return aa_vec.astype(np.float32)
154
+
155
+
156
+ def transform_points(
157
+ points: torch.Tensor,
158
+ translation: Optional[torch.Tensor] = None,
159
+ rotation: Optional[torch.Tensor] = None,
160
+ ) -> torch.Tensor:
161
+ """
162
+ Transform a set of 3D points given translation and rotation.
163
+ Args:
164
+ points (torch.Tensor): Tensor of shape (B, N, 3) containing the input 3D points.
165
+ translation (torch.Tensor): Tensor of shape (B, 3) containing the 3D camera translation.
166
+ rotation (torch.Tensor): Tensor of shape (B, 3, 3) containing the camera rotation.
167
+ Returns:
168
+ torch.Tensor: Tensor of shape (B, N, 3) containing the transformed points.
169
+ """
170
+ if rotation is not None:
171
+ points = torch.einsum("bij,bkj->bki", rotation, points)
172
+
173
+ if translation is not None:
174
+ points = points + translation.unsqueeze(1)
175
+
176
+ return points
177
+
178
+
179
+ def get_intrinsic_matrix(
180
+ focal_length: torch.Tensor, principle: torch.Tensor
181
+ ) -> torch.Tensor:
182
+ """
183
+ Populate intrinsic camera matrix K given focal length and principal point.
184
+ Args:
185
+ focal_length: Tensor of shape (2,)
186
+ principle: Tensor of shape (2,)
187
+ Returns:
188
+ Tensor of shape (3, 3)
189
+ """
190
+ if isinstance(focal_length, float):
191
+ fl_x = fl_y = focal_length
192
+ elif len(focal_length) == 1:
193
+ fl_x = fl_y = focal_length[0]
194
+ else:
195
+ fl_x, fl_y = focal_length[0], focal_length[1]
196
+ K = torch.eye(3)
197
+ K[0, 0] = fl_x
198
+ K[1, 1] = fl_y
199
+ K[0, -1] = principle[0]
200
+ K[1, -1] = principle[1]
201
+
202
+ return K
203
+
204
+
205
+ def perspective_projection(x, K):
206
+ """
207
+ Computes the perspective projection of a set of points, assuming the extrinsic params have already been applied
208
+ Args:
209
+ - x [bs,N,3]: 3D points
210
+ - K [bs,3,3]: Camera intrinsics params
211
+ """
212
+ # Apply perspective distortion
213
+ y = x / x[:, :, -1].unsqueeze(-1) # (bs, N, 3)
214
+
215
+ # Apply camera intrinsics
216
+ y = torch.einsum("bij,bkj->bki", K, y) # (bs, N, 3)
217
+
218
+ return y[:, :, :2]
219
+
220
+
221
+ def inverse_perspective_projection(points, K, distance):
222
+ """
223
+ Computes the inverse perspective projection of a set of points given an estimated distance.
224
+ Input:
225
+ points (bs, N, 2): 2D points
226
+ K (bs,3,3): camera intrinsics params
227
+ distance (bs, N, 1): distance in the 3D world
228
+ Similar to:
229
+ - pts_l_norm = cv2.undistortPoints(np.expand_dims(pts_l, axis=1), cameraMatrix=K_l, distCoeffs=None)
230
+ """
231
+ # Apply camera intrinsics
232
+ points = torch.cat([points, torch.ones_like(points[..., :1])], -1)
233
+ points = torch.einsum("bij,bkj->bki", torch.inverse(K), points)
234
+
235
+ # Apply perspective distortion
236
+ if distance is None:
237
+ return points
238
+ points = points * distance
239
+ return points
240
+
241
+
242
+ def get_cam_intrinsics(img_size, fov=55, p_x=None, p_y=None):
243
+ """Given image size, fov and principal point coordinates, return K the camera parameter matrix"""
244
+ K = np.eye(3)
245
+ # Get focal length.
246
+ focal = get_focalLength_from_fieldOfView(fov=fov, img_size=img_size)
247
+ K[0, 0], K[1, 1] = focal, focal
248
+
249
+ # Set principal point
250
+ if p_x is not None and p_y is not None:
251
+ K[0, -1], K[1, -1] = p_x * img_size, p_y * img_size
252
+ else:
253
+ K[0, -1], K[1, -1] = img_size // 2, img_size // 2
254
+
255
+ return K
256
+
257
+
258
+ def get_focalLength_from_fieldOfView(fov=60, img_size=512):
259
+ """
260
+ Compute the focal length of the camera lens by assuming a certain FOV for the entire image
261
+ Args:
262
+ - fov: float, expressed in degree
263
+ - img_size: int
264
+ Return:
265
+ focal: float
266
+ """
267
+ focal = img_size / (2 * np.tan(np.radians(fov) / 2))
268
+ return focal
269
+
270
+
271
+ def focal_length_normalization(x, f, fovn=60, img_size=448):
272
+ """
273
+ Section 3.1 of https://arxiv.org/pdf/1904.02028.pdf
274
+ E = (fn/f) * E' where E is 1/d
275
+ """
276
+ fn = get_focalLength_from_fieldOfView(fov=fovn, img_size=img_size)
277
+ y = x * (fn / f)
278
+ return y
279
+
280
+
281
+ def undo_focal_length_normalization(y, f, fovn=60, img_size=448):
282
+ """
283
+ Undo focal_length_normalization()
284
+ """
285
+ fn = get_focalLength_from_fieldOfView(fov=fovn, img_size=img_size)
286
+ x = y * (f / fn)
287
+ return x
288
+
289
+
290
+ EPS_LOG = 1e-10
291
+
292
+
293
+ def log_depth(x, eps=EPS_LOG):
294
+ """
295
+ Move depth to log space
296
+ """
297
+ return torch.log(x + eps)
298
+
299
+
300
+ def undo_log_depth(y, eps=EPS_LOG):
301
+ """
302
+ Undo log_depth()
303
+ """
304
+ return torch.exp(y) - eps
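To make the projection helpers concrete, a hedged round-trip sketch (not part of the commit): `inverse_perspective_projection` takes the per-point z-depth as `distance`, so projecting with `perspective_projection` and unprojecting with the original depths recovers the 3D points.

```python
# Illustrative sketch, not part of the commit; import path assumed from the src/ layout.
import torch

from sam3d_body.models.modules.geometry_utils import (
    get_intrinsic_matrix,
    inverse_perspective_projection,
    perspective_projection,
)

K = get_intrinsic_matrix(torch.tensor([1000.0, 1000.0]), torch.tensor([256.0, 256.0]))
K = K.unsqueeze(0)                                           # (1, 3, 3)

pts3d = torch.tensor([[[0.1, -0.2, 3.0], [0.4, 0.3, 5.0]]])  # (1, 2, 3)
pts2d = perspective_projection(pts3d, K)                     # (1, 2, 2) pixel coordinates
depth = pts3d[..., 2:]                                       # (1, 2, 1) z-depth
recon = inverse_perspective_projection(pts2d, K, depth)
assert torch.allclose(recon, pts3d, atol=1e-3)
```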
src/sam3d_body/models/modules/layer_scale.py ADDED
@@ -0,0 +1,45 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from typing import Union
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+
9
+ class LayerScale(nn.Module):
10
+ """LayerScale layer.
11
+
12
+ Args:
13
+ dim (int): Dimension of input features.
14
+ layer_scale_init_value (float or torch.Tensor): Init value of layer
15
+ scale. Defaults to 1e-5.
16
+ inplace (bool): Whether to do the
17
+ operation in-place. Defaults to False.
18
+ data_format (str): The input data format, could be 'channels_last'
19
+ or 'channels_first', representing (B, N, C) and
20
+ (B, C, H, W) format data respectively. Defaults to 'channels_last'.
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ dim: int,
26
+ layer_scale_init_value: Union[float, torch.Tensor] = 1e-5,
27
+ inplace: bool = False,
28
+ data_format: str = "channels_last",
29
+ ):
30
+ super().__init__()
31
+ assert data_format in (
32
+ "channels_last",
33
+ "channels_first",
34
+ ), "'data_format' could only be channels_last or channels_first."
35
+ self.inplace = inplace
36
+ self.data_format = data_format
37
+ self.weight = nn.Parameter(torch.ones(dim) * layer_scale_init_value)
38
+
39
+ def forward(self, x):
40
+ if self.data_format == "channels_first":
41
+ if self.inplace:
42
+ return x.mul_(self.weight.view(-1, 1, 1))
43
+ else:
44
+ return x * self.weight.view(-1, 1, 1)
45
+ return x.mul_(self.weight) if self.inplace else x * self.weight
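A short usage sketch for `LayerScale` (not part of the commit): the same module handles token-format and map-format features depending on `data_format`.

```python
# Illustrative sketch, not part of the commit; import path assumed from the src/ layout.
import torch

from sam3d_body.models.modules.layer_scale import LayerScale

ls_tokens = LayerScale(dim=256)                              # (B, N, C) input
ls_maps = LayerScale(dim=256, data_format="channels_first")  # (B, C, H, W) input

print(ls_tokens(torch.randn(2, 196, 256)).shape)   # torch.Size([2, 196, 256])
print(ls_maps(torch.randn(2, 256, 14, 14)).shape)  # torch.Size([2, 256, 14, 14])
```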
src/sam3d_body/models/modules/mhr_utils.py ADDED
@@ -0,0 +1,392 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import json
4
+ import math
5
+ import os.path as osp
6
+ import pickle
7
+
8
+ import cv2
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+
15
+
16
+ def rotation_angle_difference(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
17
+ """
18
+ Compute the angle difference (magnitude) between two batches of SO(3) rotation matrices.
19
+ Args:
20
+ A: Tensor of shape (*, 3, 3), batch of rotation matrices.
21
+ B: Tensor of shape (*, 3, 3), batch of rotation matrices.
22
+ Returns:
23
+ Tensor of shape (*,), angle differences in radians.
24
+ """
25
+ # Compute relative rotation matrix
26
+ R_rel = torch.matmul(A, B.transpose(-2, -1)) # (B, 3, 3)
27
+ # Compute trace of relative rotation
28
+ trace = R_rel[..., 0, 0] + R_rel[..., 1, 1] + R_rel[..., 2, 2] # (B,)
29
+ # Compute angle using the trace formula
30
+ cos_theta = (trace - 1) / 2
31
+ # Clamp for numerical stability
32
+ cos_theta_clamped = torch.clamp(cos_theta, -1.0, 1.0)
33
+ # Compute angle difference
34
+ angle = torch.acos(cos_theta_clamped)
35
+ return angle
36
+
37
+
38
+ def fix_wrist_euler(
39
+ wrist_xzy, limits_x=(-2.2, 1.0), limits_z=(-2.2, 1.5), limits_y=(-1.2, 1.5)
40
+ ):
41
+ """
42
+ wrist_xzy: B x 2 x 3 (X, Z, Y angles)
43
+ Returns: Fixed angles within joint limits
44
+ """
45
+ x, z, y = wrist_xzy[..., 0], wrist_xzy[..., 1], wrist_xzy[..., 2]
46
+
47
+ x_alt = torch.atan2(torch.sin(x + torch.pi), torch.cos(x + torch.pi))
48
+ z_alt = torch.atan2(torch.sin(-(z + torch.pi)), torch.cos(-(z + torch.pi)))
49
+ y_alt = torch.atan2(torch.sin(y + torch.pi), torch.cos(y + torch.pi))
50
+
51
+ # Calculate L2 violation distance
52
+ def calc_violation(val, limits):
53
+ below = torch.clamp(limits[0] - val, min=0.0)
54
+ above = torch.clamp(val - limits[1], min=0.0)
55
+ return below**2 + above**2
56
+
57
+ violation_orig = (
58
+ calc_violation(x, limits_x)
59
+ + calc_violation(z, limits_z)
60
+ + calc_violation(y, limits_y)
61
+ )
62
+
63
+ violation_alt = (
64
+ calc_violation(x_alt, limits_x)
65
+ + calc_violation(z_alt, limits_z)
66
+ + calc_violation(y_alt, limits_y)
67
+ )
68
+
69
+ # Use alternative where it has lower L2 violation
70
+ use_alt = violation_alt < violation_orig
71
+
72
+ # Stack alternative and apply mask
73
+ wrist_xzy_alt = torch.stack([x_alt, z_alt, y_alt], dim=-1)
74
+ result = torch.where(use_alt.unsqueeze(-1), wrist_xzy_alt, wrist_xzy)
75
+
76
+ return result
77
+
78
+
79
+ def batch6DFromXYZ(r, return_9D=False):
80
+ """
81
+ Generate a matrix representing a rotation defined by a XYZ-Euler
82
+ rotation.
83
+
84
+ Args:
85
+ r: ... x 3 rotation vectors
86
+
87
+ Returns:
88
+ ... x 6
89
+ """
90
+ rc = torch.cos(r)
91
+ rs = torch.sin(r)
92
+ cx = rc[..., 0]
93
+ cy = rc[..., 1]
94
+ cz = rc[..., 2]
95
+ sx = rs[..., 0]
96
+ sy = rs[..., 1]
97
+ sz = rs[..., 2]
98
+
99
+ result = torch.empty(list(r.shape[:-1]) + [3, 3], dtype=r.dtype).to(r.device)
100
+
101
+ result[..., 0, 0] = cy * cz
102
+ result[..., 0, 1] = -cx * sz + sx * sy * cz
103
+ result[..., 0, 2] = sx * sz + cx * sy * cz
104
+ result[..., 1, 0] = cy * sz
105
+ result[..., 1, 1] = cx * cz + sx * sy * sz
106
+ result[..., 1, 2] = -sx * cz + cx * sy * sz
107
+ result[..., 2, 0] = -sy
108
+ result[..., 2, 1] = sx * cy
109
+ result[..., 2, 2] = cx * cy
110
+
111
+ if not return_9D:
112
+ return torch.cat([result[..., :, 0], result[..., :, 1]], dim=-1)
113
+ else:
114
+ return result
115
+
116
+
117
+ # https://github.com/papagina/RotationContinuity/blob/758b0ce551c06372cab7022d4c0bdf331c89c696/shapenet/code/tools.py#L82
118
+ def batchXYZfrom6D(poses):
119
+ # Args: poses: ... x 6, where "6" is the combined first and second columns
120
+ # First, get the rotaiton matrix
121
+ x_raw = poses[..., :3]
122
+ y_raw = poses[..., 3:]
123
+
124
+ x = F.normalize(x_raw, dim=-1)
125
+ z = torch.cross(x, y_raw, dim=-1)
126
+ z = F.normalize(z, dim=-1)
127
+ y = torch.cross(z, x, dim=-1)
128
+
129
+ matrix = torch.stack([x, y, z], dim=-1) # ... x 3 x 3
130
+
131
+ # Now get it into euler
132
+ # https://github.com/papagina/RotationContinuity/blob/758b0ce551c06372cab7022d4c0bdf331c89c696/shapenet/code/tools.py#L412
133
+ sy = torch.sqrt(
134
+ matrix[..., 0, 0] * matrix[..., 0, 0] + matrix[..., 1, 0] * matrix[..., 1, 0]
135
+ )
136
+ singular = sy < 1e-6
137
+ singular = singular.float()
138
+
139
+ x = torch.atan2(matrix[..., 2, 1], matrix[..., 2, 2])
140
+ y = torch.atan2(-matrix[..., 2, 0], sy)
141
+ z = torch.atan2(matrix[..., 1, 0], matrix[..., 0, 0])
142
+
143
+ xs = torch.atan2(-matrix[..., 1, 2], matrix[..., 1, 1])
144
+ ys = torch.atan2(-matrix[..., 2, 0], sy)
145
+ zs = matrix[..., 1, 0] * 0
146
+
147
+ out_euler = torch.zeros_like(matrix[..., 0])
148
+ out_euler[..., 0] = x * (1 - singular) + xs * singular
149
+ out_euler[..., 1] = y * (1 - singular) + ys * singular
150
+ out_euler[..., 2] = z * (1 - singular) + zs * singular
151
+
152
+ return out_euler
153
+
154
+
155
+ def resize_image(image_array, scale_factor, interpolation=cv2.INTER_LINEAR):
156
+ new_height = int(image_array.shape[0] // scale_factor)
157
+ new_width = int(image_array.shape[1] // scale_factor)
158
+ resized_image = cv2.resize(
159
+ image_array, (new_width, new_height), interpolation=interpolation
160
+ )
161
+
162
+ return resized_image
163
+
164
+
165
+ def compact_cont_to_model_params_hand(hand_cont):
166
+ # These are ordered by joint, not model params ^^
167
+ assert hand_cont.shape[-1] == 54
168
+ hand_dofs_in_order = torch.tensor([3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 2, 3, 1, 1])
169
+ assert sum(hand_dofs_in_order) == 27
170
+ # Mask of 3DoFs into hand_cont
171
+ mask_cont_threedofs = torch.cat(
172
+ [torch.ones(2 * k).bool() * (k in [3]) for k in hand_dofs_in_order]
173
+ )
174
+ # Mask of 1DoFs (including 2DoF) into hand_cont
175
+ mask_cont_onedofs = torch.cat(
176
+ [torch.ones(2 * k).bool() * (k in [1, 2]) for k in hand_dofs_in_order]
177
+ )
178
+ # Mask of 3DoFs into hand_model_params
179
+ mask_model_params_threedofs = torch.cat(
180
+ [torch.ones(k).bool() * (k in [3]) for k in hand_dofs_in_order]
181
+ )
182
+ # Mask of 1DoFs (including 2DoF) into hand_model_params
183
+ mask_model_params_onedofs = torch.cat(
184
+ [torch.ones(k).bool() * (k in [1, 2]) for k in hand_dofs_in_order]
185
+ )
186
+
187
+ # Convert hand_cont to eulers
188
+ ## First for 3DoFs
189
+ hand_cont_threedofs = hand_cont[..., mask_cont_threedofs].unflatten(-1, (-1, 6))
190
+ hand_model_params_threedofs = batchXYZfrom6D(hand_cont_threedofs).flatten(-2, -1)
191
+ ## Next for 1DoFs
192
+ hand_cont_onedofs = hand_cont[..., mask_cont_onedofs].unflatten(
193
+ -1, (-1, 2)
194
+ ) # (sincos)
195
+ hand_model_params_onedofs = torch.atan2(
196
+ hand_cont_onedofs[..., -2], hand_cont_onedofs[..., -1]
197
+ )
198
+
199
+ # Finally, assemble into a 27-dim vector, ordered by joint, then XYZ.
200
+ hand_model_params = torch.zeros(*hand_cont.shape[:-1], 27).to(hand_cont)
201
+ hand_model_params[..., mask_model_params_threedofs] = hand_model_params_threedofs
202
+ hand_model_params[..., mask_model_params_onedofs] = hand_model_params_onedofs
203
+
204
+ return hand_model_params
205
+
206
+
207
+ def compact_model_params_to_cont_hand(hand_model_params):
208
+ # These are ordered by joint, not model params ^^
209
+ assert hand_model_params.shape[-1] == 27
210
+ hand_dofs_in_order = torch.tensor([3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 2, 3, 1, 1])
211
+ assert sum(hand_dofs_in_order) == 27
212
+ # Mask of 3DoFs into hand_cont
213
+ mask_cont_threedofs = torch.cat(
214
+ [torch.ones(2 * k).bool() * (k in [3]) for k in hand_dofs_in_order]
215
+ )
216
+ # Mask of 1DoFs (including 2DoF) into hand_cont
217
+ mask_cont_onedofs = torch.cat(
218
+ [torch.ones(2 * k).bool() * (k in [1, 2]) for k in hand_dofs_in_order]
219
+ )
220
+ # Mask of 3DoFs into hand_model_params
221
+ mask_model_params_threedofs = torch.cat(
222
+ [torch.ones(k).bool() * (k in [3]) for k in hand_dofs_in_order]
223
+ )
224
+ # Mask of 1DoFs (including 2DoF) into hand_model_params
225
+ mask_model_params_onedofs = torch.cat(
226
+ [torch.ones(k).bool() * (k in [1, 2]) for k in hand_dofs_in_order]
227
+ )
228
+
229
+ # Convert eulers to hand_cont hand_cont
230
+ ## First for 3DoFs
231
+ hand_model_params_threedofs = hand_model_params[
232
+ ..., mask_model_params_threedofs
233
+ ].unflatten(-1, (-1, 3))
234
+ hand_cont_threedofs = batch6DFromXYZ(hand_model_params_threedofs).flatten(-2, -1)
235
+ ## Next for 1DoFs
236
+ hand_model_params_onedofs = hand_model_params[..., mask_model_params_onedofs]
237
+ hand_cont_onedofs = torch.stack(
238
+ [hand_model_params_onedofs.sin(), hand_model_params_onedofs.cos()], dim=-1
239
+ ).flatten(-2, -1)
240
+
241
+ # Finally, assemble into a 27-dim vector, ordered by joint, then XYZ.
242
+ hand_cont = torch.zeros(*hand_model_params.shape[:-1], 54).to(hand_model_params)
243
+ hand_cont[..., mask_cont_threedofs] = hand_cont_threedofs
244
+ hand_cont[..., mask_cont_onedofs] = hand_cont_onedofs
245
+
246
+ return hand_cont
247
+
248
+
249
+ def batch9Dfrom6D(poses):
250
+ # Args: poses: ... x 6, where "6" is the combined first and second columns
251
+ # First, get the rotaiton matrix
252
+ x_raw = poses[..., :3]
253
+ y_raw = poses[..., 3:]
254
+
255
+ x = F.normalize(x_raw, dim=-1)
256
+ z = torch.cross(x, y_raw, dim=-1)
257
+ z = F.normalize(z, dim=-1)
258
+ y = torch.cross(z, x, dim=-1)
259
+
260
+ matrix = torch.stack([x, y, z], dim=-1).flatten(-2, -1) # ... x 3 x 3 -> x9
261
+
262
+ return matrix
263
+
264
+
265
+ def batch4Dfrom2D(poses):
266
+ # Args: poses: ... x 2, where "2" is sincos
267
+ poses_norm = F.normalize(poses, dim=-1)
268
+
269
+ poses_4d = torch.stack(
270
+ [
271
+ poses_norm[..., 1],
272
+ poses_norm[..., 0],
273
+ -poses_norm[..., 0],
274
+ poses_norm[..., 1],
275
+ ],
276
+ dim=-1,
277
+ ) # Flattened SO2.
278
+
279
+ return poses_4d # .... x 4
280
+
281
+
282
+ def compact_cont_to_rotmat_body(body_pose_cont, inflate_trans=False):
283
+ # fmt: off
284
+ all_param_3dof_rot_idxs = torch.LongTensor([(0, 2, 4), (6, 8, 10), (12, 13, 14), (15, 16, 17), (18, 19, 20), (21, 22, 23), (24, 25, 26), (27, 28, 29), (34, 35, 36), (37, 38, 39), (44, 45, 46), (53, 54, 55), (64, 65, 66), (85, 69, 73), (86, 70, 79), (87, 71, 82), (88, 72, 76), (91, 92, 93), (112, 96, 100), (113, 97, 106), (114, 98, 109), (115, 99, 103), (130, 131, 132)])
285
+ all_param_1dof_rot_idxs = torch.LongTensor([1, 3, 5, 7, 9, 11, 30, 31, 32, 33, 40, 41, 42, 43, 47, 48, 49, 50, 51, 52, 56, 57, 58, 59, 60, 61, 62, 63, 67, 68, 74, 75, 77, 78, 80, 81, 83, 84, 89, 90, 94, 95, 101, 102, 104, 105, 107, 108, 110, 111, 116, 117, 118, 119, 120, 121, 122, 123])
286
+ all_param_1dof_trans_idxs = torch.LongTensor([124, 125, 126, 127, 128, 129])
287
+ # fmt: on
288
+ num_3dof_angles = len(all_param_3dof_rot_idxs) * 3
289
+ num_1dof_angles = len(all_param_1dof_rot_idxs)
290
+ num_1dof_trans = len(all_param_1dof_trans_idxs)
291
+ assert body_pose_cont.shape[-1] == (
292
+ 2 * num_3dof_angles + 2 * num_1dof_angles + num_1dof_trans
293
+ )
294
+ # Get subsets
295
+ body_cont_3dofs = body_pose_cont[..., : 2 * num_3dof_angles]
296
+ body_cont_1dofs = body_pose_cont[
297
+ ..., 2 * num_3dof_angles : 2 * num_3dof_angles + 2 * num_1dof_angles
298
+ ]
299
+ body_cont_trans = body_pose_cont[..., 2 * num_3dof_angles + 2 * num_1dof_angles :]
300
+ # Convert conts to model params
301
+ ## First for 3dofs
302
+ body_cont_3dofs = body_cont_3dofs.unflatten(-1, (-1, 6))
303
+ body_rotmat_3dofs = batch9Dfrom6D(body_cont_3dofs).flatten(-2, -1)
304
+ ## Next for 1dofs
305
+ body_cont_1dofs = body_cont_1dofs.unflatten(-1, (-1, 2)) # (sincos)
306
+ body_rotmat_1dofs = batch4Dfrom2D(body_cont_1dofs).flatten(-2, -1)
307
+ if inflate_trans:
308
+ assert (
309
+ False
310
+ ), "This is left as a possibility to increase the space/contribution/supervision trans params gets compared to rots"
311
+ else:
312
+ ## Nothing to do for trans
313
+ body_rotmat_trans = body_cont_trans
314
+ # Put them together
315
+ body_rotmat_params = torch.cat(
316
+ [body_rotmat_3dofs, body_rotmat_1dofs, body_rotmat_trans], dim=-1
317
+ )
318
+ return body_rotmat_params
319
+
320
+
321
+ def compact_cont_to_model_params_body(body_pose_cont):
322
+ # fmt: off
323
+ all_param_3dof_rot_idxs = torch.LongTensor([(0, 2, 4), (6, 8, 10), (12, 13, 14), (15, 16, 17), (18, 19, 20), (21, 22, 23), (24, 25, 26), (27, 28, 29), (34, 35, 36), (37, 38, 39), (44, 45, 46), (53, 54, 55), (64, 65, 66), (85, 69, 73), (86, 70, 79), (87, 71, 82), (88, 72, 76), (91, 92, 93), (112, 96, 100), (113, 97, 106), (114, 98, 109), (115, 99, 103), (130, 131, 132)])
324
+ all_param_1dof_rot_idxs = torch.LongTensor([1, 3, 5, 7, 9, 11, 30, 31, 32, 33, 40, 41, 42, 43, 47, 48, 49, 50, 51, 52, 56, 57, 58, 59, 60, 61, 62, 63, 67, 68, 74, 75, 77, 78, 80, 81, 83, 84, 89, 90, 94, 95, 101, 102, 104, 105, 107, 108, 110, 111, 116, 117, 118, 119, 120, 121, 122, 123])
325
+ all_param_1dof_trans_idxs = torch.LongTensor([124, 125, 126, 127, 128, 129])
326
+ # fmt: on
327
+ num_3dof_angles = len(all_param_3dof_rot_idxs) * 3
328
+ num_1dof_angles = len(all_param_1dof_rot_idxs)
329
+ num_1dof_trans = len(all_param_1dof_trans_idxs)
330
+ assert body_pose_cont.shape[-1] == (
331
+ 2 * num_3dof_angles + 2 * num_1dof_angles + num_1dof_trans
332
+ )
333
+ # Get subsets
334
+ body_cont_3dofs = body_pose_cont[..., : 2 * num_3dof_angles]
335
+ body_cont_1dofs = body_pose_cont[
336
+ ..., 2 * num_3dof_angles : 2 * num_3dof_angles + 2 * num_1dof_angles
337
+ ]
338
+ body_cont_trans = body_pose_cont[..., 2 * num_3dof_angles + 2 * num_1dof_angles :]
339
+ # Convert conts to model params
340
+ ## First for 3dofs
341
+ body_cont_3dofs = body_cont_3dofs.unflatten(-1, (-1, 6))
342
+ body_params_3dofs = batchXYZfrom6D(body_cont_3dofs).flatten(-2, -1)
343
+ ## Next for 1dofs
344
+ body_cont_1dofs = body_cont_1dofs.unflatten(-1, (-1, 2)) # (sincos)
345
+ body_params_1dofs = torch.atan2(body_cont_1dofs[..., -2], body_cont_1dofs[..., -1])
346
+ ## Nothing to do for trans
347
+ body_params_trans = body_cont_trans
348
+ # Put them together
349
+ body_pose_params = torch.zeros(*body_pose_cont.shape[:-1], 133).to(body_pose_cont)
350
+ body_pose_params[..., all_param_3dof_rot_idxs.flatten()] = body_params_3dofs
351
+ body_pose_params[..., all_param_1dof_rot_idxs] = body_params_1dofs
352
+ body_pose_params[..., all_param_1dof_trans_idxs] = body_params_trans
353
+ return body_pose_params
354
+
355
+
356
+ def compact_model_params_to_cont_body(body_pose_params):
357
+ # fmt: off
358
+ all_param_3dof_rot_idxs = torch.LongTensor([(0, 2, 4), (6, 8, 10), (12, 13, 14), (15, 16, 17), (18, 19, 20), (21, 22, 23), (24, 25, 26), (27, 28, 29), (34, 35, 36), (37, 38, 39), (44, 45, 46), (53, 54, 55), (64, 65, 66), (85, 69, 73), (86, 70, 79), (87, 71, 82), (88, 72, 76), (91, 92, 93), (112, 96, 100), (113, 97, 106), (114, 98, 109), (115, 99, 103), (130, 131, 132)])
359
+ all_param_1dof_rot_idxs = torch.LongTensor([1, 3, 5, 7, 9, 11, 30, 31, 32, 33, 40, 41, 42, 43, 47, 48, 49, 50, 51, 52, 56, 57, 58, 59, 60, 61, 62, 63, 67, 68, 74, 75, 77, 78, 80, 81, 83, 84, 89, 90, 94, 95, 101, 102, 104, 105, 107, 108, 110, 111, 116, 117, 118, 119, 120, 121, 122, 123])
360
+ all_param_1dof_trans_idxs = torch.LongTensor([124, 125, 126, 127, 128, 129])
361
+ # fmt: on
362
+ num_3dof_angles = len(all_param_3dof_rot_idxs) * 3
363
+ num_1dof_angles = len(all_param_1dof_rot_idxs)
364
+ num_1dof_trans = len(all_param_1dof_trans_idxs)
365
+ assert body_pose_params.shape[-1] == (
366
+ num_3dof_angles + num_1dof_angles + num_1dof_trans
367
+ )
368
+ # Take out params
369
+ body_params_3dofs = body_pose_params[..., all_param_3dof_rot_idxs.flatten()]
370
+ body_params_1dofs = body_pose_params[..., all_param_1dof_rot_idxs]
371
+ body_params_trans = body_pose_params[..., all_param_1dof_trans_idxs]
372
+ # params to cont
373
+ body_cont_3dofs = batch6DFromXYZ(body_params_3dofs.unflatten(-1, (-1, 3))).flatten(
374
+ -2, -1
375
+ )
376
+ body_cont_1dofs = torch.stack(
377
+ [body_params_1dofs.sin(), body_params_1dofs.cos()], dim=-1
378
+ ).flatten(-2, -1)
379
+ body_cont_trans = body_params_trans
380
+ # Put them together
381
+ body_pose_cont = torch.cat(
382
+ [body_cont_3dofs, body_cont_1dofs, body_cont_trans], dim=-1
383
+ )
384
+ return body_pose_cont
385
+
386
+
387
+ # fmt: off
388
+ mhr_param_hand_idxs = [62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115]
389
+ mhr_cont_hand_idxs = [72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237]
390
+ mhr_param_hand_mask = torch.zeros(133).bool(); mhr_param_hand_mask[mhr_param_hand_idxs] = True
391
+ mhr_cont_hand_mask = torch.zeros(260).bool(); mhr_cont_hand_mask[mhr_cont_hand_idxs] = True
392
+ # fmt: on
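To illustrate the hand parameterisation helpers (hedged sketch, not part of the commit): the 27 model parameters (Euler angles for 3-DoF joints, single angles otherwise) and the 54-dim continuous representation (6D per 3-DoF joint, sin/cos otherwise) round-trip through the two conversion functions for angles in the non-singular range, as in this small-angle example.

```python
# Illustrative sketch, not part of the commit; import path assumed from the src/ layout.
import torch

from sam3d_body.models.modules.mhr_utils import (
    compact_cont_to_model_params_hand,
    compact_model_params_to_cont_hand,
)

hand_params = torch.zeros(2, 27).uniform_(-0.3, 0.3)        # (B, 27) joint angles
hand_cont = compact_model_params_to_cont_hand(hand_params)  # (B, 54) continuous encoding
recon = compact_cont_to_model_params_hand(hand_cont)        # (B, 27) back to angles
assert torch.allclose(recon, hand_params, atol=1e-4)
```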
src/sam3d_body/models/modules/misc.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import collections.abc
4
+ from itertools import repeat
5
+
6
+
7
+ # From PyTorch internals
8
+ def _ntuple(n):
9
+ """A `to_tuple` function generator.
10
+
11
+ It returns a function, this function will repeat the input to a tuple of
12
+ length ``n`` if the input is not an Iterable object, otherwise, return the
13
+ input directly.
14
+
15
+ Args:
16
+ n (int): The number of the target length.
17
+ """
18
+
19
+ def parse(x):
20
+ if isinstance(x, collections.abc.Iterable):
21
+ return x
22
+ return tuple(repeat(x, n))
23
+
24
+ return parse
25
+
26
+
27
+ to_1tuple = _ntuple(1)
28
+ to_2tuple = _ntuple(2)
29
+ to_3tuple = _ntuple(3)
30
+ to_4tuple = _ntuple(4)
31
+ to_ntuple = _ntuple
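A one-line illustration of the tuple helpers (not part of the commit):

```python
# Illustrative sketch, not part of the commit; import path assumed from the src/ layout.
from sam3d_body.models.modules.misc import to_2tuple

print(to_2tuple(14))        # (14, 14) -- scalars are repeated
print(to_2tuple((16, 12)))  # (16, 12) -- iterables pass through unchanged
```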
src/sam3d_body/models/modules/swiglu_ffn.py ADDED
@@ -0,0 +1,96 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from typing import Optional
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from .drop_path import DropPath
10
+
11
+ from .layer_scale import LayerScale
12
+
13
+
14
+ class SwiGLUFFN(nn.Module):
15
+ """SwiGLU FFN layer.
16
+
17
+ Modified from https://github.com/facebookresearch/dinov2/blob/main/dinov2/layers/swiglu_ffn.py
18
+ """ # noqa
19
+
20
+ def __init__(
21
+ self,
22
+ embed_dims: int,
23
+ feedforward_channels: Optional[int] = None,
24
+ out_dims: Optional[int] = None,
25
+ layer_scale_init_value: float = 0.0,
26
+ bias: bool = True,
27
+ drop_path_rate: float = 0.0,
28
+ norm_layer: nn.Module = nn.LayerNorm,
29
+ add_identity: bool = True,
30
+ ) -> None:
31
+ super().__init__()
32
+ self.embed_dims = embed_dims
33
+ self.out_dims = out_dims or embed_dims
34
+ hidden_dims = feedforward_channels or embed_dims
35
+
36
+ self.w12 = nn.Linear(self.embed_dims, 2 * hidden_dims, bias=bias)
37
+
38
+ # norm_layer may be passed as a class (e.g. nn.LayerNorm) or an instance;
+ # instantiate it over the hidden width when a class is given.
+ self.norm = norm_layer(hidden_dims) if isinstance(norm_layer, type) else norm_layer
39
+
40
+ self.w3 = nn.Linear(hidden_dims, self.out_dims, bias=bias)
41
+
42
+ if layer_scale_init_value > 0:
43
+ self.gamma2 = LayerScale(
44
+ dim=embed_dims, layer_scale_init_value=layer_scale_init_value
45
+ )
46
+ else:
47
+ self.gamma2 = nn.Identity()
48
+
49
+ self.dropout_layer = DropPath(drop_path_rate)
50
+ self.add_identity = add_identity
51
+
52
+ def forward(
53
+ self, x: torch.Tensor, identity: Optional[torch.Tensor] = None
54
+ ) -> torch.Tensor:
55
+ x12 = self.w12(x)
56
+ x1, x2 = x12.chunk(2, dim=-1)
57
+ hidden = F.silu(x1) * x2
58
+ hidden = self.norm(hidden)
59
+ out = self.w3(hidden)
60
+ out = self.gamma2(out)
61
+ out = self.dropout_layer(out)
62
+
63
+ if self.out_dims != self.embed_dims or not self.add_identity:
64
+ # due to the dimension inconsistency or user setting
65
+ # not to apply residual operation
66
+ return out
67
+
68
+ if identity is None:
69
+ identity = x
70
+ return identity + out
71
+
72
+
73
+ class SwiGLUFFNFused(SwiGLUFFN):
74
+ """SwiGLU FFN layer with fusing.
75
+
76
+ Modified from https://github.com/facebookresearch/dinov2/blob/main/dinov2/layers/swiglu_ffn.py
77
+ """ # noqa
78
+
79
+ def __init__(
80
+ self,
81
+ embed_dims: int,
82
+ feedforward_channels: Optional[int] = None,
83
+ out_dims: Optional[int] = None,
84
+ layer_scale_init_value: float = 0.0,
85
+ bias: bool = True,
86
+ ) -> None:
87
+ out_dims = out_dims or embed_dims
88
+ feedforward_channels = feedforward_channels or embed_dims
89
+ feedforward_channels = (int(feedforward_channels * 2 / 3) + 7) // 8 * 8
90
+ super().__init__(
91
+ embed_dims=embed_dims,
92
+ feedforward_channels=feedforward_channels,
93
+ out_dims=out_dims,
94
+ layer_scale_init_value=layer_scale_init_value,
95
+ bias=bias,
96
+ )
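A hedged usage sketch for the SwiGLU FFN (not part of the commit); here the norm layer is passed as an `nn.LayerNorm` instantiated over the hidden width.

```python
# Illustrative sketch, not part of the commit; import path assumed from the src/ layout.
import torch
import torch.nn as nn

from sam3d_body.models.modules.swiglu_ffn import SwiGLUFFN

ffn = SwiGLUFFN(embed_dims=256, feedforward_channels=512, norm_layer=nn.LayerNorm(512))
x = torch.randn(2, 196, 256)
out = ffn(x)      # identity connection added since out_dims == embed_dims
print(out.shape)  # torch.Size([2, 196, 256])
```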
src/sam3d_body/models/modules/transformer.py ADDED
@@ -0,0 +1,651 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from typing import Dict, Optional
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from .drop_path import DropPath
10
+
11
+ from .layer_scale import LayerScale
12
+ from .swiglu_ffn import SwiGLUFFNFused
13
+
14
+
15
+ class MLP(nn.Module):
16
+ # borrowed from DETR
17
+ """Very simple multi-layer perceptron (also called FFN)"""
18
+
19
+ def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
20
+ super().__init__()
21
+ self.num_layers = num_layers
22
+ h = [hidden_dim] * (num_layers - 1)
23
+ self.layers = nn.ModuleList(
24
+ nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
25
+ )
26
+
27
+ def forward(self, x):
28
+ for i, layer in enumerate(self.layers):
29
+ x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
30
+ return x
31
+
32
+
33
+ class LayerNorm32(nn.LayerNorm):
34
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
35
+ return super().forward(x.float()).type(x.dtype)
36
+
37
+
38
+ def build_norm_layer(cfg: Dict, num_features: int):
39
+ """Build normalization layer.
40
+
41
+ Args:
42
+ cfg (dict): The norm layer config, which should contain:
43
+
44
+ - type (str): Layer type.
45
+ - layer args: Args needed to instantiate a norm layer.
46
+ - requires_grad (bool, optional): Whether stop gradient updates.
47
+ num_features (int): Number of input channels.
48
+
+ Returns:
+ nn.Module: The created norm layer (a ``LayerNorm32`` instance for
+ type ``"LN"``), with ``requires_grad`` set on its parameters
+ according to the config.
55
+ """
56
+ if not isinstance(cfg, dict):
57
+ raise TypeError("cfg must be a dict")
58
+ if "type" not in cfg:
59
+ raise KeyError('the cfg dict must contain the key "type"')
60
+ cfg_ = cfg.copy()
61
+
62
+ layer_type = cfg_.pop("type")
63
+ if layer_type == "LN":
64
+ norm_layer = LayerNorm32
65
+ else:
66
+ raise ValueError("Unsupported norm layer: ", layer_type)
67
+
68
+ requires_grad = cfg_.pop("requires_grad", True)
69
+ cfg_.setdefault("eps", 1e-5)
70
+ if norm_layer is not nn.GroupNorm:
71
+ layer = norm_layer(num_features, **cfg_)
72
+ if layer_type == "SyncBN" and hasattr(layer, "_specify_ddp_gpu_num"):
73
+ layer._specify_ddp_gpu_num(1)
74
+ else:
75
+ assert "num_groups" in cfg_
76
+ layer = norm_layer(num_channels=num_features, **cfg_)
77
+
78
+ for param in layer.parameters():
79
+ param.requires_grad = requires_grad
80
+
81
+ return layer
82
+
83
+
84
+ class LayerNorm2d(nn.Module):
85
+ def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
86
+ super().__init__()
87
+ self.weight = nn.Parameter(torch.ones(num_channels))
88
+ self.bias = nn.Parameter(torch.zeros(num_channels))
89
+ self.eps = eps
90
+
91
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
92
+ u = x.mean(1, keepdim=True)
93
+ s = (x - u).pow(2).mean(1, keepdim=True)
94
+ x = (x - u) / torch.sqrt(s + self.eps)
95
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
96
+ return x
97
+
98
+
99
+ class FFN(nn.Module):
100
+ """Implements feed-forward networks (FFNs) with identity connection.
101
+
102
+ Args:
103
+ embed_dims (int): The feature dimension. Same as
104
+ `MultiheadAttention`. Defaults: 256.
105
+ feedforward_channels (int): The hidden dimension of FFNs.
106
+ Defaults: 1024.
107
+ num_fcs (int, optional): The number of fully-connected layers in
108
+ FFNs. Default: 2.
109
+ act_layer (nn.Module, optional): The activation layer for FFNs.
110
+ Default: nn.ReLU
111
+ ffn_drop (float, optional): Probability of an element to be
112
+ zeroed in FFN. Default 0.0.
113
+ add_identity (bool, optional): Whether to add the
114
+ identity connection. Default: `True`.
115
+ drop_path_rate (float): Stochastic depth rate. Defaults to 0.
116
+ layer_scale_init_value (float): Initial value of scale factor in
117
+ LayerScale. Default: 1.0
118
+ """
119
+
120
+ # @deprecated_api_warning(
121
+ # {
122
+ # 'dropout': 'ffn_drop',
123
+ # 'add_residual': 'add_identity'
124
+ # },
125
+ # cls_name='FFN')
126
+ def __init__(
127
+ self,
128
+ embed_dims=256,
129
+ feedforward_channels=1024,
130
+ output_dims=None,
131
+ num_fcs=2,
132
+ act_layer=nn.ReLU,
133
+ ffn_drop=0.0,
134
+ drop_path_rate=0.0,
135
+ add_identity=True,
136
+ layer_scale_init_value=0.0,
137
+ ):
138
+ super().__init__()
139
+ self.embed_dims = embed_dims
140
+ self.feedforward_channels = feedforward_channels
141
+ self.output_dims = output_dims or embed_dims
142
+ self.num_fcs = num_fcs
143
+
144
+ layers = []
145
+ in_channels = embed_dims
146
+ for _ in range(num_fcs - 1):
147
+ layers.append(
148
+ nn.Sequential(
149
+ nn.Linear(in_channels, feedforward_channels),
150
+ act_layer(),
151
+ nn.Dropout(ffn_drop),
152
+ )
153
+ )
154
+ in_channels = feedforward_channels
155
+ layers.append(nn.Linear(in_channels, self.output_dims))
156
+ layers.append(nn.Dropout(ffn_drop))
157
+ self.layers = nn.Sequential(*layers)
158
+ self.dropout_layer = (
159
+ DropPath(drop_path_rate) if drop_path_rate > 0.0 else torch.nn.Identity()
160
+ )
161
+ self.add_identity = add_identity
162
+
163
+ if layer_scale_init_value > 0:
164
+ self.gamma2 = LayerScale(embed_dims, layer_scale_init_value=layer_scale_init_value)
165
+ else:
166
+ self.gamma2 = nn.Identity()
167
+
168
+ # @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN')
169
+ def forward(self, x, identity=None):
170
+ """Forward function for `FFN`.
171
+
172
+ The function would add x to the output tensor if residue is None.
173
+ """
174
+ out = self.layers(x)
175
+ out = self.gamma2(out)
176
+ if not self.add_identity:
177
+ return self.dropout_layer(out)
178
+ if identity is None:
179
+ identity = x
180
+ return identity + self.dropout_layer(out)
181
+
182
+
183
+ class MultiheadAttention(nn.Module):
184
+ """Multi-head Attention Module.
185
+
186
+ This module implements multi-head attention that supports different input
187
+ dims and embed dims. And it also supports a shortcut from ``value``, which
188
+ is useful if input dims is not the same with embed dims.
189
+
190
+ Args:
191
+ embed_dims (int): The embedding dimension.
192
+ num_heads (int): Parallel attention heads.
193
+ input_dims (int, optional): The input dimension, and if None,
194
+ use ``embed_dims``. Defaults to None.
195
+ attn_drop (float): Dropout rate of the dropout layer after the
196
+ attention calculation of query and key. Defaults to 0.
197
+ proj_drop (float): Dropout rate of the dropout layer after the
198
+ output projection. Defaults to 0.
199
+ drop_path_rate (float): Stochastic depth rate. Defaults to 0.
200
+ qkv_bias (bool): If True, add a learnable bias to q, k, v.
201
+ Defaults to True.
202
+ qk_scale (float, optional): Override default qk scale of
203
+ ``head_dim ** -0.5`` if set. Defaults to None.
204
+ proj_bias (bool): If True, add a learnable bias to output projection.
205
+ Defaults to True.
206
+ v_shortcut (bool): Add a shortcut from value to output. It's usually
207
+ used if ``input_dims`` is different from ``embed_dims``.
208
+ Defaults to False.
209
+ use_layer_scale (bool): Whether to use layer scale. Defaults to False.
210
+ layer_scale_init_value (float or torch.Tensor): Init value of layer
211
+ scale. Defaults to 0.
212
+ """
213
+
214
+ def __init__(
215
+ self,
216
+ embed_dims,
217
+ num_heads,
218
+ input_dims=None,
219
+ attn_drop=0.0,
220
+ proj_drop=0.0,
221
+ drop_path_rate=0.0,
222
+ qkv_bias=True,
223
+ proj_bias=True,
224
+ v_shortcut=False,
225
+ layer_scale_init_value=0.0,
226
+ ):
227
+ super().__init__()
228
+
229
+ self.input_dims = input_dims or embed_dims
230
+ self.embed_dims = embed_dims
231
+ self.num_heads = num_heads
232
+ self.v_shortcut = v_shortcut
233
+
234
+ self.head_dims = embed_dims // num_heads
235
+
236
+ self.qkv = nn.Linear(self.input_dims, embed_dims * 3, bias=qkv_bias)
237
+ self.attn_drop = attn_drop
238
+ self.proj = nn.Linear(embed_dims, embed_dims, bias=proj_bias)
239
+ self.proj_drop = nn.Dropout(proj_drop)
240
+
241
+ self.out_drop = DropPath(drop_path_rate)
242
+
243
+ if layer_scale_init_value > 0:
244
+ layer_scale_init_value = layer_scale_init_value or 1e-5
245
+ self.gamma1 = LayerScale(
246
+ embed_dims, layer_scale_init_value=layer_scale_init_value
247
+ )
248
+ else:
249
+ self.gamma1 = nn.Identity()
250
+
251
+ def forward(self, x):
252
+ B, N, _ = x.shape
253
+ qkv = (
254
+ self.qkv(x)
255
+ .reshape(B, N, 3, self.num_heads, self.head_dims)
256
+ .permute(2, 0, 3, 1, 4)
257
+ )
258
+ q, k, v = qkv[0], qkv[1], qkv[2]
259
+
260
+ attn_drop = self.attn_drop if self.training else 0.0
261
+ x = F.scaled_dot_product_attention(q, k, v, dropout_p=attn_drop)
262
+ x = x.transpose(1, 2).reshape(B, N, self.embed_dims)
263
+
264
+ x = self.proj(x)
265
+ x = self.out_drop(self.gamma1(self.proj_drop(x)))
266
+
267
+ if self.v_shortcut:
268
+ x = v.squeeze(1) + x
269
+ return x
270
+
271
+
272
+ class Attention(nn.Module):
273
+ """Multi-head Attention Module for both self and cross attention.
274
+
275
+ Support masking invalid elements for attention.
276
+
277
+ Args:
278
+ embed_dims (int): The embedding dimension.
279
+ num_heads (int): Parallel attention heads.
280
+ input_dims (int, optional): The input dimension, and if None,
281
+ use ``embed_dims``. Defaults to None.
282
+ attn_drop (float): Dropout rate of the dropout layer after the
283
+ attention calculation of query and key. Defaults to 0.
284
+ proj_drop (float): Dropout rate of the dropout layer after the
285
+ output projection. Defaults to 0.
286
+ drop_path_rate (float): Stochastic depth rate. Defaults to 0.
287
+ qkv_bias (bool): If True, add a learnable bias to q, k, v.
288
+ Defaults to True.
289
+ qk_scale (float, optional): Override default qk scale of
290
+ ``head_dim ** -0.5`` if set. Defaults to None.
291
+ proj_bias (bool): If True, add a learnable bias to output projection.
292
+ Defaults to True.
293
+ v_shortcut (bool): Add a shortcut from value to output. It's usually
294
+ used if ``input_dims`` is different from ``embed_dims``.
295
+ Defaults to False.
296
+ use_layer_scale (bool): Whether to use layer scale. Defaults to False.
297
+ layer_scale_init_value (float or torch.Tensor): Init value of layer
298
+ scale. Defaults to 0.
299
+ """
300
+
301
+ def __init__(
302
+ self,
303
+ embed_dims,
304
+ num_heads,
305
+ query_dims=None,
306
+ key_dims=None,
307
+ value_dims=None,
308
+ attn_drop=0.0,
309
+ proj_drop=0.0,
310
+ drop_path_rate=0.0,
311
+ qkv_bias=True,
312
+ proj_bias=True,
313
+ v_shortcut=False,
314
+ layer_scale_init_value=0.0,
315
+ ):
316
+ super().__init__()
317
+
318
+ self.query_dims = query_dims or embed_dims
319
+ self.key_dims = key_dims or embed_dims
320
+ self.value_dims = value_dims or embed_dims
321
+ self.embed_dims = embed_dims
322
+ self.num_heads = num_heads
323
+ self.v_shortcut = v_shortcut
324
+
325
+ self.head_dims = embed_dims // num_heads
326
+
327
+ self.q_proj = nn.Linear(self.query_dims, embed_dims, bias=qkv_bias)
328
+ self.k_proj = nn.Linear(self.key_dims, embed_dims, bias=qkv_bias)
329
+ self.v_proj = nn.Linear(self.value_dims, embed_dims, bias=qkv_bias)
330
+ self.attn_drop = attn_drop
331
+ self.proj = nn.Linear(embed_dims, self.query_dims, bias=proj_bias)
332
+ self.proj_drop = nn.Dropout(proj_drop)
333
+
334
+ self.out_drop = DropPath(drop_path_rate)
335
+
336
+ if layer_scale_init_value > 0:
337
+ layer_scale_init_value = layer_scale_init_value or 1e-5
338
+ self.gamma1 = LayerScale(
339
+ embed_dims, layer_scale_init_value=layer_scale_init_value
340
+ )
341
+ else:
342
+ self.gamma1 = nn.Identity()
343
+
344
+ def _separate_heads(self, x: torch.Tensor) -> torch.Tensor:
345
+ b, n, _ = x.shape
346
+ x = x.reshape(b, n, self.num_heads, self.head_dims)
347
+ return x.transpose(1, 2) # B x N_heads x N_tokens x C_per_head
348
+
349
+ def forward(
350
+ self,
351
+ q: torch.Tensor,
352
+ k: torch.Tensor,
353
+ v: torch.Tensor,
354
+ attn_mask: Optional[torch.Tensor] = None,
355
+ ):
356
+ B, N, _ = q.shape
357
+ q = self._separate_heads(self.q_proj(q))
358
+ k = self._separate_heads(self.k_proj(k))
359
+ v = self._separate_heads(self.v_proj(v))
360
+
361
+ attn_drop = self.attn_drop if self.training else 0.0
362
+ if attn_mask is not None:
363
+ attn_mask = attn_mask.unsqueeze(1).expand(-1, self.num_heads, -1, -1)
364
+
365
+ x = F.scaled_dot_product_attention(
366
+ q, k, v, attn_mask=attn_mask, dropout_p=attn_drop
367
+ )
368
+ x = x.transpose(1, 2).reshape(B, N, self.embed_dims)
369
+
370
+ x = self.proj(x)
371
+ x = self.out_drop(self.gamma1(self.proj_drop(x)))
372
+
373
+ if self.v_shortcut:
374
+ x = v.squeeze(1) + x
375
+ return x
376
+
377
+
378
+ class TransformerEncoderLayer(nn.Module):
379
+ """Implements one encoder layer in Vision Transformer.
380
+
381
+ Args:
382
+ embed_dims (int): The feature dimension
383
+ num_heads (int): Parallel attention heads
384
+ feedforward_channels (int): The hidden dimension for FFNs
385
+ layer_scale_init_value (float or torch.Tensor): Init value of layer
386
+ scale. Defaults to 0.
387
+ drop_rate (float): Probability of an element to be zeroed
388
+ after the feed forward layer. Defaults to 0.
389
+ attn_drop_rate (float): The drop out rate for attention output weights.
390
+ Defaults to 0.
391
+ drop_path_rate (float): Stochastic depth rate. Defaults to 0.
392
+ num_fcs (int): The number of fully-connected layers for FFNs.
393
+ Defaults to 2.
394
+ qkv_bias (bool): enable bias for qkv if True. Defaults to True.
395
+ ffn_type (str): Select the type of ffn layers. Defaults to 'origin'.
396
+ act_layer (nn.Module, optional): The activation layer for FFNs.
397
+ Default: nn.GELU
398
+ norm_cfg (dict): Config dict for normalization layer.
399
+ Defaults to ``dict(type='LN')``.
400
+ """
401
+
402
+ def __init__(
403
+ self,
404
+ embed_dims,
405
+ num_heads,
406
+ feedforward_channels,
407
+ layer_scale_init_value=0.0,
408
+ drop_rate=0.0,
409
+ attn_drop_rate=0.0,
410
+ drop_path_rate=0.0,
411
+ num_fcs=2,
412
+ qkv_bias=True,
413
+ ffn_type="origin",
414
+ act_layer=nn.GELU,
415
+ norm_cfg=dict(type="LN", eps=1e-6),
416
+ ):
417
+ super().__init__()
418
+
419
+ self.embed_dims = embed_dims
420
+
421
+ self.ln1 = build_norm_layer(norm_cfg, self.embed_dims)
422
+
423
+ self.attn = MultiheadAttention(
424
+ embed_dims=embed_dims,
425
+ num_heads=num_heads,
426
+ attn_drop=attn_drop_rate,
427
+ proj_drop=drop_rate,
428
+ drop_path_rate=drop_path_rate,
429
+ qkv_bias=qkv_bias,
430
+ layer_scale_init_value=layer_scale_init_value,
431
+ )
432
+
433
+ self.ln2 = build_norm_layer(norm_cfg, self.embed_dims)
434
+
435
+ if ffn_type == "origin":
436
+ self.ffn = FFN(
437
+ embed_dims=embed_dims,
438
+ feedforward_channels=feedforward_channels,
439
+ num_fcs=num_fcs,
440
+ ffn_drop=drop_rate,
441
+ drop_path_rate=drop_path_rate,
442
+ act_layer=act_layer,
443
+ layer_scale_init_value=layer_scale_init_value,
444
+ )
445
+ elif ffn_type == "swiglu_fused":
446
+ self.ffn = SwiGLUFFNFused(
447
+ embed_dims=embed_dims,
448
+ feedforward_channels=feedforward_channels,
449
+ layer_scale_init_value=layer_scale_init_value,
450
+ )
451
+ else:
452
+ raise NotImplementedError
453
+
454
+ @property
455
+ def norm1(self):
456
+ return self.ln1
457
+
458
+ @property
459
+ def norm2(self):
460
+ return self.ln2
461
+
462
+ def forward(self, x):
463
+ x = x + self.attn(self.ln1(x))
464
+ x = self.ffn(self.ln2(x), identity=x)
465
+ return x
466
+
467
+
468
+ class TransformerDecoderLayer(nn.Module):
469
+ """Implements one decoder layer in cross-attention Transformer.
470
+
471
+ Adapted from Segment Anything Model (SAM) implementation.
472
+
473
+ Args:
474
+ token_dims (int): The feature dimension of the decoder tokens
+ context_dims (int): The feature dimension of the context (image) embeddings
+ num_heads (int): Parallel attention heads
+ head_dims (int): The dimension of each attention head
+ mlp_dims (int): The hidden dimension for FFNs
477
+ layer_scale_init_value (float or torch.Tensor): Init value of layer
478
+ scale. Defaults to 0.
479
+ drop_rate (float): Probability of an element to be zeroed
480
+ after the feed forward layer. Defaults to 0.
481
+ attn_drop_rate (float): The drop out rate for attention output weights.
482
+ Defaults to 0.
483
+ drop_path_rate (float): Stochastic depth rate. Defaults to 0.
484
+ num_fcs (int): The number of fully-connected layers for FFNs.
485
+ Defaults to 2.
486
+ qkv_bias (bool): enable bias for qkv if True. Defaults to True.
487
+ ffn_type (str): Select the type of ffn layers. Defaults to 'origin'.
488
+ act_layer (nn.Module, optional): The activation layer for FFNs.
489
+ Default: nn.GELU
490
+ norm_cfg (dict): Config dict for normalization layer.
491
+ Defaults to ``dict(type='LN')``.
492
+ enable_twoway (bool): Whether to enable two-way Transformer (used in SAM).
493
+ repeat_pe (bool): Whether to re-add PE at each layer (used in SAM)
494
+ skip_first_pe (bool): Whether to skip adding the PE in this layer's self-attention block
495
+ """
496
+
497
+ def __init__(
498
+ self,
499
+ token_dims: int,
500
+ context_dims: int,
501
+ num_heads: int = 8,
502
+ head_dims: int = 64,
503
+ mlp_dims: int = 1024,
504
+ layer_scale_init_value: float = 0.0,
505
+ drop_rate: float = 0.0,
506
+ attn_drop_rate: float = 0.0,
507
+ drop_path_rate: float = 0.0,
508
+ ffn_type: str = "origin",
509
+ act_layer: type[nn.Module] | nn.Module = nn.GELU,
510
+ norm_cfg: Dict = dict(type="LN", eps=1e-6),
511
+ enable_twoway: bool = False,
512
+ repeat_pe: bool = False,
513
+ skip_first_pe: bool = False,
514
+ ):
515
+ super().__init__()
516
+ self.repeat_pe = repeat_pe
517
+ self.skip_first_pe = skip_first_pe
518
+ if self.repeat_pe:
519
+ self.ln_pe_1 = build_norm_layer(norm_cfg, token_dims)
520
+ self.ln_pe_2 = build_norm_layer(norm_cfg, context_dims)
521
+
522
+ self.ln1 = build_norm_layer(norm_cfg, token_dims)
523
+
524
+ self.self_attn = Attention(
525
+ embed_dims=num_heads * head_dims,
526
+ num_heads=num_heads,
527
+ query_dims=token_dims,
528
+ key_dims=token_dims,
529
+ value_dims=token_dims,
530
+ attn_drop=attn_drop_rate,
531
+ proj_drop=drop_rate,
532
+ drop_path_rate=drop_path_rate,
533
+ layer_scale_init_value=layer_scale_init_value,
534
+ )
535
+
536
+ self.ln2_1 = build_norm_layer(norm_cfg, token_dims)
537
+ self.ln2_2 = build_norm_layer(norm_cfg, context_dims)
538
+
539
+ self.cross_attn = Attention(
540
+ embed_dims=num_heads * head_dims,
541
+ num_heads=num_heads,
542
+ query_dims=token_dims,
543
+ key_dims=context_dims,
544
+ value_dims=context_dims,
545
+ attn_drop=attn_drop_rate,
546
+ proj_drop=drop_rate,
547
+ drop_path_rate=drop_path_rate,
548
+ layer_scale_init_value=layer_scale_init_value,
549
+ )
550
+
551
+ self.ln3 = build_norm_layer(norm_cfg, token_dims)
552
+
553
+ if ffn_type == "origin":
554
+ self.ffn = FFN(
555
+ embed_dims=token_dims,
556
+ feedforward_channels=mlp_dims,
557
+ ffn_drop=drop_rate,
558
+ drop_path_rate=drop_path_rate,
559
+ act_layer=act_layer,
560
+ layer_scale_init_value=layer_scale_init_value,
561
+ )
562
+ elif ffn_type == "swiglu_fused":
563
+ self.ffn = SwiGLUFFNFused(
564
+ embed_dims=token_dims,
565
+ feedforward_channels=mlp_dims,
566
+ layer_scale_init_value=layer_scale_init_value,
567
+ )
568
+ else:
569
+ raise NotImplementedError
570
+
571
+ self.enable_twoway = enable_twoway
572
+ if self.enable_twoway:
573
+ self.ln4_1 = build_norm_layer(norm_cfg, context_dims)
574
+ self.ln4_2 = build_norm_layer(norm_cfg, token_dims)
575
+
576
+ self.cross_attn_2 = Attention(
577
+ embed_dims=num_heads * head_dims,
578
+ num_heads=num_heads,
579
+ query_dims=context_dims,
580
+ key_dims=token_dims,
581
+ value_dims=token_dims,
582
+ attn_drop=attn_drop_rate,
583
+ proj_drop=drop_rate,
584
+ drop_path_rate=drop_path_rate,
585
+ layer_scale_init_value=layer_scale_init_value,
586
+ )
587
+
588
+ def forward(
589
+ self,
590
+ x: torch.Tensor,
591
+ context: torch.Tensor,
592
+ x_pe: Optional[torch.Tensor] = None,
593
+ context_pe: Optional[torch.Tensor] = None,
594
+ x_mask: Optional[torch.Tensor] = None,
595
+ ):
596
+ """
597
+ Args:
598
+ x: shape [B, N, C]
599
+ context: shape [B, N, C]
600
+ x_mask: shape [B, N]
601
+ """
602
+ if self.repeat_pe and context_pe is not None:
603
+ # LaPE: https://openaccess.thecvf.com/content/ICCV2023/papers/Yu_LaPE_Layer-adaptive_Position_Embedding_for_Vision_Transformers_with_Independent_Layer_ICCV_2023_paper.pdf
604
+ x_pe = self.ln_pe_1(x_pe)
605
+ context_pe = self.ln_pe_2(context_pe)
606
+
607
+ # Self attention block for tokens
608
+ if self.repeat_pe and not self.skip_first_pe and x_pe is not None:
609
+ q = k = self.ln1(x) + x_pe
610
+ v = self.ln1(x)
611
+ else:
612
+ q = k = v = self.ln1(x)
613
+
614
+ attn_mask = None
615
+ if x_mask is not None:
616
+ attn_mask = x_mask[:, :, None] @ x_mask[:, None, :]
617
+ # Set diagonal to 1 to prevent nan output
618
+ attn_mask.diagonal(dim1=1, dim2=2).fill_(1)
619
+ attn_mask = attn_mask > 0
620
+ x = x + self.self_attn(q=q, k=k, v=v, attn_mask=attn_mask)
621
+
622
+ # Cross attention block, tokens attending to image embedding
623
+ if self.repeat_pe and context_pe is not None:
624
+ q = self.ln2_1(x) + x_pe
625
+ k = self.ln2_2(context) + context_pe
626
+ v = self.ln2_2(context)
627
+ else:
628
+ q = self.ln2_1(x)
629
+ k = v = self.ln2_2(context)
630
+ x = x + self.cross_attn(q=q, k=k, v=v)
631
+
632
+ # MLP block
633
+ x = self.ffn(self.ln3(x), identity=x)
634
+
635
+ # (Optional) Cross attention block, image embeddings attending to tokens
636
+ if self.enable_twoway:
637
+ if self.repeat_pe and context_pe is not None:
638
+ q = self.ln4_1(context) + context_pe
639
+ k = self.ln4_2(x) + x_pe
640
+ v = self.ln4_2(x)
641
+ else:
642
+ q = self.ln4_1(context)
643
+ k = v = self.ln4_2(x)
644
+ attn_mask = (
645
+ (x_mask[:, None, :].repeat(1, context.shape[1], 1)) > 0
646
+ if x_mask is not None
647
+ else None
648
+ )
649
+ context = context + self.cross_attn_2(q=q, k=k, v=v, attn_mask=attn_mask)
650
+
651
+ return x, context
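Finally, a hedged sketch of driving one `TransformerDecoderLayer` (not part of the commit; dimensions are illustrative): prompt tokens self-attend, cross-attend into the image embedding, and with `enable_twoway=True` the image embedding attends back to the tokens.

```python
# Illustrative sketch, not part of the commit; import path assumed from the src/ layout.
import torch

from sam3d_body.models.modules.transformer import TransformerDecoderLayer

layer = TransformerDecoderLayer(
    token_dims=256,
    context_dims=1024,
    num_heads=8,
    head_dims=32,
    mlp_dims=1024,
    enable_twoway=True,
    repeat_pe=True,
)

tokens = torch.randn(2, 16, 256)         # e.g. prompt / keypoint tokens
context = torch.randn(2, 37 * 37, 1024)  # flattened image embedding
tokens_pe = torch.randn(2, 16, 256)
context_pe = torch.randn(2, 37 * 37, 1024)

tokens, context = layer(tokens, context, x_pe=tokens_pe, context_pe=context_pe)
print(tokens.shape, context.shape)       # (2, 16, 256) and (2, 1369, 1024)
```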