imw34531 committed on
Commit
87e21d1
1 Parent(s): c10cc8d

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +1 -0
  2. .github/workflows/bot-autolint.yaml +50 -0
  3. .github/workflows/ci.yaml +54 -0
  4. .gitignore +178 -0
  5. .pre-commit-config.yaml +62 -0
  6. CIs/add_license_all.sh +2 -0
  7. Dockerfile +20 -0
  8. LICENSE +117 -0
  9. README.md +231 -12
  10. app/app_sana.py +488 -0
  11. app/app_sana_multithread.py +565 -0
  12. app/safety_check.py +72 -0
  13. app/sana_pipeline.py +324 -0
  14. asset/Sana.jpg +3 -0
  15. asset/docs/metrics_toolkit.md +118 -0
  16. asset/example_data/00000000.txt +1 -0
  17. asset/examples.py +69 -0
  18. asset/model-incremental.jpg +0 -0
  19. asset/model_paths.txt +2 -0
  20. asset/samples.txt +125 -0
  21. asset/samples_mini.txt +10 -0
  22. configs/sana_app_config/Sana_1600M_app.yaml +107 -0
  23. configs/sana_app_config/Sana_600M_app.yaml +105 -0
  24. configs/sana_base.yaml +140 -0
  25. configs/sana_config/1024ms/Sana_1600M_img1024.yaml +109 -0
  26. configs/sana_config/1024ms/Sana_600M_img1024.yaml +105 -0
  27. configs/sana_config/512ms/Sana_1600M_img512.yaml +108 -0
  28. configs/sana_config/512ms/Sana_600M_img512.yaml +107 -0
  29. configs/sana_config/512ms/ci_Sana_600M_img512.yaml +107 -0
  30. configs/sana_config/512ms/sample_dataset.yaml +107 -0
  31. diffusion/__init__.py +9 -0
  32. diffusion/data/__init__.py +2 -0
  33. diffusion/data/builder.py +76 -0
  34. diffusion/data/datasets/__init__.py +3 -0
  35. diffusion/data/datasets/sana_data.py +467 -0
  36. diffusion/data/datasets/sana_data_multi_scale.py +265 -0
  37. diffusion/data/datasets/utils.py +506 -0
  38. diffusion/data/transforms.py +46 -0
  39. diffusion/data/wids/__init__.py +16 -0
  40. diffusion/data/wids/wids.py +1051 -0
  41. diffusion/data/wids/wids_dl.py +174 -0
  42. diffusion/data/wids/wids_lru.py +81 -0
  43. diffusion/data/wids/wids_mmtar.py +168 -0
  44. diffusion/data/wids/wids_specs.py +192 -0
  45. diffusion/data/wids/wids_tar.py +98 -0
  46. diffusion/dpm_solver.py +69 -0
  47. diffusion/flow_euler_sampler.py +74 -0
  48. diffusion/iddpm.py +76 -0
  49. diffusion/lcm_scheduler.py +457 -0
  50. diffusion/model/__init__.py +1 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ asset/Sana.jpg filter=lfs diff=lfs merge=lfs -text
.github/workflows/bot-autolint.yaml ADDED
@@ -0,0 +1,50 @@
+ name: Auto Lint (triggered by "auto lint" label)
+ on:
+   pull_request:
+     types:
+       - opened
+       - edited
+       - closed
+       - reopened
+       - synchronize
+       - labeled
+       - unlabeled
+ # run only one unit test for a branch / tag.
+ concurrency:
+   group: ci-lint-${{ github.ref }}
+   cancel-in-progress: true
+ jobs:
+   lint-by-label:
+     if: contains(github.event.pull_request.labels.*.name, 'lint wanted')
+     runs-on: ubuntu-latest
+     steps:
+       - name: Check out Git repository
+         uses: actions/checkout@v4
+         with:
+           token: ${{ secrets.PAT }}
+           ref: ${{ github.event.pull_request.head.ref }}
+       - name: Set up Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: '3.10'
+       - name: Test pre-commit hooks
+         continue-on-error: true
+         uses: pre-commit/action@v3.0.0 # sync with https://github.com/Efficient-Large-Model/VILA-Internal/blob/main/.github/workflows/pre-commit.yaml
+         with:
+           extra_args: --all-files
+       - name: Check if there are any changes
+         id: verify_diff
+         run: |
+           git diff --quiet . || echo "changed=true" >> $GITHUB_OUTPUT
+       - name: Commit files
+         if: steps.verify_diff.outputs.changed == 'true'
+         run: |
+           git config --local user.email "action@github.com"
+           git config --local user.name "GitHub Action"
+           git add .
+           git commit -m "[CI-Lint] Fix code style issues with pre-commit ${{ github.sha }}" -a
+           git push
+       - name: Remove label(s) after lint
+         uses: actions-ecosystem/action-remove-labels@v1
+         with:
+           labels: lint wanted
.github/workflows/ci.yaml ADDED
@@ -0,0 +1,54 @@
+ name: ci
+ on:
+   pull_request:
+   push:
+     branches: [main, feat/Sana-public, feat/Sana-public-for-NVLab]
+ concurrency:
+   group: ci-${{ github.workflow }}-${{ github.ref }}
+   cancel-in-progress: true
+ # if: ${{ github.repository == 'Efficient-Large-Model/Sana' }}
+ jobs:
+   pre-commit:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Check out Git repository
+         uses: actions/checkout@v4
+       - name: Set up Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: 3.10.10
+       - name: Test pre-commit hooks
+         uses: pre-commit/action@v3.0.1
+   tests-bash:
+     # needs: pre-commit
+     runs-on: self-hosted
+     steps:
+       - name: Check out Git repository
+         uses: actions/checkout@v4
+       - name: Set up Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: 3.10.10
+       - name: Set up the environment
+         run: |
+           bash environment_setup.sh
+       - name: Run tests with Slurm
+         run: |
+           sana-run --pty -m ci -J tests-bash bash tests/bash/entry.sh
+
+ # tests-python:
+ #   needs: pre-commit
+ #   runs-on: self-hosted
+ #   steps:
+ #     - name: Check out Git repository
+ #       uses: actions/checkout@v4
+ #     - name: Set up Python
+ #       uses: actions/setup-python@v5
+ #       with:
+ #         python-version: 3.10.10
+ #     - name: Set up the environment
+ #       run: |
+ #         ./environment_setup.sh
+ #     - name: Run tests with Slurm
+ #       run: |
+ #         sana-run --pty -m ci -J tests-python pytest tests/python
.gitignore ADDED
@@ -0,0 +1,178 @@
+ # Sana related files
+ .idea/
+ *.png
+ *.json
+ tmp*
+ output*
+ output/
+ outputs/
+ wandb/
+ .vscode/
+ private/
+ ldm_ae*
+ data/*
+ *.pth
+ .gradio/
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,62 @@
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v5.0.0
+     hooks:
+       - id: trailing-whitespace
+         name: (Common) Remove trailing whitespaces
+       - id: mixed-line-ending
+         name: (Common) Fix mixed line ending
+         args: [--fix=lf]
+       - id: end-of-file-fixer
+         name: (Common) Remove extra EOF newlines
+       - id: check-merge-conflict
+         name: (Common) Check for merge conflicts
+       - id: requirements-txt-fixer
+         name: (Common) Sort "requirements.txt"
+       - id: fix-encoding-pragma
+         name: (Python) Remove encoding pragmas
+         args: [--remove]
+       # - id: debug-statements
+       #   name: (Python) Check for debugger imports
+       - id: check-json
+         name: (JSON) Check syntax
+       - id: check-yaml
+         name: (YAML) Check syntax
+       - id: check-toml
+         name: (TOML) Check syntax
+   # - repo: https://github.com/shellcheck-py/shellcheck-py
+   #   rev: v0.10.0.1
+   #   hooks:
+   #     - id: shellcheck
+   - repo: https://github.com/google/yamlfmt
+     rev: v0.13.0
+     hooks:
+       - id: yamlfmt
+   - repo: https://github.com/executablebooks/mdformat
+     rev: 0.7.16
+     hooks:
+       - id: mdformat
+         name: (Markdown) Format docs with mdformat
+   - repo: https://github.com/asottile/pyupgrade
+     rev: v3.2.2
+     hooks:
+       - id: pyupgrade
+         name: (Python) Update syntax for newer versions
+         args: [--py37-plus]
+   - repo: https://github.com/psf/black
+     rev: 22.10.0
+     hooks:
+       - id: black
+         name: (Python) Format code with black
+   - repo: https://github.com/pycqa/isort
+     rev: 5.12.0
+     hooks:
+       - id: isort
+         name: (Python) Sort imports with isort
+   - repo: https://github.com/pre-commit/mirrors-clang-format
+     rev: v15.0.4
+     hooks:
+       - id: clang-format
+         name: (C/C++/CUDA) Format code with clang-format
+         args: [-style=google, -i]
+         types_or: [c, c++, cuda]
CIs/add_license_all.sh ADDED
@@ -0,0 +1,2 @@
+ #!/bin/bash
+ addlicense -s -c 'NVIDIA CORPORATION & AFFILIATES' -ignore "**/*__init__.py" **/*.py
Dockerfile ADDED
@@ -0,0 +1,20 @@
+ FROM nvcr.io/nvidia/pytorch:24.06-py3
+
+ WORKDIR /app
+
+ RUN curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o ~/miniconda.sh \
+     && sh ~/miniconda.sh -b -p /opt/conda \
+     && rm ~/miniconda.sh
+
+ ENV PATH /opt/conda/bin:$PATH
+ COPY pyproject.toml pyproject.toml
+ COPY diffusion diffusion
+ COPY configs configs
+ COPY sana sana
+ COPY app app
+
+ COPY environment_setup.sh environment_setup.sh
+ RUN ./environment_setup.sh sana
+
+ # COPY server.py server.py
+ CMD ["conda", "run", "-n", "sana", "--no-capture-output", "python", "-u", "-W", "ignore", "app/app_sana.py", "--config=configs/sana_config/1024ms/Sana_1600M_img1024.yaml", "--model_path=hf://Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth"]
LICENSE ADDED
@@ -0,0 +1,117 @@
1
+ Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
2
+
3
+
4
+ Nvidia Source Code License-NC
5
+
6
+ =======================================================================
7
+
8
+ 1. Definitions
9
+
10
+ “Licensor” means any person or entity that distributes its Work.
11
+
12
+ “Work” means (a) the original work of authorship made available under
13
+ this license, which may include software, documentation, or other
14
+ files, and (b) any additions to or derivative works thereof
15
+ that are made available under this license.
16
+
17
+ “NVIDIA Processors” means any central processing unit (CPU),
18
+ graphics processing unit (GPU), field-programmable gate array (FPGA),
19
+ application-specific integrated circuit (ASIC) or any combination
20
+ thereof designed, made, sold, or provided by NVIDIA or its affiliates.
21
+
22
+ The terms “reproduce,” “reproduction,” “derivative works,” and
23
+ “distribution” have the meaning as provided under U.S. copyright law;
24
+ provided, however, that for the purposes of this license, derivative
25
+ works shall not include works that remain separable from, or merely
26
+ link (or bind by name) to the interfaces of, the Work.
27
+
28
+ Works are “made available” under this license by including in or with
29
+ the Work either (a) a copyright notice referencing the applicability
30
+ of this license to the Work, or (b) a copy of this license.
31
+
32
+ "Safe Model" means ShieldGemma-2B, which is a series of safety
33
+ content moderation models designed to moderate four categories of
34
+ harmful content: sexually explicit material, dangerous content,
35
+ hate speech, and harassment, and which you separately obtain
36
+ from Google at https://huggingface.co/google/shieldgemma-2b.
37
+
38
+
39
+ 2. License Grant
40
+
41
+ 2.1 Copyright Grant. Subject to the terms and conditions of this
42
+ license, each Licensor grants to you a perpetual, worldwide,
43
+ non-exclusive, royalty-free, copyright license to use, reproduce,
44
+ prepare derivative works of, publicly display, publicly perform,
45
+ sublicense and distribute its Work and any resulting derivative
46
+ works in any form.
47
+
48
+ 3. Limitations
49
+
50
+ 3.1 Redistribution. You may reproduce or distribute the Work only if
51
+ (a) you do so under this license, (b) you include a complete copy of
52
+ this license with your distribution, and (c) you retain without
53
+ modification any copyright, patent, trademark, or attribution notices
54
+ that are present in the Work.
55
+
56
+ 3.2 Derivative Works. You may specify that additional or different
57
+ terms apply to the use, reproduction, and distribution of your
58
+ derivative works of the Work (“Your Terms”) only if (a) Your Terms
59
+ provide that the use limitation in Section 3.3 applies to your
60
+ derivative works, and (b) you identify the specific derivative works
61
+ that are subject to Your Terms. Notwithstanding Your Terms, this
62
+ license (including the redistribution requirements in Section 3.1)
63
+ will continue to apply to the Work itself.
64
+
65
+ 3.3 Use Limitation. The Work and any derivative works thereof only may
66
+ be used or intended for use non-commercially and with NVIDIA Processors,
67
+ in accordance with Section 3.4, below. Notwithstanding the foregoing,
68
+ NVIDIA Corporation and its affiliates may use the Work and any
69
+ derivative works commercially. As used herein, “non-commercially”
70
+ means for research or evaluation purposes only.
71
+
72
+ 3.4 You shall filter your input content to the Work and any derivative
73
+ works thereof through the Safe Model to ensure that no content described
74
+ as Not Safe For Work (NSFW) is processed or generated. You shall not use
75
+ the Work to process or generate NSFW content. You are solely responsible
76
+ for any damages and liabilities arising from your failure to adequately
77
+ filter content in accordance with this section. As used herein,
78
+ “Not Safe For Work” or “NSFW” means content, videos or website pages
79
+ that contain potentially disturbing subject matter, including but not
80
+ limited to content that is sexually explicit, dangerous, hate,
81
+ or harassment.
82
+
83
+ 3.5 Patent Claims. If you bring or threaten to bring a patent claim
84
+ against any Licensor (including any claim, cross-claim or counterclaim
85
+ in a lawsuit) to enforce any patents that you allege are infringed by
86
+ any Work, then your rights under this license from such Licensor
87
+ (including the grant in Section 2.1) will terminate immediately.
88
+
89
+ 3.6 Trademarks. This license does not grant any rights to use any
90
+ Licensor’s or its affiliates’ names, logos, or trademarks, except as
91
+ necessary to reproduce the notices described in this license.
92
+
93
+ 3.7 Termination. If you violate any term of this license, then your
94
+ rights under this license (including the grant in Section 2.1) will
95
+ terminate immediately.
96
+
97
+ 4. Disclaimer of Warranty.
98
+
99
+ THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY
100
+ KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
101
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
102
+ NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES
103
+ UNDER THIS LICENSE.
104
+
105
+ 5. Limitation of Liability.
106
+
107
+ EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
108
+ THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
109
+ SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
110
+ INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
111
+ OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
112
+ (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
113
+ LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
114
+ DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE
115
+ POSSIBILITY OF SUCH DAMAGES.
116
+
117
+ =======================================================================
README.md CHANGED
@@ -1,12 +1,231 @@
- ---
- title: Nvlabs Sana
- emoji: 😻
- colorFrom: purple
- colorTo: yellow
- sdk: gradio
- sdk_version: 5.6.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ <p align="center" style="border-radius: 10px">
+ <img src="asset/logo.png" width="35%" alt="logo"/>
+ </p>
+
+ # ⚡️Sana: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformer
+
+ <div align="center">
+ <a href="https://nvlabs.github.io/Sana/"><img src="https://img.shields.io/static/v1?label=Project&message=Github&color=blue&logo=github-pages"></a> &ensp;
+ <a href="https://hanlab.mit.edu/projects/sana/"><img src="https://img.shields.io/static/v1?label=Page&message=MIT&color=darkred&logo=github-pages"></a> &ensp;
+ <a href="https://arxiv.org/abs/2410.10629"><img src="https://img.shields.io/static/v1?label=Arxiv&message=Sana&color=red&logo=arxiv"></a> &ensp;
+ <a href="https://nv-sana.mit.edu/"><img src="https://img.shields.io/static/v1?label=Demo&message=MIT&color=yellow"></a> &ensp;
+ <a href="https://discord.gg/rde6eaE5Ta"><img src="https://img.shields.io/static/v1?label=Discuss&message=Discord&color=purple&logo=discord"></a> &ensp;
+ </div>
+
+ <p align="center" border-radius="10px">
+ <img src="asset/Sana.jpg" width="90%" alt="teaser_page1"/>
+ </p>
+
+ ## 💡 Introduction
+
+ We introduce Sana, a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution.
+ Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, and it is deployable on a laptop GPU.
+ Core designs include:
+
+ (1) [**DC-AE**](https://hanlab.mit.edu/projects/dc-ae): unlike traditional AEs, which compress images only 8×, we trained an AE that can compress images 32×, effectively reducing the number of latent tokens. \
+ (2) **Linear DiT**: we replace all vanilla attention in DiT with linear attention, which is more efficient at high resolutions without sacrificing quality. \
+ (3) **Decoder-only text encoder**: we replaced T5 with a modern decoder-only small LLM as the text encoder and designed complex human instructions with in-context learning to enhance image-text alignment. \
+ (4) **Efficient training and sampling**: we propose **Flow-DPM-Solver** to reduce sampling steps, with efficient caption labeling and selection to accelerate convergence.
+
+ As a result, Sana-0.6B is very competitive with modern giant diffusion models (e.g., Flux-12B), being 20 times smaller and 100+ times faster in measured throughput. Moreover, Sana-0.6B can be deployed on a 16GB laptop GPU, taking less than 1 second to generate a 1024 × 1024 image. Sana enables content creation at low cost.
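To make the DC-AE point concrete, here is a rough back-of-the-envelope token count for a 1024 × 1024 image. The patch sizes used below (patch 2 for a conventional 8× AE pipeline, patch 1 for the 32× DC-AE) are illustrative assumptions, not the exact Sana configuration:

```python
# Rough latent-token arithmetic (illustrative assumptions: 8x AE + patch-2 DiT
# baseline vs. the 32x DC-AE with patch size 1).
def latent_tokens(image_size: int, ae_compression: int, patch_size: int) -> int:
    latent_size = image_size // ae_compression   # spatial size of the latent grid
    return (latent_size // patch_size) ** 2      # number of DiT tokens

print(latent_tokens(1024, 8, 2))   # 4096 tokens for a conventional 8x AE pipeline
print(latent_tokens(1024, 32, 1))  # 1024 tokens with the 32x DC-AE
```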
+
+ <p align="center" border-radius="10px">
+ <img src="asset/model-incremental.jpg" width="90%" alt="teaser_page2"/>
+ </p>
+
+ ## 🔥🔥 News
+
+ - (🔥 New) \[2024/11\] 1.6B [Sana models](https://huggingface.co/collections/Efficient-Large-Model/sana-673efba2a57ed99843f11f9e) are released.
+ - (🔥 New) \[2024/11\] Training, inference, and metrics code is released.
+ - (🔥 New) \[2024/11\] [`diffusers`](https://github.com/huggingface/diffusers/pull/9982) integration is in progress.
+ - \[2024/10\] [Demo](https://nv-sana.mit.edu/) is released.
+ - \[2024/10\] [DC-AE Code](https://github.com/mit-han-lab/efficientvit/blob/master/applications/dc_ae/README.md) and [weights](https://huggingface.co/collections/mit-han-lab/dc-ae-670085b9400ad7197bb1009b) are released!
+ - \[2024/10\] [Paper](https://arxiv.org/abs/2410.10629) is on arXiv!
+
+ ## Performance
+
+ | Methods (1024x1024) | Throughput (samples/s) | Latency (s) | Params (B) | Speedup | FID 👆 | CLIP 👆 | GenEval 👆 | DPG 👆 |
+ |---|---|---|---|---|---|---|---|---|
+ | FLUX-dev | 0.04 | 23.0 | 12.0 | 1.0× | 10.15 | 27.47 | _0.67_ | _84.0_ |
+ | **Sana-0.6B** | 1.7 | 0.9 | 0.6 | **39.5×** | <u>5.81</u> | 28.36 | 0.64 | 83.6 |
+ | **Sana-1.6B** | 1.0 | 1.2 | 1.6 | **23.3×** | **5.76** | <u>28.67</u> | <u>0.66</u> | **84.8** |
+
+ <details>
+ <summary><h3>Click to show all</h3></summary>
+
+ | Methods | Throughput (samples/s) | Latency (s) | Params (B) | Speedup | FID 👆 | CLIP 👆 | GenEval 👆 | DPG 👆 |
+ |---|---|---|---|---|---|---|---|---|
+ | _**512 × 512 resolution**_ | | | | | | | | |
+ | PixArt-α | 1.5 | 1.2 | 0.6 | 1.0× | 6.14 | 27.55 | 0.48 | 71.6 |
+ | PixArt-Σ | 1.5 | 1.2 | 0.6 | 1.0× | _6.34_ | _27.62_ | <u>0.52</u> | _79.5_ |
+ | **Sana-0.6B** | 6.7 | 0.8 | 0.6 | 5.0× | <u>5.67</u> | <u>27.92</u> | _0.64_ | <u>84.3</u> |
+ | **Sana-1.6B** | 3.8 | 0.6 | 1.6 | 2.5× | **5.16** | **28.19** | **0.66** | **85.5** |
+ | _**1024 × 1024 resolution**_ | | | | | | | | |
+ | LUMINA-Next | 0.12 | 9.1 | 2.0 | 2.8× | 7.58 | 26.84 | 0.46 | 74.6 |
+ | SDXL | 0.15 | 6.5 | 2.6 | 3.5× | 6.63 | _29.03_ | 0.55 | 74.7 |
+ | PlayGroundv2.5 | 0.21 | 5.3 | 2.6 | 4.9× | _6.09_ | **29.13** | 0.56 | 75.5 |
+ | Hunyuan-DiT | 0.05 | 18.2 | 1.5 | 1.2× | 6.54 | 28.19 | 0.63 | 78.9 |
+ | PixArt-Σ | 0.4 | 2.7 | 0.6 | 9.3× | 6.15 | 28.26 | 0.54 | 80.5 |
+ | DALLE3 | - | - | - | - | - | - | _0.67_ | 83.5 |
+ | SD3-medium | 0.28 | 4.4 | 2.0 | 6.5× | 11.92 | 27.83 | 0.62 | <u>84.1</u> |
+ | FLUX-dev | 0.04 | 23.0 | 12.0 | 1.0× | 10.15 | 27.47 | _0.67_ | _84.0_ |
+ | FLUX-schnell | 0.5 | 2.1 | 12.0 | 11.6× | 7.94 | 28.14 | **0.71** | **84.8** |
+ | **Sana-0.6B** | 1.7 | 0.9 | 0.6 | **39.5×** | <u>5.81</u> | 28.36 | 0.64 | 83.6 |
+ | **Sana-1.6B** | 1.0 | 1.2 | 1.6 | **23.3×** | **5.76** | <u>28.67</u> | <u>0.66</u> | **84.8** |
+
+ </details>
+
+ ## Contents
+
+ - [Env](#-1-dependencies-and-installation)
+ - [Demo](#-2-how-to-play-with-sana-inference)
+ - [Training](#-3-how-to-train-sana)
+ - [Testing](#-4-metric-toolkit)
+ - [TODO](#to-do-list)
+ - [Citation](#bibtex)
+
+ # 🔧 1. Dependencies and Installation
+
+ - Python >= 3.10.0 (we recommend [Anaconda](https://www.anaconda.com/download/#linux) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html))
+ - [PyTorch >= 2.0.1+cu12.1](https://pytorch.org/)
+
+ ```bash
+ git clone https://github.com/NVlabs/Sana.git
+ cd Sana
+
+ ./environment_setup.sh sana
+ # or you can install each component step by step following environment_setup.sh
+ ```
+
+ # 💻 2. How to Play with Sana (Inference)
+
+ ## 💰Hardware requirement
+
+ - 9GB VRAM is required for the 0.6B model and 12GB VRAM for the 1.6B model. Our upcoming quantized version will require less than 8GB for inference.
+ - All tests are done on A100 GPUs; numbers may differ on other GPU models.
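As a quick sanity check before loading a model, you can compare the free VRAM reported by the current GPU against the requirements above (a small sketch using PyTorch's standard CUDA API):

```python
# Check free GPU memory against the requirements listed above (9GB / 12GB).
import torch

if torch.cuda.is_available():
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    free_gib = free_bytes / 1024**3
    print(f"Free VRAM: {free_gib:.1f} GiB (of {total_bytes / 1024**3:.1f} GiB total)")
    if free_gib < 9:
        print("Warning: below the 9GB needed for the 0.6B model.")
else:
    print("No CUDA device available.")
```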
+
+ ## 🔛 Quick start with [Gradio](https://www.gradio.app/guides/quickstart)
+
+ ```bash
+ # official online demo
+ DEMO_PORT=15432 \
+ python app/app_sana.py \
+     --config=configs/sana_config/1024ms/Sana_1600M_img1024.yaml \
+     --model_path=hf://Efficient-Large-Model/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth
+ ```
+
+ ```python
+ import torch
+ from app.sana_pipeline import SanaPipeline
+ from torchvision.utils import save_image
+
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ generator = torch.Generator(device=device).manual_seed(42)
+
+ sana = SanaPipeline("configs/sana_config/1024ms/Sana_1600M_img1024.yaml")
+ sana.from_pretrained("hf://Efficient-Large-Model/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth")
+ prompt = 'a cyberpunk cat with a neon sign that says "Sana"'
+
+ image = sana(
+     prompt=prompt,
+     height=1024,
+     width=1024,
+     guidance_scale=5.0,
+     pag_guidance_scale=2.0,
+     num_inference_steps=18,
+     generator=generator,
+ )
+ save_image(image, 'output/sana.png', nrow=1, normalize=True, value_range=(-1, 1))
+ ```
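The loaded pipeline can be reused for several prompts without reloading the checkpoint. A minimal sketch that simply repeats the call signature shown above (the prompt list is illustrative):

```python
# Generate a few prompts with the already-loaded `sana` pipeline and `generator`
# from the example above.
prompts = [
    'a cyberpunk cat with a neon sign that says "Sana"',
    "portrait photo of a girl, photograph, highly detailed face, depth of field",
]
for i, p in enumerate(prompts):
    image = sana(
        prompt=p,
        height=1024,
        width=1024,
        guidance_scale=5.0,
        pag_guidance_scale=2.0,
        num_inference_steps=18,
        generator=generator,
    )
    save_image(image, f"output/sana_{i}.png", nrow=1, normalize=True, value_range=(-1, 1))
```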
+
+ ## 🔛 Run inference with TXT or JSON files
+
+ ```bash
+ # Run samples in a txt file
+ python scripts/inference.py \
+     --config=configs/sana_config/1024ms/Sana_1600M_img1024.yaml \
+     --model_path=hf://Efficient-Large-Model/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth \
+     --txt_file=asset/samples_mini.txt
+
+ # Run samples in a json file
+ python scripts/inference.py \
+     --config=configs/sana_config/1024ms/Sana_1600M_img1024.yaml \
+     --model_path=hf://Efficient-Large-Model/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth \
+     --json_file=asset/samples_mini.json
+ ```
+
+ where each line of [`asset/samples_mini.txt`](asset/samples_mini.txt) contains a prompt to generate.
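For reference, such a prompt file is plain text with one prompt per line; an illustrative way to create your own (the prompts are taken from the demo examples):

```python
# Write a small prompt list in the samples_mini.txt format (one prompt per line).
prompts = [
    'a cyberpunk cat with a neon sign that says "Sana"',
    "portrait photo of a girl, photograph, highly detailed face, depth of field",
]
with open("my_prompts.txt", "w") as f:
    f.write("\n".join(prompts) + "\n")
```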
+
+ # 🔥 3. How to Train Sana
+
+ ## 💰Hardware requirement
+
+ - 32GB VRAM is required to train both the 0.6B and the 1.6B models.
+
+ We provide a training example here; you can also select your desired config file from the [config files dir](configs/sana_config) based on your data structure.
+
+ To launch Sana training, you will first need to prepare data in the following format:
+
+ ```bash
+ asset/example_data
+ ├── AAA.txt
+ ├── AAA.png
+ ├── BCC.txt
+ ├── BCC.png
+ ├── ......
+ ├── CCC.txt
+ └── CCC.png
+ ```
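A small script can sanity-check this layout before training; this is only a sketch and assumes each `.txt` file holds the caption for the `.png` that shares its stem, as in `asset/example_data`:

```python
# Verify that every .png in the data dir has a matching caption .txt (and vice versa).
from pathlib import Path

data_dir = Path("asset/example_data")
images = {p.stem for p in data_dir.glob("*.png")}
captions = {p.stem for p in data_dir.glob("*.txt")}
print("images without captions:", sorted(images - captions))
print("captions without images:", sorted(captions - images))
```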
+
+ Then Sana's training can be launched via
+
+ ```bash
+ # Example of training Sana 0.6B with 512x512 resolution
+ bash train_scripts/train.sh \
+     configs/sana_config/512ms/Sana_600M_img512.yaml \
+     --data.data_dir="[asset/example_data]" \
+     --data.type=SanaImgDataset \
+     --model.multi_scale=false \
+     --train.train_batch_size=32
+
+ # Example of training Sana 1.6B with 1024x1024 resolution
+ bash train_scripts/train.sh \
+     configs/sana_config/1024ms/Sana_1600M_img1024.yaml \
+     --data.data_dir="[asset/example_data]" \
+     --data.type=SanaImgDataset \
+     --model.multi_scale=false \
+     --train.train_batch_size=8
+ ```
+
+ # 💻 4. Metric toolkit
+
+ Refer to the [Toolkit Manual](asset/docs/metrics_toolkit.md).
+
+ # 💪To-Do List
+
+ We will try our best to release:
+
+ - \[x\] Training code
+ - \[x\] Inference code
+ - \[+\] Model zoo
+ - \[ \] [Diffusers integration](https://github.com/huggingface/diffusers/pull/9982) (in progress)
+ - \[ \] ComfyUI
+ - \[ \] Laptop development
+
+ # 🤗Acknowledgements
+
+ - Thanks to [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha), [PixArt-Σ](https://github.com/PixArt-alpha/PixArt-sigma) and [Efficient-ViT](https://github.com/mit-han-lab/efficientvit) for their wonderful work and codebase!
+
+ # 📖BibTeX
+
+ ```
+ @misc{xie2024sana,
+       title={Sana: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformer},
+       author={Enze Xie and Junsong Chen and Junyu Chen and Han Cai and Haotian Tang and Yujun Lin and Zhekai Zhang and Muyang Li and Ligeng Zhu and Yao Lu and Song Han},
+       year={2024},
+       eprint={2410.10629},
+       archivePrefix={arXiv},
+       primaryClass={cs.CV},
+       url={https://arxiv.org/abs/2410.10629},
+ }
+ ```
app/app_sana.py ADDED
@@ -0,0 +1,488 @@
1
+ #!/usr/bin/env python
2
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ # SPDX-License-Identifier: Apache-2.0
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import os
21
+ import random
22
+ import time
23
+ import uuid
24
+ from datetime import datetime
25
+
26
+ import gradio as gr
27
+ import numpy as np
28
+ import spaces
29
+ import torch
30
+ from PIL import Image
31
+ from torchvision.utils import make_grid, save_image
32
+ from transformers import AutoModelForCausalLM, AutoTokenizer
33
+
34
+ from app import safety_check
35
+ from app.sana_pipeline import SanaPipeline
36
+
37
+ MAX_SEED = np.iinfo(np.int32).max
38
+ CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1"
39
+ MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
40
+ USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
41
+ ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
42
+ DEMO_PORT = int(os.getenv("DEMO_PORT", "15432"))
43
+ os.environ["GRADIO_EXAMPLES_CACHE"] = "./.gradio/cache"
44
+
45
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
46
+
47
+ style_list = [
48
+ {
49
+ "name": "(No style)",
50
+ "prompt": "{prompt}",
51
+ "negative_prompt": "",
52
+ },
53
+ {
54
+ "name": "Cinematic",
55
+ "prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, "
56
+ "cinemascope, moody, epic, gorgeous, film grain, grainy",
57
+ "negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
58
+ },
59
+ {
60
+ "name": "Photographic",
61
+ "prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
62
+ "negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
63
+ },
64
+ {
65
+ "name": "Anime",
66
+ "prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed",
67
+ "negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
68
+ },
69
+ {
70
+ "name": "Manga",
71
+ "prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
72
+ "negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
73
+ },
74
+ {
75
+ "name": "Digital Art",
76
+ "prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
77
+ "negative_prompt": "photo, photorealistic, realism, ugly",
78
+ },
79
+ {
80
+ "name": "Pixel art",
81
+ "prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
82
+ "negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
83
+ },
84
+ {
85
+ "name": "Fantasy art",
86
+ "prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, "
87
+ "majestic, magical, fantasy art, cover art, dreamy",
88
+ "negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, "
89
+ "glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, "
90
+ "disfigured, sloppy, duplicate, mutated, black and white",
91
+ },
92
+ {
93
+ "name": "Neonpunk",
94
+ "prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, "
95
+ "detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, "
96
+ "ultra detailed, intricate, professional",
97
+ "negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
98
+ },
99
+ {
100
+ "name": "3D Model",
101
+ "prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
102
+ "negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
103
+ },
104
+ ]
105
+
106
+ styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
107
+ STYLE_NAMES = list(styles.keys())
108
+ DEFAULT_STYLE_NAME = "(No style)"
109
+ SCHEDULE_NAME = ["Flow_DPM_Solver"]
110
+ DEFAULT_SCHEDULE_NAME = "Flow_DPM_Solver"
111
+ NUM_IMAGES_PER_PROMPT = 1
112
+ TEST_TIMES = 0
113
+ INFER_SPEED = 0
114
+ FILENAME = f"output/port{DEMO_PORT}_inference_count.txt"
115
+
116
+
117
+ def read_inference_count():
118
+ global TEST_TIMES
119
+ try:
120
+ with open(FILENAME) as f:
121
+ count = int(f.read().strip())
122
+ except FileNotFoundError:
123
+ count = 0
124
+ TEST_TIMES = count
125
+
126
+ return count
127
+
128
+
129
+ def write_inference_count(count):
130
+ with open(FILENAME, "w") as f:
131
+ f.write(str(count))
132
+
133
+
134
+ def run_inference(num_imgs=1):
135
+ TEST_TIMES = read_inference_count()
136
+ TEST_TIMES += int(num_imgs)
137
+ write_inference_count(TEST_TIMES)
138
+
139
+ return (
140
+ f"<span style='font-size: 16px; font-weight: bold;'>Total inference runs: </span><span style='font-size: "
141
+ f"16px; color:red; font-weight: bold;'>{TEST_TIMES}</span>"
142
+ )
143
+
144
+
145
+ def update_inference_count():
146
+ count = read_inference_count()
147
+ return (
148
+ f"<span style='font-size: 16px; font-weight: bold;'>Total inference runs: </span><span style='font-size: "
149
+ f"16px; color:red; font-weight: bold;'>{count}</span>"
150
+ )
151
+
152
+
153
+ def apply_style(style_name: str, positive: str, negative: str = "") -> tuple[str, str]:
154
+ p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
155
+ if not negative:
156
+ negative = ""
157
+ return p.replace("{prompt}", positive), n + negative
158
+
159
+
160
+ def get_args():
161
+ parser = argparse.ArgumentParser()
162
+ parser.add_argument("--config", type=str, help="config")
163
+ parser.add_argument(
164
+ "--model_path",
165
+ nargs="?",
166
+ default="hf://Efficient-Large-Model/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth",
167
+ type=str,
168
+ help="Path to the model file (positional)",
169
+ )
170
+ parser.add_argument("--output", default="./", type=str)
171
+ parser.add_argument("--bs", default=1, type=int)
172
+ parser.add_argument("--image_size", default=1024, type=int)
173
+ parser.add_argument("--cfg_scale", default=5.0, type=float)
174
+ parser.add_argument("--pag_scale", default=2.0, type=float)
175
+ parser.add_argument("--seed", default=42, type=int)
176
+ parser.add_argument("--step", default=-1, type=int)
177
+ parser.add_argument("--custom_image_size", default=None, type=int)
178
+ parser.add_argument(
179
+ "--shield_model_path",
180
+ type=str,
181
+ help="The path to shield model, we employ ShieldGemma-2B by default.",
182
+ default="google/shieldgemma-2b",
183
+ )
184
+
185
+ return parser.parse_known_args()[0]
186
+
187
+
188
+ args = get_args()
189
+
190
+ if torch.cuda.is_available():
191
+ weight_dtype = torch.float16
192
+ model_path = args.model_path
193
+ pipe = SanaPipeline(args.config)
194
+ pipe.from_pretrained(model_path)
195
+ pipe.register_progress_bar(gr.Progress())
196
+
197
+ # safety checker
198
+ safety_checker_tokenizer = AutoTokenizer.from_pretrained(args.shield_model_path)
199
+ safety_checker_model = AutoModelForCausalLM.from_pretrained(
200
+ args.shield_model_path,
201
+ device_map="auto",
202
+ torch_dtype=torch.bfloat16,
203
+ ).to(device)
204
+
205
+
206
+ def save_image_sana(img, seed="", save_img=False):
207
+ unique_name = f"{str(uuid.uuid4())}_{seed}.png"
208
+ save_path = os.path.join(f"output/online_demo_img/{datetime.now().date()}")
209
+ os.umask(0o000) # file permission: 666; dir permission: 777
210
+ os.makedirs(save_path, exist_ok=True)
211
+ unique_name = os.path.join(save_path, unique_name)
212
+ if save_img:
213
+ save_image(img, unique_name, nrow=1, normalize=True, value_range=(-1, 1))
214
+
215
+ return unique_name
216
+
217
+
218
+ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
219
+ if randomize_seed:
220
+ seed = random.randint(0, MAX_SEED)
221
+ return seed
222
+
223
+
224
+ @torch.no_grad()
225
+ @torch.inference_mode()
226
+ @spaces.GPU(enable_queue=True)
227
+ def generate(
228
+ prompt: str = None,
229
+ negative_prompt: str = "",
230
+ style: str = DEFAULT_STYLE_NAME,
231
+ use_negative_prompt: bool = False,
232
+ num_imgs: int = 1,
233
+ seed: int = 0,
234
+ height: int = 1024,
235
+ width: int = 1024,
236
+ flow_dpms_guidance_scale: float = 5.0,
237
+ flow_dpms_pag_guidance_scale: float = 2.0,
238
+ flow_dpms_inference_steps: int = 20,
239
+ randomize_seed: bool = False,
240
+ ):
241
+ global TEST_TIMES
242
+ global INFER_SPEED
243
+ # seed = 823753551
244
+ seed = int(randomize_seed_fn(seed, randomize_seed))
245
+ generator = torch.Generator(device=device).manual_seed(seed)
246
+ print(f"PORT: {DEMO_PORT}, model_path: {model_path}, time_times: {TEST_TIMES}")
247
+ if safety_check.is_dangerous(safety_checker_tokenizer, safety_checker_model, prompt, threshold=0.2):
248
+ prompt = "A red heart."
249
+
250
+ print(prompt)
251
+
252
+ num_inference_steps = flow_dpms_inference_steps
253
+ guidance_scale = flow_dpms_guidance_scale
254
+ pag_guidance_scale = flow_dpms_pag_guidance_scale
255
+
256
+ if not use_negative_prompt:
257
+ negative_prompt = None # type: ignore
258
+ prompt, negative_prompt = apply_style(style, prompt, negative_prompt)
259
+
260
+ pipe.progress_fn(0, desc="Sana Start")
261
+
262
+ time_start = time.time()
263
+ images = pipe(
264
+ prompt=prompt,
265
+ height=height,
266
+ width=width,
267
+ negative_prompt=negative_prompt,
268
+ guidance_scale=guidance_scale,
269
+ pag_guidance_scale=pag_guidance_scale,
270
+ num_inference_steps=num_inference_steps,
271
+ num_images_per_prompt=num_imgs,
272
+ generator=generator,
273
+ )
274
+
275
+ pipe.progress_fn(1.0, desc="Sana End")
276
+ INFER_SPEED = (time.time() - time_start) / num_imgs
277
+
278
+ save_img = False
279
+ if save_img:
280
+ img = [save_image_sana(img, seed, save_img=save_img) for img in images]
281
+ print(img)
282
+ else:
283
+ if num_imgs > 1:
284
+ nrow = 2
285
+ else:
286
+ nrow = 1
287
+ img = make_grid(images, nrow=nrow, normalize=True, value_range=(-1, 1))
288
+ img = img.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()
289
+ img = [Image.fromarray(img.astype(np.uint8))]
290
+
291
+ torch.cuda.empty_cache()
292
+
293
+ return (
294
+ img,
295
+ seed,
296
+ f"<span style='font-size: 16px; font-weight: bold;'>Inference Speed: {INFER_SPEED:.3f} s/Img</span>",
297
+ )
298
+
299
+
300
+ TEST_TIMES = read_inference_count()
301
+ model_size = "1.6" if "D20" in args.model_path else "0.6"
302
+ title = f"""
303
+ <div style='display: flex; align-items: center; justify-content: center; text-align: center;'>
304
+ <img src="https://raw.githubusercontent.com/NVlabs/Sana/refs/heads/main/asset/logo.png" width="50%" alt="logo"/>
305
+ </div>
306
+ """
307
+ DESCRIPTION = f"""
308
+ <p><span style="font-size: 36px; font-weight: bold;">Sana-{model_size}B</span><span style="font-size: 20px; font-weight: bold;">{args.image_size}px</span></p>
309
+ <p style="font-size: 16px; font-weight: bold;">Sana: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformer</p>
310
+ <p><span style="font-size: 16px;"><a href="https://arxiv.org/abs/2410.10629">[Paper]</a></span> <span style="font-size: 16px;"><a href="https://github.com/NVlabs/Sana">[Github(coming soon)]</a></span> <span style="font-size: 16px;"><a href="https://nvlabs.github.io/Sana">[Project]</a></span></p>
311
+ <p style="font-size: 16px; font-weight: bold;">Powered by <a href="https://hanlab.mit.edu/projects/dc-ae">DC-AE</a> with 32x latent space, running on an A6000 node.</p>
312
+ <p style="font-size: 16px; font-weight: bold;">Unsafe word will give you a 'Red Heart' in the image instead.</p>
313
+ """
314
+ if model_size == "0.6":
315
+ DESCRIPTION += "\n<p>0.6B model's text rendering ability is limited.</p>"
316
+ if not torch.cuda.is_available():
317
+ DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
318
+
319
+ examples = [
320
+ 'a cyberpunk cat with a neon sign that says "Sana"',
321
+ "A very detailed and realistic full body photo set of a tall, slim, and athletic Shiba Inu in a white oversized straight t-shirt, white shorts, and short white shoes.",
322
+ "Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.",
323
+ "portrait photo of a girl, photograph, highly detailed face, depth of field",
324
+ 'make me a logo that says "So Fast" with a really cool flying dragon shape with lightning sparks all over the sides and all of it contains Indonesian language',
325
+ "🐶 Wearing 🕶 flying on the 🌈",
326
+ "👧 with 🌹 in the ❄️",
327
+ "an old rusted robot wearing pants and a jacket riding skis in a supermarket.",
328
+ "professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.",
329
+ "Astronaut in a jungle, cold color palette, muted colors, detailed",
330
+ "a stunning and luxurious bedroom carved into a rocky mountainside seamlessly blending nature with modern design with a plush earth-toned bed textured stone walls circular fireplace massive uniquely shaped window framing snow-capped mountains dense forests",
331
+ ]
332
+
333
+ css = """
334
+ .gradio-container{max-width: 640px !important}
335
+ h1{text-align:center}
336
+ """
337
+ with gr.Blocks(css=css) as demo:
338
+ gr.Markdown(title)
339
+ gr.Markdown(DESCRIPTION)
340
+ gr.DuplicateButton(
341
+ value="Duplicate Space for private use",
342
+ elem_id="duplicate-button",
343
+ visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
344
+ )
345
+ info_box = gr.Markdown(
346
+ value=f"<span style='font-size: 16px; font-weight: bold;'>Total inference runs: </span><span style='font-size: 16px; color:red; font-weight: bold;'>{read_inference_count()}</span>"
347
+ )
348
+ demo.load(fn=update_inference_count, outputs=info_box) # update the value when re-loading the page
349
+ # with gr.Row(equal_height=False):
350
+ with gr.Group():
351
+ with gr.Row():
352
+ prompt = gr.Text(
353
+ label="Prompt",
354
+ show_label=False,
355
+ max_lines=1,
356
+ placeholder="Enter your prompt",
357
+ container=False,
358
+ )
359
+ run_button = gr.Button("Run", scale=0)
360
+ result = gr.Gallery(label="Result", show_label=False, columns=NUM_IMAGES_PER_PROMPT, format="png")
361
+ speed_box = gr.Markdown(
362
+ value=f"<span style='font-size: 16px; font-weight: bold;'>Inference speed: {INFER_SPEED} s/Img</span>"
363
+ )
364
+ with gr.Accordion("Advanced options", open=False):
365
+ with gr.Group():
366
+ with gr.Row(visible=True):
367
+ height = gr.Slider(
368
+ label="Height",
369
+ minimum=256,
370
+ maximum=MAX_IMAGE_SIZE,
371
+ step=32,
372
+ value=1024,
373
+ )
374
+ width = gr.Slider(
375
+ label="Width",
376
+ minimum=256,
377
+ maximum=MAX_IMAGE_SIZE,
378
+ step=32,
379
+ value=1024,
380
+ )
381
+ with gr.Row():
382
+ flow_dpms_inference_steps = gr.Slider(
383
+ label="Sampling steps",
384
+ minimum=5,
385
+ maximum=40,
386
+ step=1,
387
+ value=18,
388
+ )
389
+ flow_dpms_guidance_scale = gr.Slider(
390
+ label="CFG Guidance scale",
391
+ minimum=1,
392
+ maximum=10,
393
+ step=0.1,
394
+ value=5.0,
395
+ )
396
+ flow_dpms_pag_guidance_scale = gr.Slider(
397
+ label="PAG Guidance scale",
398
+ minimum=1,
399
+ maximum=4,
400
+ step=0.5,
401
+ value=2.0,
402
+ )
403
+ with gr.Row():
404
+ use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=True)
405
+ negative_prompt = gr.Text(
406
+ label="Negative prompt",
407
+ max_lines=1,
408
+ placeholder="Enter a negative prompt",
409
+ visible=True,
410
+ )
411
+ style_selection = gr.Radio(
412
+ show_label=True,
413
+ container=True,
414
+ interactive=True,
415
+ choices=STYLE_NAMES,
416
+ value=DEFAULT_STYLE_NAME,
417
+ label="Image Style",
418
+ )
419
+ seed = gr.Slider(
420
+ label="Seed",
421
+ minimum=0,
422
+ maximum=MAX_SEED,
423
+ step=1,
424
+ value=0,
425
+ )
426
+ randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
427
+ with gr.Row(visible=True):
428
+ schedule = gr.Radio(
429
+ show_label=True,
430
+ container=True,
431
+ interactive=True,
432
+ choices=SCHEDULE_NAME,
433
+ value=DEFAULT_SCHEDULE_NAME,
434
+ label="Sampler Schedule",
435
+ visible=True,
436
+ )
437
+ num_imgs = gr.Slider(
438
+ label="Num Images",
439
+ minimum=1,
440
+ maximum=6,
441
+ step=1,
442
+ value=1,
443
+ )
444
+
445
+ run_button.click(fn=run_inference, inputs=num_imgs, outputs=info_box)
446
+
447
+ gr.Examples(
448
+ examples=examples,
449
+ inputs=prompt,
450
+ outputs=[result, seed],
451
+ fn=generate,
452
+ cache_examples=CACHE_EXAMPLES,
453
+ )
454
+
455
+ use_negative_prompt.change(
456
+ fn=lambda x: gr.update(visible=x),
457
+ inputs=use_negative_prompt,
458
+ outputs=negative_prompt,
459
+ api_name=False,
460
+ )
461
+
462
+ gr.on(
463
+ triggers=[
464
+ prompt.submit,
465
+ negative_prompt.submit,
466
+ run_button.click,
467
+ ],
468
+ fn=generate,
469
+ inputs=[
470
+ prompt,
471
+ negative_prompt,
472
+ style_selection,
473
+ use_negative_prompt,
474
+ num_imgs,
475
+ seed,
476
+ height,
477
+ width,
478
+ flow_dpms_guidance_scale,
479
+ flow_dpms_pag_guidance_scale,
480
+ flow_dpms_inference_steps,
481
+ randomize_seed,
482
+ ],
483
+ outputs=[result, seed, speed_box],
484
+ api_name="run",
485
+ )
486
+
487
+ if __name__ == "__main__":
488
+ demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=DEMO_PORT, debug=True, share=True)
app/app_sana_multithread.py ADDED
@@ -0,0 +1,565 @@
1
+ #!/usr/bin/env python
2
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ # SPDX-License-Identifier: Apache-2.0
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import os
21
+ import random
22
+ import uuid
23
+ from datetime import datetime
24
+
25
+ import gradio as gr
26
+ import numpy as np
27
+ import spaces
28
+ import torch
29
+ from diffusers import FluxPipeline
30
+ from PIL import Image
31
+ from torchvision.utils import make_grid, save_image
32
+ from transformers import AutoModelForCausalLM, AutoTokenizer
33
+
34
+ from app import safety_check
35
+ from app.sana_pipeline import SanaPipeline
36
+
37
+ MAX_SEED = np.iinfo(np.int32).max
38
+ CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1"
39
+ MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
40
+ USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
41
+ ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
42
+ DEMO_PORT = int(os.getenv("DEMO_PORT", "15432"))
43
+ os.environ["GRADIO_EXAMPLES_CACHE"] = "./.gradio/cache"
44
+
45
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
46
+
47
+ style_list = [
48
+ {
49
+ "name": "(No style)",
50
+ "prompt": "{prompt}",
51
+ "negative_prompt": "",
52
+ },
53
+ {
54
+ "name": "Cinematic",
55
+ "prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, "
56
+ "cinemascope, moody, epic, gorgeous, film grain, grainy",
57
+ "negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
58
+ },
59
+ {
60
+ "name": "Photographic",
61
+ "prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
62
+ "negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
63
+ },
64
+ {
65
+ "name": "Anime",
66
+ "prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed",
67
+ "negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
68
+ },
69
+ {
70
+ "name": "Manga",
71
+ "prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
72
+ "negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
73
+ },
74
+ {
75
+ "name": "Digital Art",
76
+ "prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
77
+ "negative_prompt": "photo, photorealistic, realism, ugly",
78
+ },
79
+ {
80
+ "name": "Pixel art",
81
+ "prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
82
+ "negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
83
+ },
84
+ {
85
+ "name": "Fantasy art",
86
+ "prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, "
87
+ "majestic, magical, fantasy art, cover art, dreamy",
88
+ "negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, "
89
+ "glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, "
90
+ "disfigured, sloppy, duplicate, mutated, black and white",
91
+ },
92
+ {
93
+ "name": "Neonpunk",
94
+ "prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, "
95
+ "detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, "
96
+ "ultra detailed, intricate, professional",
97
+ "negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
98
+ },
99
+ {
100
+ "name": "3D Model",
101
+ "prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
102
+ "negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
103
+ },
104
+ ]
105
+
106
+ styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
107
+ STYLE_NAMES = list(styles.keys())
108
+ DEFAULT_STYLE_NAME = "(No style)"
109
+ SCHEDULE_NAME = ["Flow_DPM_Solver"]
110
+ DEFAULT_SCHEDULE_NAME = "Flow_DPM_Solver"
111
+ NUM_IMAGES_PER_PROMPT = 1
112
+ TEST_TIMES = 0
113
+ FILENAME = f"output/port{DEMO_PORT}_inference_count.txt"
114
+
115
+
116
+ def set_env(seed=0):
117
+ torch.manual_seed(seed)
118
+ torch.set_grad_enabled(False)
119
+
120
+
121
+ def read_inference_count():
122
+ global TEST_TIMES
123
+ try:
124
+ with open(FILENAME) as f:
125
+ count = int(f.read().strip())
126
+ except FileNotFoundError:
127
+ count = 0
128
+ TEST_TIMES = count
129
+
130
+ return count
131
+
132
+
133
+ def write_inference_count(count):
134
+ with open(FILENAME, "w") as f:
135
+ f.write(str(count))
136
+
137
+
138
+ def run_inference(num_imgs=1):
139
+ TEST_TIMES = read_inference_count()
140
+ TEST_TIMES += int(num_imgs)
141
+ write_inference_count(TEST_TIMES)
142
+
143
+ return (
144
+ f"<span style='font-size: 16px; font-weight: bold;'>Total inference runs: </span><span style='font-size: "
145
+ f"16px; color:red; font-weight: bold;'>{TEST_TIMES}</span>"
146
+ )
147
+
148
+
149
+ def update_inference_count():
150
+ count = read_inference_count()
151
+ return (
152
+ f"<span style='font-size: 16px; font-weight: bold;'>Total inference runs: </span><span style='font-size: "
153
+ f"16px; color:red; font-weight: bold;'>{count}</span>"
154
+ )
155
+
156
+
157
+ def apply_style(style_name: str, positive: str, negative: str = "") -> tuple[str, str]:
158
+ p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
159
+ if not negative:
160
+ negative = ""
161
+ return p.replace("{prompt}", positive), n + negative
162
+
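+ # Illustrative note: apply_style() substitutes the user prompt into the selected style template and
+ # concatenates the style's negative prompt with the user's negative prompt (no separator). For example,
+ # assuming the "Cinematic" style defined above:
+ #   apply_style("Cinematic", "a cat", "blurry")
+ #   -> ("cinematic still a cat . emotional, harmonious, ...", "anime, cartoon, ...blurry")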
163
+
164
+ def get_args():
165
+ parser = argparse.ArgumentParser()
166
+ parser.add_argument("--config", type=str, help="config")
167
+ parser.add_argument(
168
+ "--model_path",
169
+ nargs="?",
170
+ default="output/Sana_D20/SANA.pth",
171
+ type=str,
172
+ help="Path to the model checkpoint (.pth) file",
173
+ )
174
+ parser.add_argument("--output", default="./", type=str)
175
+ parser.add_argument("--bs", default=1, type=int)
176
+ parser.add_argument("--image_size", default=1024, type=int)
177
+ parser.add_argument("--cfg_scale", default=5.0, type=float)
178
+ parser.add_argument("--pag_scale", default=2.0, type=float)
179
+ parser.add_argument("--seed", default=42, type=int)
180
+ parser.add_argument("--step", default=-1, type=int)
181
+ parser.add_argument("--custom_image_size", default=None, type=int)
182
+ parser.add_argument(
183
+ "--shield_model_path",
184
+ type=str,
185
+ help="Path to the shield model; ShieldGemma-2B is used by default.",
186
+ default="google/shieldgemma-2b",
187
+ )
188
+
189
+ return parser.parse_args()
190
+
191
+
192
+ args = get_args()
193
+
194
+ if torch.cuda.is_available():
195
+ weight_dtype = torch.float16
196
+ model_path = args.model_path
197
+ pipe = SanaPipeline(args.config)
198
+ pipe.from_pretrained(model_path)
199
+ pipe.register_progress_bar(gr.Progress())
200
+
201
+ repo_name = "black-forest-labs/FLUX.1-dev"
202
+ pipe2 = FluxPipeline.from_pretrained(repo_name, torch_dtype=torch.float16).to("cuda")
203
+
204
+ # safety checker
205
+ safety_checker_tokenizer = AutoTokenizer.from_pretrained(args.shield_model_path)
206
+ safety_checker_model = AutoModelForCausalLM.from_pretrained(
207
+ args.shield_model_path,
208
+ device_map="auto",
209
+ torch_dtype=torch.bfloat16,
210
+ ).to(device)
211
+
212
+ set_env(42)
213
+
214
+
215
+ def save_image_sana(img, seed="", save_img=False):
216
+ unique_name = f"{str(uuid.uuid4())}_{seed}.png"
217
+ save_path = os.path.join(f"output/online_demo_img/{datetime.now().date()}")
218
+ os.umask(0o000) # file permission: 666; dir permission: 777
219
+ os.makedirs(save_path, exist_ok=True)
220
+ unique_name = os.path.join(save_path, unique_name)
221
+ if save_img:
222
+ save_image(img, unique_name, nrow=1, normalize=True, value_range=(-1, 1))
223
+
224
+ return unique_name
225
+
226
+
227
+ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
228
+ if randomize_seed:
229
+ seed = random.randint(0, MAX_SEED)
230
+ return seed
231
+
232
+
233
+ @spaces.GPU(enable_queue=True)
234
+ async def generate_2(
235
+ prompt: str = None,
236
+ negative_prompt: str = "",
237
+ style: str = DEFAULT_STYLE_NAME,
238
+ use_negative_prompt: bool = False,
239
+ num_imgs: int = 1,
240
+ seed: int = 0,
241
+ height: int = 1024,
242
+ width: int = 1024,
243
+ flow_dpms_guidance_scale: float = 5.0,
244
+ flow_dpms_pag_guidance_scale: float = 2.0,
245
+ flow_dpms_inference_steps: int = 20,
246
+ randomize_seed: bool = False,
247
+ ):
248
+ seed = int(randomize_seed_fn(seed, randomize_seed))
249
+ generator = torch.Generator(device=device).manual_seed(seed)
250
+ print(f"PORT: {DEMO_PORT}, model_path: {model_path}")
251
+ if safety_check.is_dangerous(safety_checker_tokenizer, safety_checker_model, prompt):
252
+ prompt = "A red heart."
253
+
254
+ print(prompt)
255
+
256
+ if not use_negative_prompt:
257
+ negative_prompt = None # type: ignore
258
+ prompt, negative_prompt = apply_style(style, prompt, negative_prompt)
259
+
260
+ with torch.no_grad():
261
+ images = pipe2(
262
+ prompt=prompt,
263
+ height=height,
264
+ width=width,
265
+ guidance_scale=3.5,
266
+ num_inference_steps=50,
267
+ num_images_per_prompt=num_imgs,
268
+ max_sequence_length=256,
269
+ generator=generator,
270
+ ).images
271
+
272
+ save_img = False
273
+ img = images
274
+ if save_img:
275
+ img = [save_image_sana(img, seed, save_img=save_img) for img in images]
276
+ print(img)
277
+ torch.cuda.empty_cache()
278
+
279
+ return img
280
+
281
+
282
+ @spaces.GPU(enable_queue=True)
283
+ async def generate(
284
+ prompt: str = None,
285
+ negative_prompt: str = "",
286
+ style: str = DEFAULT_STYLE_NAME,
287
+ use_negative_prompt: bool = False,
288
+ num_imgs: int = 1,
289
+ seed: int = 0,
290
+ height: int = 1024,
291
+ width: int = 1024,
292
+ flow_dpms_guidance_scale: float = 5.0,
293
+ flow_dpms_pag_guidance_scale: float = 2.0,
294
+ flow_dpms_inference_steps: int = 20,
295
+ randomize_seed: bool = False,
296
+ ):
297
+ global TEST_TIMES
298
+ # seed = 823753551
299
+ seed = int(randomize_seed_fn(seed, randomize_seed))
300
+ generator = torch.Generator(device=device).manual_seed(seed)
301
+ print(f"PORT: {DEMO_PORT}, model_path: {model_path}, time_times: {TEST_TIMES}")
302
+ if safety_check.is_dangerous(safety_checker_tokenizer, safety_checker_model, prompt):
303
+ prompt = "A red heart."
304
+
305
+ print(prompt)
306
+
307
+ num_inference_steps = flow_dpms_inference_steps
308
+ guidance_scale = flow_dpms_guidance_scale
309
+ pag_guidance_scale = flow_dpms_pag_guidance_scale
310
+
311
+ if not use_negative_prompt:
312
+ negative_prompt = None # type: ignore
313
+ prompt, negative_prompt = apply_style(style, prompt, negative_prompt)
314
+
315
+ pipe.progress_fn(0, desc="Sana Start")
316
+
317
+ with torch.no_grad():
318
+ images = pipe(
319
+ prompt=prompt,
320
+ height=height,
321
+ width=width,
322
+ negative_prompt=negative_prompt,
323
+ guidance_scale=guidance_scale,
324
+ pag_guidance_scale=pag_guidance_scale,
325
+ num_inference_steps=num_inference_steps,
326
+ num_images_per_prompt=num_imgs,
327
+ generator=generator,
328
+ )
329
+
330
+ pipe.progress_fn(1.0, desc="Sana End")
331
+
332
+ save_img = False
333
+ if save_img:
334
+ img = [save_image_sana(img, seed, save_img=save_img) for img in images]
335
+ print(img)
336
+ else:
337
+ if num_imgs > 1:
338
+ nrow = 2
339
+ else:
340
+ nrow = 1
341
+ img = make_grid(images, nrow=nrow, normalize=True, value_range=(-1, 1))
342
+ img = img.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()  # scale the [0, 1] grid from make_grid to an HWC uint8 array
343
+ img = [Image.fromarray(img.astype(np.uint8))]
344
+
345
+ torch.cuda.empty_cache()
346
+
347
+ return img
348
+
349
+
350
+ TEST_TIMES = read_inference_count()
351
+ model_size = "1.6" if "D20" in args.model_path else "0.6"
352
+ title = f"""
353
+ <div style='display: flex; align-items: center; justify-content: center; text-align: center;'>
354
+ <img src="https://raw.githubusercontent.com/NVlabs/Sana/refs/heads/main/asset/logo.png" width="50%" alt="logo"/>
355
+ </div>
356
+ """
357
+ DESCRIPTION = f"""
358
+ <p><span style="font-size: 36px; font-weight: bold;">Sana-{model_size}B</span><span style="font-size: 20px; font-weight: bold;">{args.image_size}px</span></p>
359
+ <p style="font-size: 16px; font-weight: bold;">Sana: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformer</p>
360
+ <p><span style="font-size: 16px;"><a href="https://arxiv.org/abs/2410.10629">[Paper]</a></span> <span style="font-size: 16px;"><a href="https://github.com/NVlabs/Sana">[GitHub (coming soon)]</a></span> <span style="font-size: 16px;"><a href="https://nvlabs.github.io/Sana">[Project]</a></span></p>
361
+ <p style="font-size: 16px; font-weight: bold;">Powered by <a href="https://hanlab.mit.edu/projects/dc-ae">DC-AE</a> with 32x latent space</p>
362
+ <p style="font-size: 16px; font-weight: bold;">Prompts flagged as unsafe will return a 'Red Heart' image instead.</p>
363
+ """
364
+ if model_size == "0.6":
365
+ DESCRIPTION += "\n<p>The 0.6B model's text-rendering ability is limited.</p>"
366
+ if not torch.cuda.is_available():
367
+ DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
368
+
369
+ examples = [
370
+ 'a cyberpunk cat with a neon sign that says "Sana"',
371
+ "A very detailed and realistic full body photo set of a tall, slim, and athletic Shiba Inu in a white oversized straight t-shirt, white shorts, and short white shoes.",
372
+ "Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.",
373
+ "portrait photo of a girl, photograph, highly detailed face, depth of field",
374
+ 'make me a logo that says "So Fast" with a really cool flying dragon shape with lightning sparks all over the sides and all of it contains Indonesian language',
375
+ "🐶 Wearing 🕶 flying on the 🌈",
376
+ # "👧 with 🌹 in the ❄️",
377
+ # "an old rusted robot wearing pants and a jacket riding skis in a supermarket.",
378
+ # "professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.",
379
+ # "Astronaut in a jungle, cold color palette, muted colors, detailed",
380
+ # "a stunning and luxurious bedroom carved into a rocky mountainside seamlessly blending nature with modern design with a plush earth-toned bed textured stone walls circular fireplace massive uniquely shaped window framing snow-capped mountains dense forests",
381
+ ]
382
+
383
+ css = """
384
+ .gradio-container{max-width: 1024px !important}
385
+ h1{text-align:center}
386
+ """
387
+ with gr.Blocks(css=css) as demo:
388
+ gr.Markdown(title)
389
+ gr.Markdown(DESCRIPTION)
390
+ gr.DuplicateButton(
391
+ value="Duplicate Space for private use",
392
+ elem_id="duplicate-button",
393
+ visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
394
+ )
395
+ info_box = gr.Markdown(
396
+ value=f"<span style='font-size: 16px; font-weight: bold;'>Total inference runs: </span><span style='font-size: 16px; color:red; font-weight: bold;'>{read_inference_count()}</span>"
397
+ )
398
+ demo.load(fn=update_inference_count, outputs=info_box) # update the value when re-loading the page
399
+ # with gr.Row(equal_height=False):
400
+ with gr.Group():
401
+ with gr.Row():
402
+ prompt = gr.Text(
403
+ label="Prompt",
404
+ show_label=False,
405
+ max_lines=1,
406
+ placeholder="Enter your prompt",
407
+ container=False,
408
+ )
409
+ run_button = gr.Button("Run-sana", scale=0)
410
+ run_button2 = gr.Button("Run-flux", scale=0)
411
+
412
+ with gr.Row():
413
+ result = gr.Gallery(label="Result from Sana", show_label=True, columns=NUM_IMAGES_PER_PROMPT, format="webp")
414
+ result_2 = gr.Gallery(
415
+ label="Result from FLUX", show_label=True, columns=NUM_IMAGES_PER_PROMPT, format="webp"
416
+ )
417
+
418
+ with gr.Accordion("Advanced options", open=False):
419
+ with gr.Group():
420
+ with gr.Row(visible=True):
421
+ height = gr.Slider(
422
+ label="Height",
423
+ minimum=256,
424
+ maximum=MAX_IMAGE_SIZE,
425
+ step=32,
426
+ value=1024,
427
+ )
428
+ width = gr.Slider(
429
+ label="Width",
430
+ minimum=256,
431
+ maximum=MAX_IMAGE_SIZE,
432
+ step=32,
433
+ value=1024,
434
+ )
435
+ with gr.Row():
436
+ flow_dpms_inference_steps = gr.Slider(
437
+ label="Sampling steps",
438
+ minimum=5,
439
+ maximum=40,
440
+ step=1,
441
+ value=18,
442
+ )
443
+ flow_dpms_guidance_scale = gr.Slider(
444
+ label="CFG Guidance scale",
445
+ minimum=1,
446
+ maximum=10,
447
+ step=0.1,
448
+ value=5.0,
449
+ )
450
+ flow_dpms_pag_guidance_scale = gr.Slider(
451
+ label="PAG Guidance scale",
452
+ minimum=1,
453
+ maximum=4,
454
+ step=0.5,
455
+ value=2.0,
456
+ )
457
+ with gr.Row():
458
+ use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=True)
459
+ negative_prompt = gr.Text(
460
+ label="Negative prompt",
461
+ max_lines=1,
462
+ placeholder="Enter a negative prompt",
463
+ visible=True,
464
+ )
465
+ style_selection = gr.Radio(
466
+ show_label=True,
467
+ container=True,
468
+ interactive=True,
469
+ choices=STYLE_NAMES,
470
+ value=DEFAULT_STYLE_NAME,
471
+ label="Image Style",
472
+ )
473
+ seed = gr.Slider(
474
+ label="Seed",
475
+ minimum=0,
476
+ maximum=MAX_SEED,
477
+ step=1,
478
+ value=0,
479
+ )
480
+ randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
481
+ with gr.Row(visible=True):
482
+ schedule = gr.Radio(
483
+ show_label=True,
484
+ container=True,
485
+ interactive=True,
486
+ choices=SCHEDULE_NAME,
487
+ value=DEFAULT_SCHEDULE_NAME,
488
+ label="Sampler Schedule",
489
+ visible=True,
490
+ )
491
+ num_imgs = gr.Slider(
492
+ label="Num Images",
493
+ minimum=1,
494
+ maximum=6,
495
+ step=1,
496
+ value=1,
497
+ )
498
+
499
+ run_button.click(fn=run_inference, inputs=num_imgs, outputs=info_box)
500
+
501
+ gr.Examples(
502
+ examples=examples,
503
+ inputs=prompt,
504
+ outputs=[result],
505
+ fn=generate,
506
+ cache_examples=CACHE_EXAMPLES,
507
+ )
508
+ gr.Examples(
509
+ examples=examples,
510
+ inputs=prompt,
511
+ outputs=[result_2],
512
+ fn=generate_2,
513
+ cache_examples=CACHE_EXAMPLES,
514
+ )
515
+
516
+ use_negative_prompt.change(
517
+ fn=lambda x: gr.update(visible=x),
518
+ inputs=use_negative_prompt,
519
+ outputs=negative_prompt,
520
+ api_name=False,
521
+ )
522
+
523
+ run_button.click(
524
+ fn=generate,
525
+ inputs=[
526
+ prompt,
527
+ negative_prompt,
528
+ style_selection,
529
+ use_negative_prompt,
530
+ num_imgs,
531
+ seed,
532
+ height,
533
+ width,
534
+ flow_dpms_guidance_scale,
535
+ flow_dpms_pag_guidance_scale,
536
+ flow_dpms_inference_steps,
537
+ randomize_seed,
538
+ ],
539
+ outputs=[result],
540
+ queue=True,
541
+ )
542
+
543
+ run_button2.click(
544
+ fn=generate_2,
545
+ inputs=[
546
+ prompt,
547
+ negative_prompt,
548
+ style_selection,
549
+ use_negative_prompt,
550
+ num_imgs,
551
+ seed,
552
+ height,
553
+ width,
554
+ flow_dpms_guidance_scale,
555
+ flow_dpms_pag_guidance_scale,
556
+ flow_dpms_inference_steps,
557
+ randomize_seed,
558
+ ],
559
+ outputs=[result_2],
560
+ queue=True,
561
+ )
562
+
563
+
564
+ if __name__ == "__main__":
565
+ demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=DEMO_PORT, debug=True, share=True)
app/safety_check.py ADDED
@@ -0,0 +1,72 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+ # This file is copied from https://github.com/mit-han-lab/hart/blob/main/hart/utils/safety_check.py
17
+ import json
18
+ from functools import lru_cache
19
+ from typing import Dict, Sequence
20
+
21
+ import torch
22
+ import transformers
23
+ from torch.nn.functional import softmax
24
+ from transformers import AutoModelForCausalLM, AutoTokenizer
25
+
26
+ __all__ = ["is_dangerous"]
27
+
28
+ __rank, __local_rank, __world_size, __device = (
29
+ 0,
30
+ 0,
31
+ 1,
32
+ "cuda" if torch.cuda.is_available() else "cpu",
33
+ )
34
+
35
+
36
+ safety_policy = """
37
+ The prompt must not seek to generate harmful, abusive, hateful, sexually explicit, or otherwise inappropriate content targeting individuals or protected groups.
38
+ """
39
+
40
+
41
+ def is_dangerous(tokenizer, model, user_prompt, threshold=0.5):
42
+ prompt = f"""You are a policy expert trying to help determine whether a user
43
+ prompt is in violation of the defined safety policies.
44
+
45
+ <start_of_turn>
46
+ Human Question: {user_prompt.strip()}
47
+ <end_of_turn>
48
+
49
+ Our safety principle is defined in the below:
50
+
51
+ {safety_policy.strip()}
52
+
53
+ Does the human question violate the above principle? Your answer must start
54
+ with 'Yes' or 'No'. And then walk through step by step to be sure we answer
55
+ correctly.
56
+ """
57
+
58
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)  # follow the shield model's device instead of assuming CUDA
59
+ with torch.no_grad():
60
+ logits = model(**inputs).logits
61
+
62
+ # Extract the logits for the Yes and No tokens
63
+ vocab = tokenizer.get_vocab()
64
+ selected_logits = logits[0, -1, [vocab["Yes"], vocab["No"]]]
65
+
66
+ # Convert these logits to a probability with softmax
67
+ probabilities = softmax(selected_logits, dim=0)
68
+
69
+ # Return probability of 'Yes'
70
+ score = probabilities[0].item()
71
+
72
+ return score > threshold
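+
+ # Illustrative usage sketch, mirroring how the Gradio demos wire this up
+ # (model/tokenizer names follow the demos' default ShieldGemma-2B checkpoint):
+ #   tokenizer = AutoTokenizer.from_pretrained("google/shieldgemma-2b")
+ #   model = AutoModelForCausalLM.from_pretrained("google/shieldgemma-2b", torch_dtype=torch.bfloat16, device_map="auto")
+ #   if is_dangerous(tokenizer, model, user_prompt, threshold=0.5):
+ #       user_prompt = "A red heart."  # the demos fall back to a safe prompt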
app/sana_pipeline.py ADDED
@@ -0,0 +1,324 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+ import argparse
17
+ import warnings
18
+ from dataclasses import dataclass, field
19
+ from typing import Optional, Tuple
20
+
21
+ import pyrallis
22
+ import torch
23
+ import torch.nn as nn
24
+
25
+ warnings.filterwarnings("ignore") # ignore warning
26
+
27
+
28
+ from diffusion import DPMS, FlowEuler
29
+ from diffusion.data.datasets.utils import ASPECT_RATIO_512_TEST, ASPECT_RATIO_1024_TEST, ASPECT_RATIO_2048_TEST
30
+ from diffusion.model.builder import build_model, get_tokenizer_and_text_encoder, get_vae, vae_decode
31
+ from diffusion.model.utils import prepare_prompt_ar, resize_and_crop_tensor
32
+ from diffusion.utils.config import SanaConfig
33
+ from diffusion.utils.logger import get_root_logger
34
+
35
+ # from diffusion.utils.misc import read_config
36
+ from tools.download import find_model
37
+
38
+
39
+ def guidance_type_select(default_guidance_type, pag_scale, attn_type):
40
+ guidance_type = default_guidance_type
41
+ if not (pag_scale > 1.0 and attn_type == "linear"):
42
+ guidance_type = "classifier-free"
43
+ return guidance_type
44
+
45
+
46
+ def classify_height_width_bin(height: int, width: int, ratios: dict) -> Tuple[int, int]:
47
+ """Returns binned height and width."""
48
+ ar = float(height / width)
49
+ closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))
50
+ default_hw = ratios[closest_ratio]
51
+ return int(default_hw[0]), int(default_hw[1])
52
+
53
+
54
+ @dataclass
55
+ class SanaInference(SanaConfig):
56
+ config: Optional[str] = "configs/sana_config/1024ms/Sana_1600M_img1024.yaml" # config
57
+ model_path: str = field(
58
+ default="output/Sana_D20/SANA.pth", metadata={"help": "Path to the model checkpoint (.pth) file"}
59
+ )
60
+ output: str = "./output"
61
+ bs: int = 1
62
+ image_size: int = 1024
63
+ cfg_scale: float = 5.0
64
+ pag_scale: float = 2.0
65
+ seed: int = 42
66
+ step: int = -1
67
+ custom_image_size: Optional[int] = None
68
+ shield_model_path: str = field(
69
+ default="google/shieldgemma-2b",
70
+ metadata={"help": "The path to shield model, we employ ShieldGemma-2B by default."},
71
+ )
72
+
73
+
74
+ class SanaPipeline(nn.Module):
75
+ def __init__(
76
+ self,
77
+ config: Optional[str] = "configs/sana_config/1024ms/Sana_1600M_img1024.yaml",
78
+ ):
79
+ super().__init__()
80
+ config = pyrallis.load(SanaInference, open(config))
81
+ self.args = self.config = config
82
+
83
+ # set some hyper-parameters
84
+ self.image_size = self.config.model.image_size
85
+
86
+ self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
87
+ logger = get_root_logger()
88
+ self.logger = logger
89
+ self.progress_fn = lambda progress, desc: None
90
+
91
+ self.latent_size = self.image_size // config.vae.vae_downsample_rate
92
+ self.max_sequence_length = config.text_encoder.model_max_length
93
+ self.flow_shift = config.scheduler.flow_shift
94
+ guidance_type = "classifier-free_PAG"
95
+
96
+ if config.model.mixed_precision == "fp16":
97
+ weight_dtype = torch.float16
98
+ elif config.model.mixed_precision == "bf16":
99
+ weight_dtype = torch.bfloat16
100
+ elif config.model.mixed_precision == "fp32":
101
+ weight_dtype = torch.float32
102
+ else:
103
+ raise ValueError(f"weigh precision {config.model.mixed_precision} is not defined")
104
+ self.weight_dtype = weight_dtype
105
+
106
+ self.base_ratios = eval(f"ASPECT_RATIO_{self.image_size}_TEST")
107
+ self.vis_sampler = self.config.scheduler.vis_sampler
108
+ logger.info(f"Sampler {self.vis_sampler}, flow_shift: {self.flow_shift}")
109
+ self.guidance_type = guidance_type_select(guidance_type, self.args.pag_scale, config.model.attn_type)
110
+ logger.info(f"Inference with {self.weight_dtype}, PAG guidance layer: {self.config.model.pag_applied_layers}")
111
+
112
+ # 1. build vae and text encoder
113
+ self.vae = self.build_vae(config.vae)
114
+ self.tokenizer, self.text_encoder = self.build_text_encoder(config.text_encoder)
115
+
116
+ # 2. build Sana model
117
+ self.model = self.build_sana_model(config).to(self.device)
118
+
119
+ # 3. pre-compute null embedding
120
+ with torch.no_grad():
121
+ null_caption_token = self.tokenizer(
122
+ "", max_length=self.max_sequence_length, padding="max_length", truncation=True, return_tensors="pt"
123
+ ).to(self.device)
124
+ self.null_caption_embs = self.text_encoder(null_caption_token.input_ids, null_caption_token.attention_mask)[
125
+ 0
126
+ ]
127
+
128
+ def build_vae(self, config):
129
+ vae = get_vae(config.vae_type, config.vae_pretrained, self.device).to(self.weight_dtype)
130
+ return vae
131
+
132
+ def build_text_encoder(self, config):
133
+ tokenizer, text_encoder = get_tokenizer_and_text_encoder(name=config.text_encoder_name, device=self.device)
134
+ return tokenizer, text_encoder
135
+
136
+ def build_sana_model(self, config):
137
+ # model setting
138
+ pred_sigma = getattr(config.scheduler, "pred_sigma", True)
139
+ learn_sigma = getattr(config.scheduler, "learn_sigma", True) and pred_sigma
140
+ model_kwargs = {
141
+ "input_size": self.latent_size,
142
+ "pe_interpolation": config.model.pe_interpolation,
143
+ "config": config,
144
+ "model_max_length": config.text_encoder.model_max_length,
145
+ "qk_norm": config.model.qk_norm,
146
+ "micro_condition": config.model.micro_condition,
147
+ "caption_channels": self.text_encoder.config.hidden_size,
148
+ "y_norm": config.text_encoder.y_norm,
149
+ "attn_type": config.model.attn_type,
150
+ "ffn_type": config.model.ffn_type,
151
+ "mlp_ratio": config.model.mlp_ratio,
152
+ "mlp_acts": list(config.model.mlp_acts),
153
+ "in_channels": config.vae.vae_latent_dim,
154
+ "y_norm_scale_factor": config.text_encoder.y_norm_scale_factor,
155
+ "use_pe": config.model.use_pe,
156
+ "pred_sigma": pred_sigma,
157
+ "learn_sigma": learn_sigma,
158
+ "use_fp32_attention": config.model.get("fp32_attention", False) and config.model.mixed_precision != "bf16",
159
+ }
160
+ model = build_model(config.model.model, **model_kwargs)
161
+ model = model.to(self.weight_dtype)
162
+
163
+ self.logger.info(f"use_fp32_attention: {model.fp32_attention}")
164
+ self.logger.info(
165
+ f"{model.__class__.__name__}:{config.model.model},"
166
+ f"Model Parameters: {sum(p.numel() for p in model.parameters()):,}"
167
+ )
168
+ return model
169
+
170
+ def from_pretrained(self, model_path):
171
+ state_dict = find_model(model_path)
172
+ state_dict = state_dict.get("state_dict", state_dict)
173
+ if "pos_embed" in state_dict:
174
+ del state_dict["pos_embed"]
175
+ missing, unexpected = self.model.load_state_dict(state_dict, strict=False)
176
+ self.model.eval().to(self.weight_dtype)
177
+
178
+ self.logger.info("Generating sample from ckpt: %s" % model_path)
179
+ self.logger.warning(f"Missing keys: {missing}")
180
+ self.logger.warning(f"Unexpected keys: {unexpected}")
181
+
182
+ def register_progress_bar(self, progress_fn=None):
183
+ self.progress_fn = progress_fn if progress_fn is not None else self.progress_fn
184
+
185
+ @torch.inference_mode()
186
+ def forward(
187
+ self,
188
+ prompt=None,
189
+ height=1024,
190
+ width=1024,
191
+ negative_prompt="",
192
+ num_inference_steps=20,
193
+ guidance_scale=5,
194
+ pag_guidance_scale=2.5,
195
+ num_images_per_prompt=1,
196
+ generator=torch.Generator().manual_seed(42),
197
+ latents=None,
198
+ ):
199
+ self.ori_height, self.ori_width = height, width
200
+ self.height, self.width = classify_height_width_bin(height, width, ratios=self.base_ratios)
201
+ self.latent_size_h, self.latent_size_w = (
202
+ self.height // self.config.vae.vae_downsample_rate,
203
+ self.width // self.config.vae.vae_downsample_rate,
204
+ )
205
+ self.guidance_type = guidance_type_select(self.guidance_type, pag_guidance_scale, self.config.model.attn_type)
206
+
207
+ # 1. pre-compute negative embedding
208
+ if negative_prompt != "":
209
+ null_caption_token = self.tokenizer(
210
+ negative_prompt,
211
+ max_length=self.max_sequence_length,
212
+ padding="max_length",
213
+ truncation=True,
214
+ return_tensors="pt",
215
+ ).to(self.device)
216
+ self.null_caption_embs = self.text_encoder(null_caption_token.input_ids, null_caption_token.attention_mask)[
217
+ 0
218
+ ]
219
+
220
+ if prompt is None:
221
+ prompt = [""]
222
+ prompts = prompt if isinstance(prompt, list) else [prompt]
223
+ samples = []
224
+
225
+ for prompt in prompts:
226
+ # data prepare
227
+ prompts, hw, ar = (
228
+ [],
229
+ torch.tensor([[self.image_size, self.image_size]], dtype=torch.float, device=self.device).repeat(
230
+ num_images_per_prompt, 1
231
+ ),
232
+ torch.tensor([[1.0]], device=self.device).repeat(num_images_per_prompt, 1),
233
+ )
234
+ for _ in range(num_images_per_prompt):
235
+ with torch.no_grad():
236
+ prompts.append(
237
+ prepare_prompt_ar(prompt, self.base_ratios, device=self.device, show=False)[0].strip()
238
+ )
239
+
240
+ # prepare text feature
241
+ if not self.config.text_encoder.chi_prompt:
242
+ max_length_all = self.config.text_encoder.model_max_length
243
+ prompts_all = prompts
244
+ else:
245
+ chi_prompt = "\n".join(self.config.text_encoder.chi_prompt)
246
+ prompts_all = [chi_prompt + prompt for prompt in prompts]
247
+ num_chi_prompt_tokens = len(self.tokenizer.encode(chi_prompt))
248
+ max_length_all = (
249
+ num_chi_prompt_tokens + self.config.text_encoder.model_max_length - 2
250
+ ) # magic number 2: [bos], [_]
251
+
252
+ caption_token = self.tokenizer(
253
+ prompts_all,
254
+ max_length=max_length_all,
255
+ padding="max_length",
256
+ truncation=True,
257
+ return_tensors="pt",
258
+ ).to(device=self.device)
259
+ select_index = [0] + list(range(-self.config.text_encoder.model_max_length + 1, 0))
260
+ caption_embs = self.text_encoder(caption_token.input_ids, caption_token.attention_mask)[0][:, None][
261
+ :, :, select_index
262
+ ].to(self.weight_dtype)
263
+ emb_masks = caption_token.attention_mask[:, select_index]
264
+ null_y = self.null_caption_embs.repeat(len(prompts), 1, 1)[:, None].to(self.weight_dtype)
265
+
266
+ n = len(prompts)
267
+ if latents is None:
268
+ z = torch.randn(
269
+ n,
270
+ self.config.vae.vae_latent_dim,
271
+ self.latent_size_h,
272
+ self.latent_size_w,
273
+ generator=generator,
274
+ device=self.device,
275
+ dtype=self.weight_dtype,
276
+ )
277
+ else:
278
+ z = latents.to(self.weight_dtype).to(self.device)
279
+ model_kwargs = dict(data_info={"img_hw": hw, "aspect_ratio": ar}, mask=emb_masks)
280
+ if self.vis_sampler == "flow_euler":
281
+ flow_solver = FlowEuler(
282
+ self.model,
283
+ condition=caption_embs,
284
+ uncondition=null_y,
285
+ cfg_scale=guidance_scale,
286
+ model_kwargs=model_kwargs,
287
+ )
288
+ sample = flow_solver.sample(
289
+ z,
290
+ steps=num_inference_steps,
291
+ )
292
+ elif self.vis_sampler == "flow_dpm-solver":
293
+ scheduler = DPMS(
294
+ self.model,
295
+ condition=caption_embs,
296
+ uncondition=null_y,
297
+ guidance_type=self.guidance_type,
298
+ cfg_scale=guidance_scale,
299
+ pag_scale=pag_guidance_scale,
300
+ pag_applied_layers=self.config.model.pag_applied_layers,
301
+ model_type="flow",
302
+ model_kwargs=model_kwargs,
303
+ schedule="FLOW",
304
+ )
305
+ scheduler.register_progress_bar(self.progress_fn)
306
+ sample = scheduler.sample(
307
+ z,
308
+ steps=num_inference_steps,
309
+ order=2,
310
+ skip_type="time_uniform_flow",
311
+ method="multistep",
312
+ flow_shift=self.flow_shift,
313
+ )
314
+
315
+ sample = sample.to(self.weight_dtype)
316
+ with torch.no_grad():
317
+ sample = vae_decode(self.config.vae.vae_type, self.vae, sample)
318
+
319
+ sample = resize_and_crop_tensor(sample, self.ori_width, self.ori_height)
320
+ samples.append(sample)
321
+
322
+ return sample  # note: this returns before the "return samples" below, which is unreachable
323
+
324
+ return samples
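+
+ # Illustrative usage sketch, following how the Gradio demos drive this pipeline
+ # (config and checkpoint paths are the defaults used elsewhere in this repo):
+ #   pipe = SanaPipeline("configs/sana_config/1024ms/Sana_1600M_img1024.yaml")
+ #   pipe.from_pretrained("output/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth")
+ #   images = pipe(prompt="a cyberpunk cat with a neon sign that says \"Sana\"",
+ #                 height=1024, width=1024, guidance_scale=5.0,
+ #                 pag_guidance_scale=2.0, num_inference_steps=20)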
asset/Sana.jpg ADDED

Git LFS Details

  • SHA256: 1a10d77cfe5a1a703c2cb801d0f3fe9fa32a05c60dfff22b0bc7a479980df61c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.16 MB
asset/docs/metrics_toolkit.md ADDED
@@ -0,0 +1,118 @@
1
+ # 💻 How to Run Inference & Test Metrics (FID, CLIP Score, GenEval, DPG-Bench, etc.)
2
+
3
+ This toolkit automatically runs inference with your model and logs the metric results to wandb as charts for easier comparison. We currently support:
4
+
5
+ - \[x\] [FID](https://github.com/mseitzer/pytorch-fid) & [CLIP-Score](https://github.com/openai/CLIP)
6
+ - \[x\] [GenEval](https://github.com/djghosh13/geneval)
7
+ - \[x\] [DPG-Bench](https://github.com/TencentQQGYLab/ELLA)
8
+ - \[x\] [ImageReward](https://github.com/THUDM/ImageReward/tree/main)
9
+
10
+ ### 0. Install the corresponding environments for GenEval and DPG-Bench
11
+
12
+ Make sure you can activate the following envs:
13
+
14
+ - `conda activate geneval`([GenEval](https://github.com/djghosh13/geneval))
15
+ - `conda activate dpg`([DPG-Bench](https://github.com/TencentQQGYLab/ELLA))
16
+
17
+ ### 0.1 Prepare data.
18
+
19
+ FID & CLIP-Score are measured on [MJHQ-30K](https://huggingface.co/datasets/playgroundai/MJHQ-30K).
20
+
21
+ ```python
22
+ from huggingface_hub import hf_hub_download
23
+
24
+ hf_hub_download(
25
+ repo_id="playgroundai/MJHQ-30K",
26
+ filename="mjhq30k_imgs.zip",
27
+ local_dir="data/test/PG-eval-data/MJHQ-30K/",
28
+ repo_type="dataset"
29
+ )
30
+ ```
31
+
32
+ Unzip `mjhq30k_imgs.zip` into its per-category folder structure (a minimal Python extraction sketch follows the tree below).
33
+
34
+ ```
35
+ data/test/PG-eval-data/MJHQ-30K/imgs/
36
+ ├── animals
37
+ ├── art
38
+ ├── fashion
39
+ ├── food
40
+ ├── indoor
41
+ ├── landscape
42
+ ├── logo
43
+ ├── people
44
+ ├── plants
45
+ └── vehicles
46
+ ```
47
+
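+ A minimal extraction sketch using Python's standard `zipfile` module (adjust the target directory if the archive already contains a top-level `imgs/` folder):
+
+ ```python
+ import zipfile
+
+ zip_path = "data/test/PG-eval-data/MJHQ-30K/mjhq30k_imgs.zip"
+ with zipfile.ZipFile(zip_path) as zf:
+     zf.extractall("data/test/PG-eval-data/MJHQ-30K/imgs/")
+ ```
+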
48
+ ### 0.2 Prepare checkpoints
49
+
50
+ ```bash
51
+ huggingface-cli download Efficient-Large-Model/Sana_1600M_1024px --repo-type model --local-dir ./output/Sana_1600M_1024px --local-dir-use-symlinks False
52
+ ```
53
+
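+ If you prefer staying in Python, a roughly equivalent sketch with `huggingface_hub.snapshot_download` (same target directory as above):
+
+ ```python
+ from huggingface_hub import snapshot_download
+
+ snapshot_download(
+     repo_id="Efficient-Large-Model/Sana_1600M_1024px",
+     local_dir="./output/Sana_1600M_1024px",
+ )
+ ```
+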
54
+ ### 1. Directly \[Inference and Metric\] a .pth file
55
+
56
+ ```bash
57
+ # We provide four scripts for evaluating metrics:
58
+ fid_clipscore_launch=scripts/bash_run_inference_metric.sh
59
+ geneval_launch=scripts/bash_run_inference_metric_geneval.sh
60
+ dpg_launch=scripts/bash_run_inference_metric_dpg.sh
61
+ image_reward_launch=scripts/bash_run_inference_metric_imagereward.sh
62
+
63
+ # Use the following format to evaluate your models:
64
+ # bash $corresponding_metric_launch $your_config_file_path $your_relative_pth_file_path
65
+
66
+ # example
67
+ bash $geneval_launch \
68
+ configs/sana_config/1024ms/Sana_1600M_img1024.yaml \
69
+ output/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth
70
+ ```
71
+
72
+ ### 2. \[Inference and Metric\] a list of .pth files using a txt file
73
+
74
+ You can also list all the .pth files of a job in one txt file (one relative path per line), e.g. [model_paths.txt](../model_paths.txt).
75
+
76
+ ```bash
77
+ # Use the following format to evaluate your models, gathered in a txt file:
78
+ # bash $corresponding_metric_launch $your_config_file_path $your_txt_file_path_containing_pth_path
79
+
80
+ # We suggest following the file tree structure in our project for reproducible experiments
81
+ # example
82
+ bash scripts/bash_run_inference_metric.sh \
83
+ configs/sana_config/1024ms/Sana_1600M_img1024.yaml \
84
+ asset/model_paths.txt
85
+ ```
86
+
87
+ ### 3. You will get the following data tree.
88
+
89
+ ```
90
+ output
91
+ ├──your_job_name/ (everything will be saved here)
92
+ │ ├──config.yaml
93
+ │ ├──train_log.log
94
+
95
+ │ ├──checkpoints (all checkpoints)
96
+ │ │ ├──epoch_1_step_6666.pth
97
+ │ │ ├──epoch_1_step_8888.pth
98
+ │ │ ├──......
99
+
100
+ │ ├──vis (all visualization result dirs)
101
+ │ │ ├──visualization_file_name
102
+ │ │ │ ├──xxxxxxx.jpg
103
+ │ │ │ ├──......
104
+ │ │ ├──visualization_file_name2
105
+ │ │ │ ├──xxxxxxx.jpg
106
+ │ │ │ ├──......
107
+ │ ├──......
108
+
109
+ │ ├──metrics (all metrics testing related files)
110
+ │ │ ├──model_paths.txt Optional(👈)(relative path of testing ckpts)
111
+ │ │ │ ├──output/your_job_name/checkpoints/epoch_1_step_6666.pth
112
+ │ │ │ ├──output/your_job_name/checkpoints/epoch_1_step_8888.pth
113
+ │ │ ├──fid_img_paths.txt Optional(👈)(name of testing img_dir in vis)
114
+ │ │ │ ├──visualization_file_name
115
+ │ │ │ ├──visualization_file_name2
116
+ │ │ ├──cached_img_paths.txt Optional(👈)
117
+ │ │ ├──......
118
+ ```
asset/example_data/00000000.txt ADDED
@@ -0,0 +1 @@
1
+ a cyberpunk cat with a neon sign that says "Sana".
asset/examples.py ADDED
@@ -0,0 +1,69 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ examples = [
18
+ [
19
+ "A small cactus with a happy face in the Sahara desert.",
20
+ "flow_dpm-solver",
21
+ 20,
22
+ 5.0,
23
+ 2.5,
24
+ ],
25
+ [
26
+ "An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in thought pondering the history"
27
+ "of the universe as he sits at a cafe in Paris, his eyes focus on people offscreen as they walk as he sits "
28
+ "mostly motionless, he is dressed in a wool coat suit coat with a button-down shirt, he wears a brown beret "
29
+ "and glasses and has a very professorial appearance, and the end he offers a subtle closed-mouth smile "
30
+ "as if he found the answer to the mystery of life, the lighting is very cinematic with the golden light and "
31
+ "the Parisian streets and city in the background, depth of field, cinematic 35mm film.",
32
+ "flow_dpm-solver",
33
+ 20,
34
+ 5.0,
35
+ 2.5,
36
+ ],
37
+ [
38
+ "An illustration of a human heart made of translucent glass, standing on a pedestal amidst a stormy sea. "
39
+ "Rays of sunlight pierce the clouds, illuminating the heart, revealing a tiny universe within. "
40
+ "The quote 'Find the universe within you' is etched in bold letters across the horizon."
41
+ "blue and pink, brilliantly illuminated in the background.",
42
+ "flow_dpm-solver",
43
+ 20,
44
+ 5.0,
45
+ 2.5,
46
+ ],
47
+ [
48
+ "A transparent sculpture of a duck made out of glass. The sculpture is in front of a painting of a landscape.",
49
+ "flow_dpm-solver",
50
+ 20,
51
+ 5.0,
52
+ 2.5,
53
+ ],
54
+ [
55
+ "A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in.",
56
+ "flow_dpm-solver",
57
+ 20,
58
+ 5.0,
59
+ 2.5,
60
+ ],
61
+ [
62
+ "a kayak in the water, in the style of optical color mixing, aerial view, rainbowcore, "
63
+ "national geographic photo, 8k resolution, crayon art, interactive artwork",
64
+ "flow_dpm-solver",
65
+ 20,
66
+ 5.0,
67
+ 2.5,
68
+ ],
69
+ ]
asset/model-incremental.jpg ADDED
asset/model_paths.txt ADDED
@@ -0,0 +1,2 @@
1
+ output/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth
2
+ output/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth
asset/samples.txt ADDED
@@ -0,0 +1,125 @@
1
+ A small cactus with a happy face in the Sahara desert.
2
+ Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.
3
+ beautiful lady, freckles, big smile, blue eyes, short ginger hair, dark makeup, wearing a floral blue vest top, soft light, dark grey background
4
+ stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, blue and pink, brilliantly illuminated in the background.
5
+ nature vs human nature, surreal, UHD, 8k, hyper details, rich colors, photograph.
6
+ Spectacular Tiny World in the Transparent Jar On the Table, interior of the Great Hall, Elaborate, Carved Architecture, Anatomy, Symetrical, Geometric and Parameteric Details, Precision Flat line Details, Pattern, Dark fantasy, Dark errie mood and ineffably mysterious mood, Technical design, Intricate Ultra Detail, Ornate Detail, Stylized and Futuristic and Biomorphic Details, Architectural Concept, Low contrast Details, Cinematic Lighting, 8k, by moebius, Fullshot, Epic, Fullshot, Octane render, Unreal ,Photorealistic, Hyperrealism
7
+ anthropomorphic profile of the white snow owl Crystal priestess , art deco painting, pretty and expressive eyes, ornate costume, mythical, ethereal, intricate, elaborate, hyperrealism, hyper detailed, 3D, 8K, Ultra Realistic, high octane, ultra resolution, amazing detail, perfection, In frame, photorealistic, cinematic lighting, visual clarity, shading , Lumen Reflections, Super-Resolution, gigapixel, color grading, retouch, enhanced, PBR, Blender, V-ray, Procreate, zBrush, Unreal Engine 5, cinematic, volumetric, dramatic, neon lighting, wide angle lens ,no digital painting blur
8
+ The parametric hotel lobby is a sleek and modern space with plenty of natural light. The lobby is spacious and open with a variety of seating options. The front desk is a sleek white counter with a parametric design. The walls are a light blue color with parametric patterns. The floor is a light wood color with a parametric design. There are plenty of plants and flowers throughout the space. The overall effect is a calm and relaxing space. occlusion, moody, sunset, concept art, octane rendering, 8k, highly detailed, concept art, highly detailed, beautiful scenery, cinematic, beautiful light, hyperreal, octane render, hdr, long exposure, 8K, realistic, fog, moody, fire and explosions, smoke, 50mm f2.8
9
+ Bright scene, aerial view, ancient city, fantasy, gorgeous light, mirror reflection, high detail, wide angle lens.
10
+ 8k uhd A man looks up at the starry sky, lonely and ethereal, Minimalism, Chaotic composition Op Art
11
+ A middle-aged woman of Asian descent, her dark hair streaked with silver, appears fractured and splintered, intricately embedded within a sea of broken porcelain. The porcelain glistens with splatter paint patterns in a harmonious blend of glossy and matte blues, greens, oranges, and reds, capturing her dance in a surreal juxtaposition of movement and stillness. Her skin tone, a light hue like the porcelain, adds an almost mystical quality to her form.
12
+ A 4k dslr image of a lemur wearing a red magician hat and a blue coat performing magic tricks with cards in a garden.
13
+ A alpaca made of colorful building blocks, cyberpunk
14
+ A baby painter trying to draw very simple picture, white background
15
+ A boy and a girl fall in love
16
+ A dog that has been meditating all the time
17
+ A man is sitting in a chair with his chin resting on his hand. The chair, along with the man's feet, are submerged in the sea. Strikingly, the man's back is on fire.
18
+ A painter study hard to learn how to draw with many concepts in the air, white background
19
+ A painter with low quality, white background, pixel art
20
+ A person standing on the desert, desert waves, gossip illustration, half red, half blue, abstract image of sand, clear style, trendy illustration, outdoor, top view, clear style, precision art, ultra high definition image
21
+ A silhouette of a grand piano overlooking a dusky cityscape viewed from a top-floor penthouse, rendered in the bold and vivid sytle of a vintage travel poster.
22
+ A sureal parallel world where mankind avoid extinction by preserving nature, epic trees, water streams, various flowers, intricate details, rich colors, rich vegetation, cinematic, symmetrical, beautiful lighting, V-Ray render, sun rays, magical lights, photography
23
+ A woman is shopping for fresh produce at the farmer's market.
24
+ A worker that looks like a mixture of cow and horse is working hard to type code
25
+ A young man dressed in ancient Chinese clothing, Asian people, White robe, Handsome, Hand gestures forming a spell, Martial arts and fairy-like vibe, Carrying a legendary-level giant sword on the back, Game character, Surrounded by runes, Cyberpunk style, neon lights, best quality, masterpiece, cg, hdr, high-definition, extremely detailed, photorealistic, epic, character design, detailed face, superhero, hero, detailed UHD, real-time, vfx, 3D rendering, 8k
26
+ An alien octopus floats through a protal reading a newspaper
27
+ An epressive oil painting of a basketbal player dunking, depicted as an explosion of a nebula
28
+ art collection style and fashion shoot, in the style of made of glass, dark blue and light pink, paul rand, solarpunk, camille vivier, beth didonato hair, barbiecore, hyper-realistic
29
+ artistic
30
+ beautiful secen
31
+ Crocodile in a sweater
32
+ Design a letter A, 3D stereoscopic Ice material Interior light blue Conceptual product design Futuristic Blind box toy Handcrafted Exquisite 3D effect Full body display Ultra-high precision Ultra-detailed Perfect lighting OC Renderer Blender 8k Ultra-sharp Ultra-noise reduction
33
+ Floating,colossal,futuristic statue in the sky, awe-inspiring and serenein the style of Stuart Lippincott:2with detailed composition and subtle geometric elements.This sanctuary-ike atmosphere features crisp clarity and soft amber tones.In contrasttiny human figures surround the statueThe pieceincorporates flowing draperiesreminiscent of Shwedoff and Philip McKay's stylesemphasizing thejuxtaposition between the powerful presence of the statue and thevulnerability of the minuscule human figuresshwedoff
34
+ knolling of a drawing tools for painter
35
+ Leonardo da Vinci's Last Supper content, Van Goph's Starry Night Style
36
+ Luffy from ONEPIECE, handsome face, fantasy
37
+ photography shot through an outdoor window of a coffee shop with neon sign lighting, window glares and reflections, depth of field, {little girl with red hair sitting at a table, portrait, kodak portra 800,105 mm f1.8
38
+ poster of a mechanical cat, techical Schematics viewed from front and side view on light white blueprint paper, illustartion drafting style, illustation, typography, conceptual art, dark fantasy steampunk, cinematic, dark fantasy
39
+ The girl in the car is filled with goldfish and flowers, goldfish can fly, Kawaguchi Renko's art, natural posture, holiday dadcore, youthful energy and pressure, body stretching, goldfish simulation movies in the sky, super details, and dreamy high photography. Colorful. Covered by water and goldfish, indoor scene, close-up shot in XT4 movie
40
+ The image features a woman wearing a red shirt with an icon. She appears to be posing for the camera, and her outfit includes a pair of jeans. The woman seems to be in a good mood, as she is smiling. The background of the image is blurry, focusing more on the woman and her attire.
41
+ The towel was on top of the hard counter.
42
+ A vast landscape made entirely of various meats spreads out before the viewer. tender, succulent hills of roast beef, chicken drumstick trees, bacon rivers, and ham boulders create a surreal, yet appetizing scene. the sky is adorned with pepperoni sun and salami clouds.
43
+ I want to supplement vitamin c, please help me paint related food.
44
+ A vibrant yellow banana-shaped couch sits in a cozy living room, its curve cradling a pile of colorful cushions. on the wooden floor, a patterned rug adds a touch of eclectic charm, and a potted plant sits in the corner, reaching towards the sunlight filtering through the window.
45
+ A transparent sculpture of a duck made out of glass. The sculpture is in front of a painting of a landscape.
46
+ A blue jay standing on a large basket of rainbow macarons.
47
+ A bucket bag made of blue suede. The bag is decorated with intricate golden paisley patterns. The handle of the bag is made of rubies and pearls.
48
+ An alien octopus floats through a portal reading a newspaper.
49
+ bird's eye view of a city.
50
+ beautiful scene
51
+ A 2D animation of a folk music band composed of anthropomorphic autumn leaves, each playing traditional bluegrass instruments, amidst a rustic forest setting dappled with the soft light of a harvest moon.
52
+ In front of a deep black backdrop, a figure of middle years, her Tongan skin rich and glowing, is captured mid-twirl, her curly hair flowing like a storm behind her. Her attire resembles a whirlwind of marble and porcelain fragments. Illuminated by the gleam of scattered porcelain shards, creating a dreamlike atmosphere, the dancer manages to appear fragmented, yet maintains a harmonious and fluid form.
53
+ Digital illustration of a beach scene crafted from yarn. The sandy beach is depicted with beige yarn, waves are made of blue and white yarn crashing onto the shore. A yarn sun sets on the horizon, casting a warm glow. Yarn palm trees sway gently, and little yarn seashells dot the shoreline.
54
+ Illustration of a chic chair with a design reminiscent of a pumpkin’s form, with deep orange cushioning, in a stylish loft setting.
55
+ A detailed oil painting of an old sea captain, steering his ship through a storm. Saltwater is splashing against his weathered face, determination in his eyes. Twirling malevolent clouds are seen above and stern waves threaten to submerge the ship while seagulls dive and twirl through the chaotic landscape. Thunder and lights embark in the distance, illuminating the scene with an eerie green glow.
56
+ An illustration of a human heart made of translucent glass, standing on a pedestal amidst a stormy sea. Rays of sunlight pierce the clouds, illuminating the heart, revealing a tiny universe within. The quote 'Find the universe within you' is etched in bold letters across the horizon.
57
+ A modern architectural building with large glass windows, situated on a cliff overlooking a serene ocean at sunset
58
+ photo of an ancient shipwreck nestled on the ocean floor. Marine plants have claimed the wooden structure, and fish swim in and out of its hollow spaces. Sunken treasures and old cannons are scattered around, providing a glimpse into the past
59
+ A 3D render of a coffee mug placed on a window sill during a stormy day. The storm outside the window is reflected in the coffee, with miniature lightning bolts and turbulent waves seen inside the mug. The room is dimly lit, adding to the dramatic atmosphere.A minimap diorama of a cafe adorned with indoor plants. Wooden beams crisscross above, and a cold brew station stands out with tiny bottles and glasses.
60
+ An antique botanical illustration drawn with fine lines and a touch of watercolour whimsy, depicting a strange lily crossed with a Venus flytrap, its petals poised as if ready to snap shut on any unsuspecting insects.An illustration inspired by old-world botanical sketches blends a cactus with lilac blooms into a Möbius strip, using detailed lines and subtle watercolor touches to capture nature's diverse beauty and mathematical intrigue.
61
+ An ink sketch style illustration of a small hedgehog holding a piece of watermelon with its tiny paws, taking little bites with its eyes closed in delight.Photo of a lychee-inspired spherical chair, with a bumpy white exterior and plush interior, set against a tropical wallpaper.
62
+ 3d digital art of an adorable ghost, glowing within, holding a heart shaped pumpkin, Halloween, super cute, spooky haunted house background
63
+ professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.
64
+ an astronaut sitting in a diner, eating fries, cinematic, analog film
65
+ Chinese architecture, ancient style,mountain, bird, lotus, pond, big tree, 4K Unity, octane rendering.
66
+ Ethereal fantasy concept art of thunder god with hammer. magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy.
67
+ A Japanese girl walking along a path, surrounding by blooming oriental cherry, pink petal slowly falling down to the ground
68
+ A Ukiyoe style painting, an astronaut riding a unicorn, In the background there is an ancient Japanese architecture
69
+ Steampunk makeup, in the style of vray tracing, colorful impasto, uhd image, indonesian art, fine feather details with bright red and yellow and green and pink and orange colours, intricate patterns and details, dark cyan and amber makeup. Rich colourful plumes. Victorian style.
70
+ A cute teddy bear in front of a plain white wall, warm and brown fur, soft and fluffy
71
+ The beautiful scenery of Seattle, painting by Al Capp.
72
+ Photo of a rhino dressed suit and tie sitting at a table in a bar with a bar stools, award winning photography, Elke vogelsang.
73
+ An astronaut riding a horse on the moon, oil painting by Van Gogh.
74
+ A deep forest clearing with a mirrored pond reflecting a galaxy-filled night sky
75
+ Realistic oil painting of a stunning model merged in multicolor splash made of finely torn paper, eye contact, walking with class in a street.
76
+ a chinese model is sitting on a train, magazine cover, clothes made of plastic, photorealistic,futuristic style, gray and green light, movie lighting, 32K HD
77
+ a handsome 24 years old boy in the middle with sky color background wearing eye glasses, it's super detailed with anime style, it's a portrait with delicated eyes and nice looking face
78
+ a kayak in the water, in the style of optical color mixing, aerial view, rainbowcore, national geographic photo, 8k resolution, crayon art, interactive artwork
79
+ 3D rendering miniature scene design, Many tall buildings, A winding urban road runs through the middle,a lot of cars on the road, transparent material pipeline transports Materials, ,there are many people around, in thestyle of light orange and yellow, graphic design- inspired illustrations, classic still-life, beeple, josan gon-zalez, manga-influenced, miniature dioramas, in thestyle of playful and whimsical designs, graphic de-sign-inspired illustrations, minimalism, hyperrealismlomo lca, e-commerce C4D style, e-commerce posterUl, UX, octane render, blender
80
+ Close-up photos of models, hazy light and shadow, laser metal hair accessories, soft and beautiful, light gold pupils, white eyelashes, low saturation, real skin details, clear pores and fine lines, light reflection and refraction, ultra-clear, cinematography, award-winning works
81
+ A cute orange kitten sliding down an aqua slide. happy excited. 16mm lens in front. we see his excitement and scared in the eye. vibrant colors. water splashing on the lens
82
+ Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.
83
+ A gorgeously rendered papercraft world of a coral reef, rife with colorful fish and sea creatures.
84
+ An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in thought pondering the history of the universe as he sits at a cafe in Paris, his eyes focus on people offscreen as they walk as he sits mostly motionless, he is dressed in a wool coat suit coat with a button-down shirt , he wears a brown beret and glasses and has a very professorial appearance, and the end he offers a subtle closed-mouth smile as if he found the answer to the mystery of life, the lighting is very cinematic with the golden light and the Parisian streets and city in the background, depth of field, cinematic 35mm film.
85
+ A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in.
86
+ A New Zealand female business owner stands and is happy that his business is growing by having good VoIP and broadband supplied by Voyager Internet. This business owner is dressed semi casual and is standing with a funky office space in the background. The image is light and bright and is well lit. This image needs to be shot like a professional photo shoot using a Canon R6 with high quality 25mm lens. This image has a shallow depth of field
87
+ The parametric hotel lobby is a sleek and modern space with plenty of natural light. The lobby is spacious and open with a variety of seating options. The front desk is a sleek white counter with a parametric design. The walls are a light blue color with parametric patterns. The floor is a light wood color with a parametric design. There are plenty of plants and flowers throughout the space. The overall effect is a calm and relaxing space. occlusion, moody, sunset, concept art, octane rendering, 8k, highly detailed, concept art, highly detailed, beautiful scenery, cinematic, beautiful light, hyperreal, octane render, hdr, long exposure, 8K, realistic, fog, moody, fire and explosions, smoke, 50mm f2.8
88
+ Editorial photoshoot of an old woman, high fashion, 2000s fashion
89
+ Mural Painted of Prince in Purple Rain on side of 5 story brick building next to zen garden vacant lot in the urban center district, rgb
90
+ Cozy Scandinavian living room, there is a cat sleeping on the couch, depth of field
91
+ Street style centered straight shot photo shot on Agfa Vista 400, lens 50mm, of two women, skin to skin touch face, emotion, hugging, natural blond hair, natural features, ultra detailed, skin texture, Rembrandt light, soft shadows
92
+ Frog, in forest, colorful, no watermark, no signature, in forest, 8k
93
+ selfie of a woman and her lion cub on the plains
94
+ A fisherman fixing his net sitting on a beautiful tropical beach at sunset with bending palm trees fishing gear and a small boat on shore
95
+ Coast, decorative painting, horizon, modern, fashionable, full of abstract feeling, full of imagination, the picture reveals the sense of time passing, there is a feeling of the end of the world
96
+ A close up of a branch of a tree and a golden bug on the top a leaf, shutterstock contest winner,ecological art, depth of field, shallow depth of field, macro photography
97
+ Outdoor style fashion photo, full – body shot of a man with short brown hair, happy and smiling, he is standing on his hipster bicycle wearing a light blue long sleeved blouse with closed buttons and dark blue jeans trousers, in the background the exterior of an Aldi store, fully lit background, natural afternoon lighting
98
+ beautiful woman sniper, wearing soviet army uniform, one eye on sniper lens, in snow ground
99
+ A very attractive and natural woman, sitting on a yoga mat, breathing, eyes closed, no makeup, intense satisfaction, she looks like she is intensely relaxed, yoga class, sunrise, 35mm
100
+ a close up of a helmet on a person, digital art, inspired by Han Gan, cloisonnism, female, victorian armor, ultramarine, best of behance, anton fadeev 8 k, fined detail, sci-fi character, elegant armor, fantasy art behance
101
+ a melting apple
102
+ yellow FIAT 500 Cinquecento 1957 driving through liechtenstein castle with a lot of banknotes scattered behind ,filled with wads of cash , car color yellow, license plate R-33
103
+ tented resort in the desert, rocky and sandy terrain, 5 star hotel, beautiful landscape, landscape photography, depth of view, Fujifilm GFX 100 –uplight
104
+ Full body shot, a French woman, Photography, French Streets background, backlighting, rim light, Fujifilm.
105
+ Modern luxury contemporary luxury home interiors house, in the style of mimicking ruined materials, ray tracing, haunting houses, and stone, capture the essence of nature, gray and bronze, dynamic outdoor shots.
106
+ Over the shoulder game perspective, game screen of Diablo 4, Inside the gorgeous palace is the wet ground, The necromancer knelt before the king, and a horde of skeletons he summoned stood at his side, cinematic light.
107
+ Color photo of a corgi made of transparent glass, standing on the riverside in Yosemite National Park.
108
+ Happy dreamy owl monster sitting on a tree branch, colorful glittering particles, forest background, detailed feathers.
109
+ Game-Art - An island with different geographical properties and multiple small cities floating in space
110
+ Photorealistic closeup video of two pirate ships battling each other as they sail inside a cup of coffee.
111
+ A car made out of vegetables.
112
+ A serene lakeside during autumn with trees displaying a palette of fiery colors.
113
+ A realistic landscape shot of the Northern Lights dancing over a snowy mountain range in Iceland.
114
+ A deep forest clearing with a mirrored pond reflecting a galaxy-filled night sky.
115
+ Drone view of waves crashing against the rugged cliffs along Big Sur’s Garay Point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore.
116
+ A curvy timber house near a sea, designed by Zaha Hadid, represent the image of a cold, modern architecture, at night, white lighting, highly detailed.
117
+ Eiffel Tower was Made up of more than 2 million translucent straws to look like a cloud, with the bell tower at the top of the building, Michel installed huge foam-making machines in the forest to blow huge amounts of unpredictable wet clouds in the building's classic architecture.
118
+ Close-up photos of models, hazy light and shadow, laser metal hair accessories, soft and beautiful, light gold pupils, white eyelashes, low saturation, real skin details, clear pores and fine lines, light reflection and refraction, ultra-clear, cinematography, award-winning works.
119
+ smiling cartoon dog sits at a table, coffee mug on hand, as a room goes up in flames. "Help" the dog is yelling.
120
+ A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about.
121
+ A close-up photo of a person. The subject is a woman. She wore a blue coat with a gray dress underneath. She has blue eyes and blond hair and wears a pair of earrings. Behind are blurred city buildings and streets.
122
+ 👧 with 🌹 in the ❄️
123
+ 🐶 Wearing 🕶 flying on the 🌈
124
+ a cyberpunk cat with a neon sign that says "MIT"
125
+ a black and white picture of a woman looking through the window, in the style of Duffy Sheridan, Anna Razumovskaya, smooth and shiny, wavy, Patrick Demarchelier, album covers, lush and detailed.
asset/samples_mini.txt ADDED
@@ -0,0 +1,10 @@
1
+ A cyberpunk cat with a neon sign that says 'Sana'.
2
+ A small cactus with a happy face in the Sahara desert.
3
+ The towel was on top of the hard counter.
4
+ A vast landscape made entirely of various meats spreads out before the viewer. tender, succulent hills of roast beef, chicken drumstick trees, bacon rivers, and ham boulders create a surreal, yet appetizing scene. the sky is adorned with pepperoni sun and salami clouds.
5
+ I want to supplement vitamin c, please help me paint related food.
6
+ A transparent sculpture of a duck made out of glass. The sculpture is in front of a painting of a landscape.
7
+ an old rusted robot wearing pants and a jacket riding skis in a supermarket.
8
+ professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.
9
+ Astronaut in a jungle, cold color palette, muted colors, detailed
10
+ a stunning and luxurious bedroom carved into a rocky mountainside seamlessly blending nature with modern design with a plush earth-toned bed textured stone walls circular fireplace massive uniquely shaped window framing snow-capped mountains dense forests.
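The prompt lists above (asset/samples.txt and asset/samples_mini.txt) are plain newline-separated text files, one prompt per line. As a minimal, hypothetical sketch (the file path and helper name below are illustrative assumptions, not repository code), they can be consumed like this:

```python
# Minimal sketch: read a newline-separated prompt file such as asset/samples_mini.txt.
from pathlib import Path


def load_prompts(path):
    """Return the non-empty, stripped lines of a prompt list file."""
    text = Path(path).read_text(encoding="utf-8")
    return [line.strip() for line in text.splitlines() if line.strip()]


if __name__ == "__main__":
    prompts = load_prompts("asset/samples_mini.txt")
    for i, prompt in enumerate(prompts, start=1):
        print(f"{i:02d}: {prompt[:60]}")
```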
configs/sana_app_config/Sana_1600M_app.yaml ADDED
@@ -0,0 +1,107 @@
1
+ data:
2
+ data_dir: []
3
+ image_size: 1024
4
+ caption_proportion:
5
+ prompt: 1
6
+ external_caption_suffixes: []
7
+ external_clipscore_suffixes: []
8
+ clip_thr_temperature: 0.1
9
+ clip_thr: 25.0
10
+ load_text_feat: false
11
+ load_vae_feat: false
12
+ transform: default_train
13
+ type: SanaWebDatasetMS
14
+ data:
15
+ sort_dataset: false
16
+ # model config
17
+ model:
18
+ model: SanaMS_1600M_P1_D20
19
+ image_size: 1024
20
+ mixed_precision: fp16 # ['fp16', 'fp32', 'bf16']
21
+ fp32_attention: true
22
+ load_from:
23
+ resume_from:
24
+ aspect_ratio_type: ASPECT_RATIO_1024
25
+ multi_scale: true
26
+ #pe_interpolation: 1.
27
+ attn_type: linear
28
+ ffn_type: glumbconv
29
+ mlp_acts:
30
+ - silu
31
+ - silu
32
+ -
33
+ mlp_ratio: 2.5
34
+ use_pe: false
35
+ qk_norm: false
36
+ class_dropout_prob: 0.1
37
+ # CFG & PAG settings
38
+ pag_applied_layers:
39
+ - 8
40
+ # VAE setting
41
+ vae:
42
+ vae_type: dc-ae
43
+ vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
44
+ scale_factor: 0.41407
45
+ vae_latent_dim: 32
46
+ vae_downsample_rate: 32
47
+ sample_posterior: true
48
+ # text encoder
49
+ text_encoder:
50
+ text_encoder_name: gemma-2-2b-it
51
+ y_norm: true
52
+ y_norm_scale_factor: 0.01
53
+ model_max_length: 300
54
+ # CHI
55
+ chi_prompt:
56
+ - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
57
+ - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
58
+ - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
59
+ - 'Here are examples of how to transform or refine prompts:'
60
+ - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
61
+ - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
62
+ - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
63
+ - 'User Prompt: '
64
+ # Sana schedule Flow
65
+ scheduler:
66
+ predict_v: true
67
+ noise_schedule: linear_flow
68
+ pred_sigma: false
69
+ flow_shift: 3.0
70
+ # logit-normal timestep
71
+ weighting_scheme: logit_normal
72
+ logit_mean: 0.0
73
+ logit_std: 1.0
74
+ vis_sampler: flow_dpm-solver
75
+ # training setting
76
+ train:
77
+ num_workers: 10
78
+ seed: 1
79
+ train_batch_size: 64
80
+ num_epochs: 100
81
+ gradient_accumulation_steps: 1
82
+ grad_checkpointing: true
83
+ gradient_clip: 0.1
84
+ optimizer:
85
+ betas:
86
+ - 0.9
87
+ - 0.999
88
+ - 0.9999
89
+ eps:
90
+ - 1.0e-30
91
+ - 1.0e-16
92
+ lr: 0.0001
93
+ type: CAMEWrapper
94
+ weight_decay: 0.0
95
+ lr_schedule: constant
96
+ lr_schedule_args:
97
+ num_warmup_steps: 2000
98
+ local_save_vis: true # if save log image locally
99
+ visualize: true
100
+ eval_sampling_steps: 500
101
+ log_interval: 20
102
+ save_model_epochs: 5
103
+ save_model_steps: 500
104
+ work_dir: output/debug
105
+ online_metric: false
106
+ eval_metric_step: 2000
107
+ online_metric_dir: metric_helper
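For orientation, a config such as configs/sana_app_config/Sana_1600M_app.yaml above can be inspected with PyYAML. This is only an illustrative sketch that assumes the section layout suggested by the comments (data, model, vae, text_encoder, scheduler, train as top-level keys; the web diff view drops indentation), not the repository's own config loader:

```python
# Illustrative sketch: inspect a Sana YAML config with PyYAML.
# The top-level section names are assumed from the comments in the file above.
import yaml

with open("configs/sana_app_config/Sana_1600M_app.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["model"]["model"])             # SanaMS_1600M_P1_D20
print(cfg["model"]["mixed_precision"])   # fp16
print(cfg["vae"]["vae_pretrained"])      # mit-han-lab/dc-ae-f32c32-sana-1.0
print(cfg["scheduler"]["flow_shift"])    # 3.0
print(cfg["train"]["train_batch_size"])  # 64
```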
configs/sana_app_config/Sana_600M_app.yaml ADDED
@@ -0,0 +1,105 @@
1
+ data:
2
+ data_dir: []
3
+ image_size: 1024
4
+ caption_proportion:
5
+ prompt: 1
6
+ external_caption_suffixes: []
7
+ external_clipscore_suffixes: []
8
+ clip_thr_temperature: 0.1
9
+ clip_thr: 25.0
10
+ load_text_feat: false
11
+ load_vae_feat: true
12
+ transform: default_train
13
+ type: SanaWebDatasetMS
14
+ sort_dataset: false
15
+ # model config
16
+ model:
17
+ model: SanaMS_600M_P1_D28
18
+ image_size: 1024
19
+ mixed_precision: fp16 # ['fp16', 'fp32', 'bf16']
20
+ fp32_attention: true
21
+ load_from:
22
+ resume_from:
23
+ aspect_ratio_type: ASPECT_RATIO_1024
24
+ multi_scale: true
25
+ attn_type: linear
26
+ ffn_type: glumbconv
27
+ mlp_acts:
28
+ - silu
29
+ - silu
30
+ -
31
+ mlp_ratio: 2.5
32
+ use_pe: false
33
+ qk_norm: false
34
+ class_dropout_prob: 0.1
35
+ # CFG & PAG settings
36
+ pag_applied_layers:
37
+ - 14
38
+ # VAE setting
39
+ vae:
40
+ vae_type: dc-ae
41
+ vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
42
+ scale_factor: 0.41407
43
+ vae_latent_dim: 32
44
+ vae_downsample_rate: 32
45
+ sample_posterior: true
46
+ # text encoder
47
+ text_encoder:
48
+ text_encoder_name: gemma-2-2b-it
49
+ y_norm: true
50
+ y_norm_scale_factor: 0.01
51
+ model_max_length: 300
52
+ # CHI
53
+ chi_prompt:
54
+ - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
55
+ - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
56
+ - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
57
+ - 'Here are examples of how to transform or refine prompts:'
58
+ - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
59
+ - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
60
+ - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
61
+ - 'User Prompt: '
62
+ # Sana schedule Flow
63
+ scheduler:
64
+ predict_v: true
65
+ noise_schedule: linear_flow
66
+ pred_sigma: false
67
+ flow_shift: 4.0
68
+ # logit-normal timestep
69
+ weighting_scheme: logit_normal
70
+ logit_mean: 0.0
71
+ logit_std: 1.0
72
+ vis_sampler: flow_dpm-solver
73
+ # training setting
74
+ train:
75
+ num_workers: 10
76
+ seed: 1
77
+ train_batch_size: 64
78
+ num_epochs: 100
79
+ gradient_accumulation_steps: 1
80
+ grad_checkpointing: true
81
+ gradient_clip: 0.1
82
+ optimizer:
83
+ betas:
84
+ - 0.9
85
+ - 0.999
86
+ - 0.9999
87
+ eps:
88
+ - 1.0e-30
89
+ - 1.0e-16
90
+ lr: 0.0001
91
+ type: CAMEWrapper
92
+ weight_decay: 0.0
93
+ lr_schedule: constant
94
+ lr_schedule_args:
95
+ num_warmup_steps: 2000
96
+ local_save_vis: true # if save log image locally
97
+ visualize: true
98
+ eval_sampling_steps: 500
99
+ log_interval: 20
100
+ save_model_epochs: 5
101
+ save_model_steps: 500
102
+ work_dir: output/debug
103
+ online_metric: false
104
+ eval_metric_step: 2000
105
+ online_metric_dir: metric_helper
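Both app configs carry the same chi_prompt block: a list of instruction lines that ends with 'User Prompt: '. A plausible way to combine it with a user prompt is sketched below; the join convention and function name are assumptions for illustration, and the authoritative logic lives in the pipeline code shipped in this commit:

```python
# Hedged sketch: prepend the chi_prompt instruction lines to a user prompt
# before text encoding. The newline join here is an assumption for illustration.
chi_prompt = [
    'Given a user prompt, generate an "Enhanced prompt" that provides detailed '
    "visual descriptions suitable for image generation...",
    "Please generate only the enhanced description for the prompt below and avoid "
    "including any additional commentary or evaluations:",
    "User Prompt: ",
]


def apply_chi_prompt(user_prompt):
    # The final list entry already ends with "User Prompt: ", so the raw prompt
    # is appended directly after it.
    return "\n".join(chi_prompt) + user_prompt


print(apply_chi_prompt("A cyberpunk cat with a neon sign that says 'Sana'."))
```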
configs/sana_base.yaml ADDED
@@ -0,0 +1,140 @@
1
+ # data settings
2
+ data:
3
+ data_dir: []
4
+ caption_proportion:
5
+ prompt: 1
6
+ external_caption_suffixes: []
7
+ external_clipscore_suffixes: []
8
+ clip_thr_temperature: 1.0
9
+ clip_thr: 0.0
10
+ sort_dataset: false
11
+ load_text_feat: false
12
+ load_vae_feat: false
13
+ transform: default_train
14
+ type: SanaWebDatasetMS
15
+ image_size: 512
16
+ hq_only: false
17
+ valid_num: 0
18
+ # model settings
19
+ model:
20
+ model: SanaMS_600M_P1_D28
21
+ image_size: 512
22
+ mixed_precision: fp16 # ['fp16', 'fp32', 'bf16']
23
+ fp32_attention: true
24
+ load_from:
25
+ resume_from:
26
+ checkpoint:
27
+ load_ema: false
28
+ resume_lr_scheduler: true
29
+ resume_optimizer: true
30
+ aspect_ratio_type: ASPECT_RATIO_1024
31
+ multi_scale: true
32
+ pe_interpolation: 1.0
33
+ micro_condition: false
34
+ attn_type: linear # 'flash', 'linear', 'vanilla', 'triton_linear'
35
+ cross_norm: false
36
+ autocast_linear_attn: false
37
+ ffn_type: glumbconv
38
+ mlp_acts:
39
+ - silu
40
+ - silu
41
+ -
42
+ mlp_ratio: 2.5
43
+ use_pe: false
44
+ qk_norm: false
45
+ class_dropout_prob: 0.0
46
+ linear_head_dim: 32
47
+ # CFG & PAG settings
48
+ cfg_scale: 4
49
+ guidance_type: classifier-free
50
+ pag_applied_layers: [14]
51
+ # text encoder settings
52
+ text_encoder:
53
+ text_encoder_name: gemma-2-2b-it
54
+ caption_channels: 2304
55
+ y_norm: false
56
+ y_norm_scale_factor: 1.0
57
+ model_max_length: 300
58
+ chi_prompt: []
59
+ # VAE settings
60
+ vae:
61
+ vae_type: dc-ae
62
+ vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
63
+ scale_factor: 0.41407
64
+ vae_latent_dim: 32
65
+ vae_downsample_rate: 32
66
+ sample_posterior: true
67
+ # Scheduler settings
68
+ scheduler:
69
+ train_sampling_steps: 1000
70
+ predict_v: True
71
+ noise_schedule: linear_flow
72
+ pred_sigma: false
73
+ flow_shift: 1.0
74
+ weighting_scheme: logit_normal
75
+ logit_mean: 0.0
76
+ logit_std: 1.0
77
+ vis_sampler: flow_dpm-solver
78
+ # training settings
79
+ train:
80
+ num_workers: 4
81
+ seed: 43
82
+ train_batch_size: 32
83
+ num_epochs: 100
84
+ gradient_accumulation_steps: 1
85
+ grad_checkpointing: false
86
+ gradient_clip: 1.0
87
+ gc_step: 1
88
+ # optimizer settings
89
+ optimizer:
90
+ eps: 1.0e-10
91
+ lr: 0.0001
92
+ type: AdamW
93
+ weight_decay: 0.03
94
+ lr_schedule: constant
95
+ lr_schedule_args:
96
+ num_warmup_steps: 500
97
+ auto_lr:
98
+ rule: sqrt
99
+ ema_rate: 0.9999
100
+ eval_batch_size: 16
101
+ use_fsdp: false
102
+ use_flash_attn: false
103
+ eval_sampling_steps: 250
104
+ lora_rank: 4
105
+ log_interval: 50
106
+ mask_type: 'null'
107
+ mask_loss_coef: 0.0
108
+ load_mask_index: false
109
+ snr_loss: false
110
+ real_prompt_ratio: 1.0
111
+ debug_nan: false
112
+ # checkpoint settings
113
+ save_image_epochs: 1
114
+ save_model_epochs: 1
115
+ save_model_steps: 1000000
116
+ # visualization settings
117
+ visualize: false
118
+ null_embed_root: output/pretrained_models/
119
+ valid_prompt_embed_root: output/tmp_embed/
120
+ validation_prompts:
121
+ - dog
122
+ - portrait photo of a girl, photograph, highly detailed face, depth of field
123
+ - Self-portrait oil painting, a beautiful cyborg with golden hair, 8k
124
+ - Astronaut in a jungle, cold color palette, muted colors, detailed, 8k
125
+ - A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece
126
+ local_save_vis: false
127
+ deterministic_validation: true
128
+ online_metric: false
129
+ eval_metric_step: 5000
130
+ online_metric_dir: metric_helper
131
+ # work dir settings
132
+ work_dir: /cache/exps/
133
+ skip_step: 0
134
+ # LCM settings
135
+ loss_type: huber
136
+ huber_c: 0.001
137
+ num_ddim_timesteps: 50
138
+ w_max: 15.0
139
+ w_min: 3.0
140
+ ema_decay: 0.95
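configs/sana_base.yaml sets guidance_type: classifier-free with cfg_scale: 4. For reference, the standard classifier-free guidance combination is sketched below; this is a generic formulation for illustration, not the repository's sampler code:

```python
# Generic classifier-free guidance sketch (illustration only).
import torch


def apply_cfg(pred_cond, pred_uncond, cfg_scale=4.0):
    """Push the conditional prediction away from the unconditional one by cfg_scale."""
    return pred_uncond + cfg_scale * (pred_cond - pred_uncond)


# Toy usage with random tensors standing in for the two model outputs.
cond = torch.randn(1, 32, 16, 16)
uncond = torch.randn(1, 32, 16, 16)
print(apply_cfg(cond, uncond, cfg_scale=4.0).shape)
```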
configs/sana_config/1024ms/Sana_1600M_img1024.yaml ADDED
@@ -0,0 +1,109 @@
1
+ data:
2
+ data_dir: [data/data_public/dir1]
3
+ image_size: 1024
4
+ caption_proportion:
5
+ prompt: 1
6
+ external_caption_suffixes: ['', _InternVL2-26B, _VILA1-5-13B]
7
+ external_clipscore_suffixes:
8
+ - _InternVL2-26B_clip_score
9
+ - _VILA1-5-13B_clip_score
10
+ - _prompt_clip_score
11
+ clip_thr_temperature: 0.1
12
+ clip_thr: 25.0
13
+ load_text_feat: false
14
+ load_vae_feat: false
15
+ transform: default_train
16
+ type: SanaWebDatasetMS
17
+ sort_dataset: false
18
+ # model config
19
+ model:
20
+ model: SanaMS_1600M_P1_D20
21
+ image_size: 1024
22
+ mixed_precision: fp16 # ['fp16', 'fp32', 'bf16']
23
+ fp32_attention: true
24
+ load_from:
25
+ resume_from:
26
+ aspect_ratio_type: ASPECT_RATIO_1024
27
+ multi_scale: true
28
+ #pe_interpolation: 1.
29
+ attn_type: linear
30
+ ffn_type: glumbconv
31
+ mlp_acts:
32
+ - silu
33
+ - silu
34
+ -
35
+ mlp_ratio: 2.5
36
+ use_pe: false
37
+ qk_norm: false
38
+ class_dropout_prob: 0.1
39
+ # PAG
40
+ pag_applied_layers:
41
+ - 8
42
+ # VAE setting
43
+ vae:
44
+ vae_type: dc-ae
45
+ vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
46
+ scale_factor: 0.41407
47
+ vae_latent_dim: 32
48
+ vae_downsample_rate: 32
49
+ sample_posterior: true
50
+ # text encoder
51
+ text_encoder:
52
+ text_encoder_name: gemma-2-2b-it
53
+ y_norm: true
54
+ y_norm_scale_factor: 0.01
55
+ model_max_length: 300
56
+ # CHI
57
+ chi_prompt:
58
+ - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
59
+ - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
60
+ - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
61
+ - 'Here are examples of how to transform or refine prompts:'
62
+ - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
63
+ - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
64
+ - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
65
+ - 'User Prompt: '
66
+ # Sana schedule Flow
67
+ scheduler:
68
+ predict_v: true
69
+ noise_schedule: linear_flow
70
+ pred_sigma: false
71
+ flow_shift: 3.0
72
+ # logit-normal timestep
73
+ weighting_scheme: logit_normal
74
+ logit_mean: 0.0
75
+ logit_std: 1.0
76
+ vis_sampler: flow_dpm-solver
77
+ # training setting
78
+ train:
79
+ num_workers: 10
80
+ seed: 1
81
+ train_batch_size: 64
82
+ num_epochs: 100
83
+ gradient_accumulation_steps: 1
84
+ grad_checkpointing: true
85
+ gradient_clip: 0.1
86
+ optimizer:
87
+ betas:
88
+ - 0.9
89
+ - 0.999
90
+ - 0.9999
91
+ eps:
92
+ - 1.0e-30
93
+ - 1.0e-16
94
+ lr: 0.0001
95
+ type: CAMEWrapper
96
+ weight_decay: 0.0
97
+ lr_schedule: constant
98
+ lr_schedule_args:
99
+ num_warmup_steps: 2000
100
+ local_save_vis: true # if save log image locally
101
+ visualize: true
102
+ eval_sampling_steps: 500
103
+ log_interval: 20
104
+ save_model_epochs: 5
105
+ save_model_steps: 500
106
+ work_dir: output/debug
107
+ online_metric: false
108
+ eval_metric_step: 2000
109
+ online_metric_dir: metric_helper
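The scheduler block above selects weighting_scheme: logit_normal with logit_mean 0.0 and logit_std 1.0. One common formulation of logit-normal timestep sampling draws a normal variate and squashes it through a sigmoid so that t lies in (0, 1) and concentrates around 0.5; the sketch below illustrates that idea and is not the training code itself:

```python
# Hedged sketch of logit-normal timestep sampling (illustration only).
import torch


def sample_logit_normal_t(batch_size, logit_mean=0.0, logit_std=1.0):
    u = torch.randn(batch_size) * logit_std + logit_mean
    return torch.sigmoid(u)  # timesteps in (0, 1), concentrated around 0.5


print(sample_logit_normal_t(4))
```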
configs/sana_config/1024ms/Sana_600M_img1024.yaml ADDED
@@ -0,0 +1,105 @@
1
+ data:
2
+ data_dir: [data/data_public/dir1]
3
+ image_size: 1024
4
+ caption_proportion:
5
+ prompt: 1
6
+ external_caption_suffixes: ['', _InternVL2-26B, _VILA1-5-13B]
7
+ external_clipscore_suffixes:
8
+ - _InternVL2-26B_clip_score
9
+ - _VILA1-5-13B_clip_score
10
+ - _prompt_clip_score
11
+ clip_thr_temperature: 0.1
12
+ clip_thr: 25.0
13
+ load_text_feat: false
14
+ load_vae_feat: false
15
+ transform: default_train
16
+ type: SanaWebDatasetMS
17
+ sort_dataset: false
18
+ # model config
19
+ model:
20
+ model: SanaMS_600M_P1_D28
21
+ image_size: 1024
22
+ mixed_precision: fp16
23
+ fp32_attention: true
24
+ load_from:
25
+ resume_from:
26
+ aspect_ratio_type: ASPECT_RATIO_1024
27
+ multi_scale: true
28
+ attn_type: linear
29
+ ffn_type: glumbconv
30
+ mlp_acts:
31
+ - silu
32
+ - silu
33
+ -
34
+ mlp_ratio: 2.5
35
+ use_pe: false
36
+ qk_norm: false
37
+ class_dropout_prob: 0.1
38
+ # VAE setting
39
+ vae:
40
+ vae_type: dc-ae
41
+ vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
42
+ scale_factor: 0.41407
43
+ vae_latent_dim: 32
44
+ vae_downsample_rate: 32
45
+ sample_posterior: true
46
+ # text encoder
47
+ text_encoder:
48
+ text_encoder_name: gemma-2-2b-it
49
+ y_norm: true
50
+ y_norm_scale_factor: 0.01
51
+ model_max_length: 300
52
+ # CHI
53
+ chi_prompt:
54
+ - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
55
+ - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
56
+ - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
57
+ - 'Here are examples of how to transform or refine prompts:'
58
+ - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
59
+ - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
60
+ - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
61
+ - 'User Prompt: '
62
+ # Sana schedule Flow
63
+ scheduler:
64
+ predict_v: true
65
+ noise_schedule: linear_flow
66
+ pred_sigma: false
67
+ flow_shift: 4.0
68
+ # logit-normal timestep
69
+ weighting_scheme: logit_normal
70
+ logit_mean: 0.0
71
+ logit_std: 1.0
72
+ vis_sampler: flow_dpm-solver
73
+ # training setting
74
+ train:
75
+ num_workers: 10
76
+ seed: 1
77
+ train_batch_size: 64
78
+ num_epochs: 100
79
+ gradient_accumulation_steps: 1
80
+ grad_checkpointing: true
81
+ gradient_clip: 0.1
82
+ optimizer:
83
+ betas:
84
+ - 0.9
85
+ - 0.999
86
+ - 0.9999
87
+ eps:
88
+ - 1.0e-30
89
+ - 1.0e-16
90
+ lr: 0.0001
91
+ type: CAMEWrapper
92
+ weight_decay: 0.0
93
+ lr_schedule: constant
94
+ lr_schedule_args:
95
+ num_warmup_steps: 2000
96
+ local_save_vis: true # if save log image locally
97
+ visualize: true
98
+ eval_sampling_steps: 500
99
+ log_interval: 20
100
+ save_model_epochs: 5
101
+ save_model_steps: 500
102
+ work_dir: output/debug
103
+ online_metric: false
104
+ eval_metric_step: 2000
105
+ online_metric_dir: metric_helper
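Note that the 1024px configs use flow_shift values of 3.0 and 4.0, while the 512px configs below use 1.0 (no shift). One common formulation of such a timestep shift, shown here purely for illustration and not necessarily the exact code path used by this repository, remaps a noise level toward higher noise as the shift grows:

```python
# Hedged sketch of a flow-matching timestep shift (illustration only).
def shift_sigma(sigma, shift):
    """Remap a noise level sigma in [0, 1]; shift == 1.0 leaves it unchanged."""
    return shift * sigma / (1.0 + (shift - 1.0) * sigma)


for s in (0.25, 0.5, 0.75):
    print(s, "->", round(shift_sigma(s, shift=3.0), 3))
```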
configs/sana_config/512ms/Sana_1600M_img512.yaml ADDED
@@ -0,0 +1,108 @@
1
+ data:
2
+ data_dir: [data/data_public/dir1]
3
+ image_size: 512
4
+ caption_proportion:
5
+ prompt: 1
6
+ external_caption_suffixes: ['', _InternVL2-26B, _VILA1-5-13B]
7
+ external_clipscore_suffixes:
8
+ - _InternVL2-26B_clip_score
9
+ - _VILA1-5-13B_clip_score
10
+ - _prompt_clip_score
11
+ clip_thr_temperature: 0.1
12
+ clip_thr: 25.0
13
+ load_text_feat: false
14
+ load_vae_feat: false
15
+ transform: default_train
16
+ type: SanaWebDatasetMS
17
+ sort_dataset: false
18
+ # model config
19
+ model:
20
+ model: SanaMS_1600M_P1_D20
21
+ image_size: 512
22
+ mixed_precision: fp16 # ['fp16', 'fp32', 'bf16']
23
+ fp32_attention: true
24
+ load_from:
25
+ resume_from:
26
+ aspect_ratio_type: ASPECT_RATIO_512
27
+ multi_scale: true
28
+ attn_type: linear
29
+ ffn_type: glumbconv
30
+ mlp_acts:
31
+ - silu
32
+ - silu
33
+ -
34
+ mlp_ratio: 2.5
35
+ use_pe: false
36
+ qk_norm: false
37
+ class_dropout_prob: 0.1
38
+ # PAG
39
+ pag_applied_layers:
40
+ - 8
41
+ # VAE setting
42
+ vae:
43
+ vae_type: dc-ae
44
+ vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
45
+ scale_factor: 0.41407
46
+ vae_latent_dim: 32
47
+ vae_downsample_rate: 32
48
+ sample_posterior: true
49
+ # text encoder
50
+ text_encoder:
51
+ text_encoder_name: gemma-2-2b-it
52
+ y_norm: true
53
+ y_norm_scale_factor: 0.01
54
+ model_max_length: 300
55
+ # CHI
56
+ chi_prompt:
57
+ - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
58
+ - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
59
+ - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
60
+ - 'Here are examples of how to transform or refine prompts:'
61
+ - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
62
+ - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
63
+ - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
64
+ - 'User Prompt: '
65
+ # Sana schedule Flow
66
+ scheduler:
67
+ predict_v: true
68
+ noise_schedule: linear_flow
69
+ pred_sigma: false
70
+ flow_shift: 1.0
71
+ # logit-normal timestep
72
+ weighting_scheme: logit_normal
73
+ logit_mean: 0.0
74
+ logit_std: 1.0
75
+ vis_sampler: flow_dpm-solver
76
+ # training setting
77
+ train:
78
+ num_workers: 10
79
+ seed: 1
80
+ train_batch_size: 64
81
+ num_epochs: 100
82
+ gradient_accumulation_steps: 1
83
+ grad_checkpointing: true
84
+ gradient_clip: 0.1
85
+ optimizer:
86
+ betas:
87
+ - 0.9
88
+ - 0.999
89
+ - 0.9999
90
+ eps:
91
+ - 1.0e-30
92
+ - 1.0e-16
93
+ lr: 0.0001
94
+ type: CAMEWrapper
95
+ weight_decay: 0.0
96
+ lr_schedule: constant
97
+ lr_schedule_args:
98
+ num_warmup_steps: 2000
99
+ local_save_vis: true # if save log image locally
100
+ visualize: true
101
+ eval_sampling_steps: 500
102
+ log_interval: 20
103
+ save_model_epochs: 5
104
+ save_model_steps: 500
105
+ work_dir: output/debug
106
+ online_metric: false
107
+ eval_metric_step: 2000
108
+ online_metric_dir: metric_helper
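The VAE block is identical across these configs: a DC-AE with a 32x spatial downsample, 32 latent channels, and scale_factor 0.41407. The arithmetic below illustrates the resulting latent geometry; the convention of multiplying encoded latents by scale_factor is an assumption here, as is common for latent diffusion models:

```python
# Latent-geometry arithmetic for the DC-AE settings above (illustration only).
image_size = 512
vae_downsample_rate = 32
vae_latent_dim = 32
scale_factor = 0.41407

latent_hw = image_size // vae_downsample_rate
print(f"latent tensor: {vae_latent_dim} x {latent_hw} x {latent_hw}")  # 32 x 16 x 16
# Assumed convention: z = vae.encode(img) * scale_factor before the diffusion model.
```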
configs/sana_config/512ms/Sana_600M_img512.yaml ADDED
@@ -0,0 +1,107 @@
1
+ data:
2
+ data_dir: [data/data_public/dir1]
3
+ image_size: 512
4
+ caption_proportion:
5
+ prompt: 1
6
+ external_caption_suffixes: ['', _InternVL2-26B, _VILA1-5-13B]
7
+ external_clipscore_suffixes:
8
+ - _InternVL2-26B_clip_score
9
+ - _VILA1-5-13B_clip_score
10
+ - _prompt_clip_score
11
+ clip_thr_temperature: 0.1
12
+ clip_thr: 25.0
13
+ load_text_feat: false
14
+ load_vae_feat: false
15
+ transform: default_train
16
+ type: SanaWebDatasetMS
17
+ sort_dataset: false
18
+ # model config
19
+ model:
20
+ model: SanaMS_600M_P1_D28
21
+ image_size: 512
22
+ mixed_precision: fp16
23
+ fp32_attention: true
24
+ load_from:
25
+ resume_from:
26
+ aspect_ratio_type: ASPECT_RATIO_512
27
+ multi_scale: true
28
+ #pe_interpolation: 1.
29
+ attn_type: linear
30
+ linear_head_dim: 32
31
+ ffn_type: glumbconv
32
+ mlp_acts:
33
+ - silu
34
+ - silu
35
+ - null
36
+ mlp_ratio: 2.5
37
+ use_pe: false
38
+ qk_norm: false
39
+ class_dropout_prob: 0.1
40
+ # VAE setting
41
+ vae:
42
+ vae_type: dc-ae
43
+ vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
44
+ scale_factor: 0.41407
45
+ vae_latent_dim: 32
46
+ vae_downsample_rate: 32
47
+ sample_posterior: true
48
+ # text encoder
49
+ text_encoder:
50
+ text_encoder_name: gemma-2-2b-it
51
+ y_norm: true
52
+ y_norm_scale_factor: 0.01
53
+ model_max_length: 300
54
+ # CHI
55
+ chi_prompt:
56
+ - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
57
+ - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
58
+ - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
59
+ - 'Here are examples of how to transform or refine prompts:'
60
+ - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
61
+ - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
62
+ - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
63
+ - 'User Prompt: '
64
+ # Sana schedule Flow
65
+ scheduler:
66
+ predict_v: true
67
+ noise_schedule: linear_flow
68
+ pred_sigma: false
69
+ flow_shift: 1.0
70
+ # logit-normal timestep
71
+ weighting_scheme: logit_normal
72
+ logit_mean: 0.0
73
+ logit_std: 1.0
74
+ vis_sampler: flow_dpm-solver
75
+ # training setting
76
+ train:
77
+ num_workers: 10
78
+ seed: 1
79
+ train_batch_size: 128
80
+ num_epochs: 100
81
+ gradient_accumulation_steps: 1
82
+ grad_checkpointing: true
83
+ gradient_clip: 0.1
84
+ optimizer:
85
+ betas:
86
+ - 0.9
87
+ - 0.999
88
+ - 0.9999
89
+ eps:
90
+ - 1.0e-30
91
+ - 1.0e-16
92
+ lr: 0.0001
93
+ type: CAMEWrapper
94
+ weight_decay: 0.0
95
+ lr_schedule: constant
96
+ lr_schedule_args:
97
+ num_warmup_steps: 2000
98
+ local_save_vis: true # if save log image locally
99
+ visualize: true
100
+ eval_sampling_steps: 500
101
+ log_interval: 20
102
+ save_model_epochs: 5
103
+ save_model_steps: 500
104
+ work_dir: output/debug
105
+ online_metric: false
106
+ eval_metric_step: 2000
107
+ online_metric_dir: metric_helper
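The train blocks use a per-GPU train_batch_size (64 or 128 here) with gradient_accumulation_steps: 1, so the effective global batch size depends on the number of GPUs. A quick arithmetic sketch, with the GPU count as an illustrative assumption rather than a value from the configs:

```python
# Effective global batch size under assumed hardware (illustration only).
train_batch_size = 128           # per GPU, from the 600M 512px config above
gradient_accumulation_steps = 1  # from the config
num_gpus = 8                     # illustrative assumption, not from the config

print(train_batch_size * gradient_accumulation_steps * num_gpus)  # 1024 here
```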
configs/sana_config/512ms/ci_Sana_600M_img512.yaml ADDED
@@ -0,0 +1,107 @@
1
+ data:
2
+ data_dir: [data/data_public/vaef32c32_v2_512/dir1]
3
+ image_size: 512
4
+ caption_proportion:
5
+ prompt: 1
6
+ external_caption_suffixes: ['', _InternVL2-26B, _VILA1-5-13B]
7
+ external_clipscore_suffixes:
8
+ - _InternVL2-26B_clip_score
9
+ - _VILA1-5-13B_clip_score
10
+ - _prompt_clip_score
11
+ clip_thr_temperature: 0.1
12
+ clip_thr: 25.0
13
+ load_text_feat: false
14
+ load_vae_feat: false
15
+ transform: default_train
16
+ type: SanaWebDatasetMS
17
+ sort_dataset: false
18
+ # model config
19
+ model:
20
+ model: SanaMS_600M_P1_D28
21
+ image_size: 512
22
+ mixed_precision: fp16
23
+ fp32_attention: true
24
+ load_from:
25
+ resume_from:
26
+ aspect_ratio_type: ASPECT_RATIO_512
27
+ multi_scale: true
28
+ #pe_interpolation: 1.
29
+ attn_type: linear
30
+ linear_head_dim: 32
31
+ ffn_type: glumbconv
32
+ mlp_acts:
33
+ - silu
34
+ - silu
35
+ - null
36
+ mlp_ratio: 2.5
37
+ use_pe: false
38
+ qk_norm: false
39
+ class_dropout_prob: 0.1
40
+ # VAE setting
41
+ vae:
42
+ vae_type: dc-ae
43
+ vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
44
+ scale_factor: 0.41407
45
+ vae_latent_dim: 32
46
+ vae_downsample_rate: 32
47
+ sample_posterior: true
48
+ # text encoder
49
+ text_encoder:
50
+ text_encoder_name: gemma-2-2b-it
51
+ y_norm: true
52
+ y_norm_scale_factor: 0.01
53
+ model_max_length: 300
54
+ # CHI
55
+ chi_prompt:
56
+ - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
57
+ - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
58
+ - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
59
+ - 'Here are examples of how to transform or refine prompts:'
60
+ - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
61
+ - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
62
+ - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
63
+ - 'User Prompt: '
64
+ # Sana schedule Flow
65
+ scheduler:
66
+ predict_v: true
67
+ noise_schedule: linear_flow
68
+ pred_sigma: false
69
+ flow_shift: 1.0
70
+ # logit-normal timestep
71
+ weighting_scheme: logit_normal
72
+ logit_mean: 0.0
73
+ logit_std: 1.0
74
+ vis_sampler: flow_dpm-solver
75
+ # training setting
76
+ train:
77
+ num_workers: 10
78
+ seed: 1
79
+ train_batch_size: 64
80
+ num_epochs: 1
81
+ gradient_accumulation_steps: 1
82
+ grad_checkpointing: true
83
+ gradient_clip: 0.1
84
+ optimizer:
85
+ betas:
86
+ - 0.9
87
+ - 0.999
88
+ - 0.9999
89
+ eps:
90
+ - 1.0e-30
91
+ - 1.0e-16
92
+ lr: 0.0001
93
+ type: CAMEWrapper
94
+ weight_decay: 0.0
95
+ lr_schedule: constant
96
+ lr_schedule_args:
97
+ num_warmup_steps: 2000
98
+ local_save_vis: true # if save log image locally
99
+ visualize: true
100
+ eval_sampling_steps: 500
101
+ log_interval: 20
102
+ save_model_epochs: 5
103
+ save_model_steps: 500
104
+ work_dir: output/debug
105
+ online_metric: false
106
+ eval_metric_step: 2000
107
+ online_metric_dir: metric_helper
configs/sana_config/512ms/sample_dataset.yaml ADDED
@@ -0,0 +1,107 @@
1
+ data:
2
+ data_dir: [asset/example_data]
3
+ image_size: 512
4
+ caption_proportion:
5
+ prompt: 1
6
+ external_caption_suffixes: ['', _InternVL2-26B, _VILA1-5-13B] # json files
7
+ external_clipscore_suffixes: # json files
8
+ - _InternVL2-26B_clip_score
9
+ - _VILA1-5-13B_clip_score
10
+ - _prompt_clip_score
11
+ clip_thr_temperature: 0.1
12
+ clip_thr: 25.0
13
+ load_text_feat: false
14
+ load_vae_feat: false
15
+ transform: default_train
16
+ type: SanaImgDataset
17
+ sort_dataset: false
18
+ # model config
19
+ model:
20
+ model: SanaMS_600M_P1_D28
21
+ image_size: 512
22
+ mixed_precision: fp16
23
+ fp32_attention: true
24
+ load_from:
25
+ resume_from:
26
+ aspect_ratio_type: ASPECT_RATIO_512
27
+ multi_scale: false
28
+ #pe_interpolation: 1.
29
+ attn_type: linear
30
+ linear_head_dim: 32
31
+ ffn_type: glumbconv
32
+ mlp_acts:
33
+ - silu
34
+ - silu
35
+ - null
36
+ mlp_ratio: 2.5
37
+ use_pe: false
38
+ qk_norm: false
39
+ class_dropout_prob: 0.1
40
+ # VAE setting
41
+ vae:
42
+ vae_type: dc-ae
43
+ vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
44
+ scale_factor: 0.41407
45
+ vae_latent_dim: 32
46
+ vae_downsample_rate: 32
47
+ sample_posterior: true
48
+ # text encoder
49
+ text_encoder:
50
+ text_encoder_name: gemma-2-2b-it
51
+ y_norm: true
52
+ y_norm_scale_factor: 0.01
53
+ model_max_length: 300
54
+ # CHI
55
+ chi_prompt:
56
+ - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
57
+ - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
58
+ - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
59
+ - 'Here are examples of how to transform or refine prompts:'
60
+ - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
61
+ - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
62
+ - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
63
+ - 'User Prompt: '
64
+ # Sana schedule Flow
65
+ scheduler:
66
+ predict_v: true
67
+ noise_schedule: linear_flow
68
+ pred_sigma: false
69
+ flow_shift: 1.0
70
+ # logit-normal timestep
71
+ weighting_scheme: logit_normal
72
+ logit_mean: 0.0
73
+ logit_std: 1.0
74
+ vis_sampler: flow_dpm-solver
75
+ # training setting
76
+ train:
77
+ num_workers: 10
78
+ seed: 1
79
+ train_batch_size: 128
80
+ num_epochs: 100
81
+ gradient_accumulation_steps: 1
82
+ grad_checkpointing: true
83
+ gradient_clip: 0.1
84
+ optimizer:
85
+ betas:
86
+ - 0.9
87
+ - 0.999
88
+ - 0.9999
89
+ eps:
90
+ - 1.0e-30
91
+ - 1.0e-16
92
+ lr: 0.0001
93
+ type: CAMEWrapper
94
+ weight_decay: 0.0
95
+ lr_schedule: constant
96
+ lr_schedule_args:
97
+ num_warmup_steps: 2000
98
+ local_save_vis: true # if save log image locally
99
+ visualize: true
100
+ eval_sampling_steps: 500
101
+ log_interval: 20
102
+ save_model_epochs: 5
103
+ save_model_steps: 500
104
+ work_dir: output/debug
105
+ online_metric: false
106
+ eval_metric_step: 2000
107
+ online_metric_dir: metric_helper
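All of these dataset blocks point to external caption and CLIP-score sidecar JSON files (external_caption_suffixes / external_clipscore_suffixes) and gate caption choice with clip_thr: 25.0 and clip_thr_temperature: 0.1. The dataset code later in this commit (diffusion/data/datasets/sana_data.py) implements the full selection, including a fallback to the highest-scoring caption; the sketch below only illustrates the temperature-weighted sampling idea with made-up scores:

```python
# Hedged sketch of CLIP-score-weighted caption-source sampling (illustration only).
import random

import numpy as np


def sample_caption_source(scores, clip_thr=25.0, temperature=0.1):
    """Pick a caption source; if all are below clip_thr, fall back to the best one."""
    labels = [k for k, v in scores.items() if v >= clip_thr]
    if not labels:
        return max(scores, key=scores.get)
    weights = np.array([scores[k] for k in labels]) ** (1.0 / max(temperature, 0.01))
    weights = weights / weights.sum()
    return random.choices(labels, weights=weights, k=1)[0]


print(sample_caption_source({"prompt": 24.0, "InternVL2-26B": 27.5, "VILA1-5-13B": 26.0}))
```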
diffusion/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+
6
+ from .dpm_solver import DPMS
7
+ from .flow_euler_sampler import FlowEuler
8
+ from .iddpm import Scheduler
9
+ from .sa_sampler import SASolverSampler
diffusion/data/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .datasets import *
2
+ from .transforms import get_transform
diffusion/data/builder.py ADDED
@@ -0,0 +1,76 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import os
18
+ import time
19
+
20
+ from mmcv import Registry, build_from_cfg
21
+ from termcolor import colored
22
+ from torch.utils.data import DataLoader
23
+
24
+ from diffusion.data.transforms import get_transform
25
+ from diffusion.utils.logger import get_root_logger
26
+
27
+ DATASETS = Registry("datasets")
28
+
29
+ DATA_ROOT = "data"
30
+
31
+
32
+ def set_data_root(data_root):
33
+ global DATA_ROOT
34
+ DATA_ROOT = data_root
35
+
36
+
37
+ def get_data_path(data_dir):
38
+ if os.path.isabs(data_dir):
39
+ return data_dir
40
+ global DATA_ROOT
41
+ return os.path.join(DATA_ROOT, data_dir)
42
+
43
+
44
+ def get_data_root_and_path(data_dir):
45
+ if os.path.isabs(data_dir):
46
+ return data_dir
47
+ global DATA_ROOT
48
+ return DATA_ROOT, os.path.join(DATA_ROOT, data_dir)
49
+
50
+
51
+ def build_dataset(cfg, resolution=224, **kwargs):
52
+ logger = get_root_logger()
53
+
54
+ dataset_type = cfg.get("type")
55
+ logger.info(f"Constructing dataset {dataset_type}...")
56
+ t = time.time()
57
+ transform = cfg.pop("transform", "default_train")
58
+ transform = get_transform(transform, resolution)
59
+ dataset = build_from_cfg(cfg, DATASETS, default_args=dict(transform=transform, resolution=resolution, **kwargs))
60
+ logger.info(
61
+ f"{colored(f'Dataset {dataset_type} constructed: ', 'green', attrs=['bold'])}"
62
+ f"time: {(time.time() - t):.2f} s, length (use/ori): {len(dataset)}/{dataset.ori_imgs_nums}"
63
+ )
64
+ return dataset
65
+
66
+
67
+ def build_dataloader(dataset, batch_size=256, num_workers=4, shuffle=True, **kwargs):
68
+ if "batch_sampler" in kwargs:
69
+ dataloader = DataLoader(
70
+ dataset, batch_sampler=kwargs["batch_sampler"], num_workers=num_workers, pin_memory=True
71
+ )
72
+ else:
73
+ dataloader = DataLoader(
74
+ dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, pin_memory=True, **kwargs
75
+ )
76
+ return dataloader
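A minimal usage sketch for the builder above, pairing it with the sample data directory referenced by configs/sana_config/512ms/sample_dataset.yaml. The exact keyword set expected by SanaImgDataset and the contents of asset/example_data are assumptions here; treat this as a sketch rather than a verified recipe:

```python
# Hedged usage sketch for build_dataset / build_dataloader (illustration only).
from diffusion.data.builder import build_dataloader, build_dataset

cfg = dict(
    type="SanaImgDataset",
    data_dir=["asset/example_data"],
    transform="default_train",  # popped by build_dataset and resolved via get_transform
)
dataset = build_dataset(cfg, resolution=512)
loader = build_dataloader(dataset, batch_size=2, num_workers=0, shuffle=True)
batch = next(iter(loader))
```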
diffusion/data/datasets/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .sana_data import SanaImgDataset, SanaWebDataset
2
+ from .sana_data_multi_scale import DummyDatasetMS, SanaWebDatasetMS
3
+ from .utils import *
diffusion/data/datasets/sana_data.py ADDED
@@ -0,0 +1,467 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ # This file is modified from https://github.com/PixArt-alpha/PixArt-sigma
18
+ import getpass
19
+ import json
20
+ import os
21
+ import os.path as osp
22
+ import random
23
+
24
+ import numpy as np
25
+ import torch
26
+ import torch.distributed as dist
27
+ from PIL import Image
28
+ from termcolor import colored
29
+ from torch.utils.data import Dataset
30
+
31
+ from diffusion.data.builder import DATASETS, get_data_path
32
+ from diffusion.data.wids import ShardListDataset, ShardListDatasetMulti, lru_json_load
33
+ from diffusion.utils.logger import get_root_logger
34
+
35
+
36
+ @DATASETS.register_module()
37
+ class SanaImgDataset(torch.utils.data.Dataset):
38
+ def __init__(
39
+ self,
40
+ data_dir="",
41
+ transform=None,
42
+ resolution=256,
43
+ load_vae_feat=False,
44
+ load_text_feat=False,
45
+ max_length=300,
46
+ config=None,
47
+ caption_proportion=None,
48
+ external_caption_suffixes=None,
49
+ external_clipscore_suffixes=None,
50
+ clip_thr=0.0,
51
+ clip_thr_temperature=1.0,
52
+ img_extension=".png",
53
+ **kwargs,
54
+ ):
55
+ if external_caption_suffixes is None:
56
+ external_caption_suffixes = []
57
+ if external_clipscore_suffixes is None:
58
+ external_clipscore_suffixes = []
59
+
60
+ self.logger = (
61
+ get_root_logger() if config is None else get_root_logger(osp.join(config.work_dir, "train_log.log"))
62
+ )
63
+ self.transform = transform if not load_vae_feat else None
64
+ self.load_vae_feat = load_vae_feat
65
+ self.load_text_feat = load_text_feat
66
+ self.resolution = resolution
67
+ self.max_length = max_length
68
+ self.caption_proportion = caption_proportion if caption_proportion is not None else {"prompt": 1.0}
69
+ self.external_caption_suffixes = external_caption_suffixes
70
+ self.external_clipscore_suffixes = external_clipscore_suffixes
71
+ self.clip_thr = clip_thr
72
+ self.clip_thr_temperature = clip_thr_temperature
73
+ self.default_prompt = "prompt"
74
+ self.img_extension = img_extension
75
+
76
+ self.data_dirs = data_dir if isinstance(data_dir, list) else [data_dir]
77
+ # self.meta_datas = [osp.join(data_dir, "meta_data.json") for data_dir in self.data_dirs]
78
+ self.dataset = []
79
+ for data_dir in self.data_dirs:
80
+ meta_data = json.load(open(osp.join(data_dir, "meta_data.json")))
81
+ self.dataset.extend([osp.join(data_dir, i) for i in meta_data["img_names"]])
82
+
83
+ self.dataset = self.dataset * 2000
84
+ self.logger.info(colored("Dataset is repeated 2000 times for the toy dataset", "red", attrs=["bold"]))
85
+ self.ori_imgs_nums = len(self)
86
+ self.logger.info(f"Dataset samples: {len(self.dataset)}")
87
+
88
+ self.logger.info(f"Loading external caption json from: original_filename{external_caption_suffixes}.json")
89
+ self.logger.info(f"Loading external clipscore json from: original_filename{external_clipscore_suffixes}.json")
90
+ self.logger.info(f"external caption clipscore threshold: {clip_thr}, temperature: {clip_thr_temperature}")
91
+ self.logger.info(f"T5 max token length: {self.max_length}")
92
+
93
+ def getdata(self, idx):
94
+ data = self.dataset[idx]
95
+ self.key = data.split("/")[-1]
96
+ # info = json.load(open(f"{data}.json"))[self.key]
97
+ info = {}
98
+ with open(f"{data}.txt") as f:
99
+ info[self.default_prompt] = f.readlines()[0].strip()
100
+
101
+ # external json file
102
+ for suffix in self.external_caption_suffixes:
103
+ caption_json_path = f"{data}{suffix}.json"
104
+ if os.path.exists(caption_json_path):
105
+ try:
106
+ caption_json = lru_json_load(caption_json_path)
107
+ except Exception:
108
+ caption_json = {}
109
+ if self.key in caption_json:
110
+ info.update(caption_json[self.key])
111
+
112
+ caption_type, caption_clipscore = self.weighted_sample_clipscore(data, info)
113
+ caption_type = caption_type if caption_type in info else self.default_prompt
114
+ txt_fea = "" if info[caption_type] is None else info[caption_type]
115
+
116
+ data_info = {
117
+ "img_hw": torch.tensor([self.resolution, self.resolution], dtype=torch.float32),
118
+ "aspect_ratio": torch.tensor(1.0),
119
+ }
120
+
121
+ if self.load_vae_feat:
122
+ raise ValueError("Load VAE is not supported now")
123
+ else:
124
+ img = f"{data}{self.img_extension}"
125
+ img = Image.open(img)
126
+ if self.transform:
127
+ img = self.transform(img)
128
+
129
+ attention_mask = torch.ones(1, 1, self.max_length, dtype=torch.int16) # 1x1xT
130
+ if self.load_text_feat:
131
+ npz_path = f"{self.key}.npz"
132
+ txt_info = np.load(npz_path)
133
+ txt_fea = torch.from_numpy(txt_info["caption_feature"]) # 1xTx4096
134
+ if "attention_mask" in txt_info:
135
+ attention_mask = torch.from_numpy(txt_info["attention_mask"])[None]
136
+ # make sure the feature length are the same
137
+ if txt_fea.shape[1] != self.max_length:
138
+ txt_fea = torch.cat([txt_fea, txt_fea[:, -1:].repeat(1, self.max_length - txt_fea.shape[1], 1)], dim=1)
139
+ attention_mask = torch.cat(
140
+ [attention_mask, torch.zeros(1, 1, self.max_length - attention_mask.shape[-1])], dim=-1
141
+ )
142
+
143
+ return (
144
+ img,
145
+ txt_fea,
146
+ attention_mask.to(torch.int16),
147
+ data_info,
148
+ idx,
149
+ caption_type,
150
+ "",
151
+ str(caption_clipscore),
152
+ )
153
+
154
+ def __getitem__(self, idx):
155
+ for _ in range(10):
156
+ try:
157
+ data = self.getdata(idx)
158
+ return data
159
+ except Exception as e:
160
+ print(f"Error details: {str(e)}")
161
+ idx = idx + 1
162
+ raise RuntimeError("Too many bad data.")
163
+
164
+ def __len__(self):
165
+ return len(self.dataset)
166
+
167
+ def weighted_sample_fix_prob(self):
168
+ labels = list(self.caption_proportion.keys())
169
+ weights = list(self.caption_proportion.values())
170
+ sampled_label = random.choices(labels, weights=weights, k=1)[0]
171
+ return sampled_label
172
+
173
+ def weighted_sample_clipscore(self, data, info):
174
+ labels = []
175
+ weights = []
176
+ fallback_label = None
177
+ max_clip_score = float("-inf")
178
+
179
+ for suffix in self.external_clipscore_suffixes:
180
+ clipscore_json_path = f"{data}{suffix}.json"
181
+
182
+ if os.path.exists(clipscore_json_path):
183
+ try:
184
+ clipscore_json = lru_json_load(clipscore_json_path)
185
+ except Exception:
186
+ clipscore_json = {}
187
+ if self.key in clipscore_json:
188
+ clip_scores = clipscore_json[self.key]
189
+
190
+ for caption_type, clip_score in clip_scores.items():
191
+ clip_score = float(clip_score)
192
+ if caption_type in info:
193
+ if clip_score >= self.clip_thr:
194
+ labels.append(caption_type)
195
+ weights.append(clip_score)
196
+
197
+ if clip_score > max_clip_score:
198
+ max_clip_score = clip_score
199
+ fallback_label = caption_type
200
+
201
+ if not labels and fallback_label:
202
+ return fallback_label, max_clip_score
203
+
204
+ if not labels:
205
+ return self.default_prompt, 0.0
206
+
207
+ adjusted_weights = np.array(weights) ** (1.0 / max(self.clip_thr_temperature, 0.01))
208
+ normalized_weights = adjusted_weights / np.sum(adjusted_weights)
209
+ sampled_label = random.choices(labels, weights=normalized_weights, k=1)[0]
210
+ # sampled_label = random.choices(labels, weights=[1]*len(weights), k=1)[0]
211
+ index = labels.index(sampled_label)
212
+ original_weight = weights[index]
213
+
214
+ return sampled_label, original_weight
215
+
216
+
217
+ @DATASETS.register_module()
218
+ class SanaWebDataset(torch.utils.data.Dataset):
219
+ def __init__(
220
+ self,
221
+ data_dir="",
222
+ meta_path=None,
223
+ cache_dir="/cache/data/sana-webds-meta",
224
+ max_shards_to_load=None,
225
+ transform=None,
226
+ resolution=256,
227
+ load_vae_feat=False,
228
+ load_text_feat=False,
229
+ max_length=300,
230
+ config=None,
231
+ caption_proportion=None,
232
+ sort_dataset=False,
233
+ num_replicas=None,
234
+ external_caption_suffixes=None,
235
+ external_clipscore_suffixes=None,
236
+ clip_thr=0.0,
237
+ clip_thr_temperature=1.0,
238
+ **kwargs,
239
+ ):
240
+ if external_caption_suffixes is None:
241
+ external_caption_suffixes = []
242
+ if external_clipscore_suffixes is None:
243
+ external_clipscore_suffixes = []
244
+
245
+ self.logger = (
246
+ get_root_logger() if config is None else get_root_logger(osp.join(config.work_dir, "train_log.log"))
247
+ )
248
+ self.transform = transform if not load_vae_feat else None
249
+ self.load_vae_feat = load_vae_feat
250
+ self.load_text_feat = load_text_feat
251
+ self.resolution = resolution
252
+ self.max_length = max_length
253
+ self.caption_proportion = caption_proportion if caption_proportion is not None else {"prompt": 1.0}
254
+ self.external_caption_suffixes = external_caption_suffixes
255
+ self.external_clipscore_suffixes = external_clipscore_suffixes
256
+ self.clip_thr = clip_thr
257
+ self.clip_thr_temperature = clip_thr_temperature
258
+ self.default_prompt = "prompt"
259
+
260
+ data_dirs = data_dir if isinstance(data_dir, list) else [data_dir]
261
+ meta_paths = meta_path if isinstance(meta_path, list) else [meta_path] * len(data_dirs)
262
+ self.meta_paths = []
263
+ for data_path, meta_path in zip(data_dirs, meta_paths):
264
+ self.data_path = osp.expanduser(data_path)
265
+ self.meta_path = osp.expanduser(meta_path) if meta_path is not None else None
266
+
267
+ _local_meta_path = osp.join(self.data_path, "wids-meta.json")
268
+ if meta_path is None and osp.exists(_local_meta_path):
269
+ self.logger.info(f"loading from {_local_meta_path}")
270
+ self.meta_path = meta_path = _local_meta_path
271
+
272
+ if meta_path is None:
273
+ self.meta_path = osp.join(
274
+ osp.expanduser(cache_dir),
275
+ self.data_path.replace("/", "--") + f".max_shards:{max_shards_to_load}" + ".wdsmeta.json",
276
+ )
277
+
278
+ assert osp.exists(self.meta_path), f"meta path not found in [{self.meta_path}] or [{_local_meta_path}]"
279
+ self.logger.info(f"[SimplyInternal] Loading meta information {self.meta_path}")
280
+ self.meta_paths.append(self.meta_path)
281
+
282
+ self._initialize_dataset(num_replicas, sort_dataset)
283
+
284
+ self.logger.info(f"Loading external caption json from: original_filename{external_caption_suffixes}.json")
285
+ self.logger.info(f"Loading external clipscore json from: original_filename{external_clipscore_suffixes}.json")
286
+ self.logger.info(f"external caption clipscore threshold: {clip_thr}, temperature: {clip_thr_temperature}")
287
+ self.logger.info(f"T5 max token length: {self.max_length}")
288
+ self.logger.warning(f"Sort the dataset: {sort_dataset}")
289
+
290
+ def _initialize_dataset(self, num_replicas, sort_dataset):
291
+ # uuid = abs(hash(self.meta_path)) % (10 ** 8)
292
+ import hashlib
293
+
294
+ uuid = hashlib.sha256(self.meta_path.encode()).hexdigest()[:8]
295
+ if len(self.meta_paths) > 0:
296
+ self.dataset = ShardListDatasetMulti(
297
+ self.meta_paths,
298
+ cache_dir=osp.expanduser(f"~/.cache/_wids_cache/{getpass.getuser()}-{uuid}"),
299
+ sort_data_inseq=sort_dataset,
300
+ num_replicas=num_replicas or dist.get_world_size(),
301
+ )
302
+ else:
303
+ # TODO: tmp to ensure there is no bug
304
+ self.dataset = ShardListDataset(
305
+ self.meta_path,
306
+ cache_dir=osp.expanduser(f"~/.cache/_wids_cache/{getpass.getuser()}-{uuid}"),
307
+ )
308
+ self.ori_imgs_nums = len(self)
309
+ self.logger.info(f"{self.dataset.data_info}")
310
+
311
+ def getdata(self, idx):
312
+ data = self.dataset[idx]
313
+ info = data[".json"]
314
+ self.key = data["__key__"]
315
+ dataindex_info = {
316
+ "index": data["__index__"],
317
+ "shard": "/".join(data["__shard__"].rsplit("/", 2)[-2:]),
318
+ "shardindex": data["__shardindex__"],
319
+ }
320
+
321
+ # external json file
322
+ for suffix in self.external_caption_suffixes:
323
+ caption_json_path = data["__shard__"].replace(".tar", f"{suffix}.json")
324
+ if os.path.exists(caption_json_path):
325
+ try:
326
+ caption_json = lru_json_load(caption_json_path)
327
+ except:
328
+ caption_json = {}
329
+ if self.key in caption_json:
330
+ info.update(caption_json[self.key])
331
+
332
+ caption_type, caption_clipscore = self.weighted_sample_clipscore(data, info)
333
+ caption_type = caption_type if caption_type in info else self.default_prompt
334
+ txt_fea = "" if info[caption_type] is None else info[caption_type]
335
+
336
+ data_info = {
337
+ "img_hw": torch.tensor([self.resolution, self.resolution], dtype=torch.float32),
338
+ "aspect_ratio": torch.tensor(1.0),
339
+ }
340
+
341
+ if self.load_vae_feat:
342
+ img = data[".npy"]
343
+ else:
344
+ img = data[".png"] if ".png" in data else data[".jpg"]
345
+ if self.transform:
346
+ img = self.transform(img)
347
+
348
+ attention_mask = torch.ones(1, 1, self.max_length, dtype=torch.int16) # 1x1xT
349
+ if self.load_text_feat:
350
+ npz_path = f"{self.key}.npz"
351
+ txt_info = np.load(npz_path)
352
+ txt_fea = torch.from_numpy(txt_info["caption_feature"]) # 1xTx4096
353
+ if "attention_mask" in txt_info:
354
+ attention_mask = torch.from_numpy(txt_info["attention_mask"])[None]
355
+ # make sure the feature lengths are the same
356
+ if txt_fea.shape[1] != self.max_length:
357
+ txt_fea = torch.cat([txt_fea, txt_fea[:, -1:].repeat(1, self.max_length - txt_fea.shape[1], 1)], dim=1)
358
+ attention_mask = torch.cat(
359
+ [attention_mask, torch.zeros(1, 1, self.max_length - attention_mask.shape[-1])], dim=-1
360
+ )
361
+
362
+ return (
363
+ img,
364
+ txt_fea,
365
+ attention_mask.to(torch.int16),
366
+ data_info,
367
+ idx,
368
+ caption_type,
369
+ dataindex_info,
370
+ str(caption_clipscore),
371
+ )
372
+
373
+ def __getitem__(self, idx):
374
+ for _ in range(10):
375
+ try:
376
+ data = self.getdata(idx)
377
+ return data
378
+ except Exception as e:
379
+ print(f"Error details: {str(e)}")
380
+ idx = idx + 1
381
+ raise RuntimeError("Too many bad data.")
382
+
383
+ def __len__(self):
384
+ return len(self.dataset)
385
+
386
+ def weighted_sample_fix_prob(self):
387
+ labels = list(self.caption_proportion.keys())
388
+ weights = list(self.caption_proportion.values())
389
+ sampled_label = random.choices(labels, weights=weights, k=1)[0]
390
+ return sampled_label
391
+
392
+ def weighted_sample_clipscore(self, data, info):
393
+ labels = []
394
+ weights = []
395
+ fallback_label = None
396
+ max_clip_score = float("-inf")
397
+
398
+ for suffix in self.external_clipscore_suffixes:
399
+ clipscore_json_path = data["__shard__"].replace(".tar", f"{suffix}.json")
400
+
401
+ if os.path.exists(clipscore_json_path):
402
+ try:
403
+ clipscore_json = lru_json_load(clipscore_json_path)
404
+ except:
405
+ clipscore_json = {}
406
+ if self.key in clipscore_json:
407
+ clip_scores = clipscore_json[self.key]
408
+
409
+ for caption_type, clip_score in clip_scores.items():
410
+ clip_score = float(clip_score)
411
+ if caption_type in info:
412
+ if clip_score >= self.clip_thr:
413
+ labels.append(caption_type)
414
+ weights.append(clip_score)
415
+
416
+ if clip_score > max_clip_score:
417
+ max_clip_score = clip_score
418
+ fallback_label = caption_type
419
+
420
+ if not labels and fallback_label:
421
+ return fallback_label, max_clip_score
422
+
423
+ if not labels:
424
+ return self.default_prompt, 0.0
425
+
426
+ adjusted_weights = np.array(weights) ** (1.0 / max(self.clip_thr_temperature, 0.01))
427
+ normalized_weights = adjusted_weights / np.sum(adjusted_weights)
428
+ sampled_label = random.choices(labels, weights=normalized_weights, k=1)[0]
429
+ # sampled_label = random.choices(labels, weights=[1]*len(weights), k=1)[0]
430
+ index = labels.index(sampled_label)
431
+ original_weight = weights[index]
432
+
433
+ return sampled_label, original_weight
434
+
435
+ def get_data_info(self, idx):
436
+ try:
437
+ data = self.dataset[idx]
438
+ info = data[".json"]
439
+ key = data["__key__"]
440
+ version = info.get("version", "others")
441
+ return {"height": info["height"], "width": info["width"], "version": version, "key": key}
442
+ except Exception as e:
443
+ print(f"Error details: {str(e)}")
444
+ return None
445
+
446
+
447
+ if __name__ == "__main__":
448
+ from torch.utils.data import DataLoader
449
+
450
+ from diffusion.data.transforms import get_transform
451
+
452
+ image_size = 1024 # 256
453
+ transform = get_transform("default_train", image_size)
454
+ train_dataset = SanaWebDataset(
455
+ data_dir="debug_data_train/vaef32c32/debug_data",
456
+ resolution=image_size,
457
+ transform=transform,
458
+ max_length=300,
459
+ load_vae_feat=True,
460
+ num_replicas=1,
461
+ )
462
+ dataloader = DataLoader(train_dataset, batch_size=32, shuffle=False, num_workers=4)
463
+
464
+ for data in dataloader:
465
+ img, txt_fea, attention_mask, data_info = data
466
+ print(txt_fea)
467
+ break
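A minimal, self-contained sketch of the temperature-weighted caption sampling that weighted_sample_clipscore performs above; the caption types and scores here are illustrative toy values, not taken from any dataset:

import random
import numpy as np

clip_scores = {"prompt": 0.31, "vlm_caption": 0.27, "alt_text": 0.18}  # toy values
clip_thr = 0.2              # caption types scoring below this threshold are dropped
clip_thr_temperature = 0.5  # temperatures < 1.0 sharpen sampling toward high scores

labels = [k for k, v in clip_scores.items() if v >= clip_thr]
weights = [clip_scores[k] for k in labels]

# Same re-weighting as in the dataset code: w ** (1 / T), then normalize.
adjusted = np.array(weights) ** (1.0 / max(clip_thr_temperature, 0.01))
normalized = adjusted / np.sum(adjusted)
sampled_label = random.choices(labels, weights=normalized, k=1)[0]
print(sampled_label, clip_scores[sampled_label])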
diffusion/data/datasets/sana_data_multi_scale.py ADDED
@@ -0,0 +1,265 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ # This file is modified from https://github.com/PixArt-alpha/PixArt-sigma
18
+ import os
19
+ import random
20
+
21
+ import numpy as np
22
+ import torch
23
+ from torchvision import transforms as T
24
+ from torchvision.transforms.functional import InterpolationMode
25
+ from tqdm import tqdm
26
+
27
+ from diffusion.data.builder import DATASETS
28
+ from diffusion.data.datasets.sana_data import SanaWebDataset
29
+ from diffusion.data.datasets.utils import *
30
+ from diffusion.data.wids import lru_json_load
31
+
32
+
33
+ def get_closest_ratio(height: float, width: float, ratios: dict):
34
+ aspect_ratio = height / width
35
+ closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - aspect_ratio))
36
+ return ratios[closest_ratio], float(closest_ratio)
37
+
38
+
39
+ @DATASETS.register_module()
40
+ class SanaWebDatasetMS(SanaWebDataset):
41
+ def __init__(
42
+ self,
43
+ data_dir="",
44
+ meta_path=None,
45
+ cache_dir="/cache/data/sana-webds-meta",
46
+ max_shards_to_load=None,
47
+ transform=None,
48
+ resolution=256,
49
+ sample_subset=None,
50
+ load_vae_feat=False,
51
+ load_text_feat=False,
52
+ input_size=32,
53
+ patch_size=2,
54
+ max_length=300,
55
+ config=None,
56
+ caption_proportion=None,
57
+ sort_dataset=False,
58
+ num_replicas=None,
59
+ external_caption_suffixes=None,
60
+ external_clipscore_suffixes=None,
61
+ clip_thr=0.0,
62
+ clip_thr_temperature=1.0,
63
+ vae_downsample_rate=32,
64
+ **kwargs,
65
+ ):
66
+ super().__init__(
67
+ data_dir=data_dir,
68
+ meta_path=meta_path,
69
+ cache_dir=cache_dir,
70
+ max_shards_to_load=max_shards_to_load,
71
+ transform=transform,
72
+ resolution=resolution,
73
+ sample_subset=sample_subset,
74
+ load_vae_feat=load_vae_feat,
75
+ load_text_feat=load_text_feat,
76
+ input_size=input_size,
77
+ patch_size=patch_size,
78
+ max_length=max_length,
79
+ config=config,
80
+ caption_proportion=caption_proportion,
81
+ sort_dataset=sort_dataset,
82
+ num_replicas=num_replicas,
83
+ external_caption_suffixes=external_caption_suffixes,
84
+ external_clipscore_suffixes=external_clipscore_suffixes,
85
+ clip_thr=clip_thr,
86
+ clip_thr_temperature=clip_thr_temperature,
87
+ vae_downsample_rate=32,
88
+ **kwargs,
89
+ )
90
+ self.base_size = int(kwargs["aspect_ratio_type"].split("_")[-1])
91
+ self.aspect_ratio = eval(kwargs.pop("aspect_ratio_type")) # base aspect ratio
92
+ self.ratio_index = {}
93
+ self.ratio_nums = {}
94
+ self.interpolate_model = InterpolationMode.BICUBIC
95
+ self.interpolate_model = (
96
+ InterpolationMode.BICUBIC
97
+ if self.aspect_ratio not in [ASPECT_RATIO_2048, ASPECT_RATIO_2880]
98
+ else InterpolationMode.LANCZOS
99
+ )
100
+
101
+ for k, v in self.aspect_ratio.items():
102
+ self.ratio_index[float(k)] = []
103
+ self.ratio_nums[float(k)] = 0
104
+
105
+ self.vae_downsample_rate = vae_downsample_rate
106
+
107
+ def __getitem__(self, idx):
108
+ for _ in range(10):
109
+ try:
110
+ data = self.getdata(idx)
111
+ return data
112
+ except Exception as e:
113
+ print(f"Error details: {str(e)}")
114
+ idx = random.choice(self.ratio_index[self.closest_ratio])
115
+ raise RuntimeError("Too many bad data.")
116
+
117
+ def getdata(self, idx):
118
+ data = self.dataset[idx]
119
+ info = data[".json"]
120
+ self.key = data["__key__"]
121
+ dataindex_info = {
122
+ "index": data["__index__"],
123
+ "shard": "/".join(data["__shard__"].rsplit("/", 2)[-2:]),
124
+ "shardindex": data["__shardindex__"],
125
+ }
126
+
127
+ # external json file
128
+ for suffix in self.external_caption_suffixes:
129
+ caption_json_path = data["__shard__"].replace(".tar", f"{suffix}.json")
130
+ if os.path.exists(caption_json_path):
131
+ try:
132
+ caption_json = lru_json_load(caption_json_path)
133
+ except:
134
+ caption_json = {}
135
+ if self.key in caption_json:
136
+ info.update(caption_json[self.key])
137
+
138
+ data_info = {}
139
+ ori_h, ori_w = info["height"], info["width"]
140
+
141
+ # Calculate the closest aspect ratio and resize & crop image[w, h]
142
+ closest_size, closest_ratio = get_closest_ratio(ori_h, ori_w, self.aspect_ratio)
143
+ closest_size = list(map(lambda x: int(x), closest_size))
144
+ self.closest_ratio = closest_ratio
145
+
146
+ data_info["img_hw"] = torch.tensor([ori_h, ori_w], dtype=torch.float32)
147
+ data_info["aspect_ratio"] = closest_ratio
148
+
149
+ caption_type, caption_clipscore = self.weighted_sample_clipscore(data, info)
150
+ caption_type = caption_type if caption_type in info else self.default_prompt
151
+ txt_fea = "" if info[caption_type] is None else info[caption_type]
152
+
153
+ if self.load_vae_feat:
154
+ img = data[".npy"]
155
+ if len(img.shape) == 4 and img.shape[0] == 1:
156
+ img = img[0]
157
+ h, w = (img.shape[1], img.shape[2])
158
+ assert h == int(closest_size[0] // self.vae_downsample_rate) and w == int(
159
+ closest_size[1] // self.vae_downsample_rate
160
+ ), f"h: {h}, w: {w}, ori_hw: {closest_size}, data_info: {dataindex_info}"
161
+ else:
162
+ img = data[".png"] if ".png" in data else data[".jpg"]
163
+ if closest_size[0] / ori_h > closest_size[1] / ori_w:
164
+ resize_size = closest_size[0], int(ori_w * closest_size[0] / ori_h)
165
+ else:
166
+ resize_size = int(ori_h * closest_size[1] / ori_w), closest_size[1]
167
+ self.transform = T.Compose(
168
+ [
169
+ T.Lambda(lambda img: img.convert("RGB")),
170
+ T.Resize(resize_size, interpolation=self.interpolate_model), # Image.BICUBIC
171
+ T.CenterCrop(closest_size),
172
+ T.ToTensor(),
173
+ T.Normalize([0.5], [0.5]),
174
+ ]
175
+ )
176
+ if idx not in self.ratio_index[closest_ratio]:
177
+ self.ratio_index[closest_ratio].append(idx)
178
+
179
+ if self.transform:
180
+ img = self.transform(img)
181
+
182
+ attention_mask = torch.ones(1, 1, self.max_length, dtype=torch.int16) # 1x1xT
183
+ if self.load_text_feat:
184
+ npz_path = f"{self.key}.npz"
185
+ txt_info = np.load(npz_path)
186
+ txt_fea = torch.from_numpy(txt_info["caption_feature"]) # 1xTx4096
187
+ if "attention_mask" in txt_info:
188
+ attention_mask = torch.from_numpy(txt_info["attention_mask"])[None]
189
+ # make sure the feature lengths are the same
190
+ if txt_fea.shape[1] != self.max_length:
191
+ txt_fea = torch.cat([txt_fea, txt_fea[:, -1:].repeat(1, self.max_length - txt_fea.shape[1], 1)], dim=1)
192
+ attention_mask = torch.cat(
193
+ [attention_mask, torch.zeros(1, 1, self.max_length - attention_mask.shape[-1])], dim=-1
194
+ )
195
+
196
+ return (
197
+ img,
198
+ txt_fea,
199
+ attention_mask.to(torch.int16),
200
+ data_info,
201
+ idx,
202
+ caption_type,
203
+ dataindex_info,
204
+ str(caption_clipscore),
205
+ )
206
+
207
+ def __len__(self):
208
+ return len(self.dataset)
209
+
210
+
211
+ @DATASETS.register_module()
212
+ class DummyDatasetMS(SanaWebDatasetMS):
213
+ def __init__(self, **kwargs):
214
+ self.base_size = int(kwargs["aspect_ratio_type"].split("_")[-1])
215
+ self.aspect_ratio = eval(kwargs.pop("aspect_ratio_type")) # base aspect ratio
216
+ self.ratio_index = {}
217
+ self.ratio_nums = {}
218
+ self.interpolate_model = InterpolationMode.BICUBIC
219
+ self.interpolate_model = (
220
+ InterpolationMode.BICUBIC
221
+ if self.aspect_ratio not in [ASPECT_RATIO_2048, ASPECT_RATIO_2880]
222
+ else InterpolationMode.LANCZOS
223
+ )
224
+
225
+ for k, v in self.aspect_ratio.items():
226
+ self.ratio_index[float(k)] = []
227
+ self.ratio_nums[float(k)] = 0
228
+
229
+ self.ori_imgs_nums = 1_000_000
230
+ self.height = 384
231
+ self.width = 672
232
+
233
+ def __getitem__(self, idx):
234
+ img = torch.randn((3, self.height, self.width))
235
+ txt_fea = "The image depicts a young woman standing in the middle of a street, leaning against a silver car. She is dressed in a stylish outfit consisting of a blue blouse and black pants. Her hair is long and dark, and she is looking directly at the camera with a confident expression. The street is lined with colorful buildings, and the trees have autumn leaves, suggesting the season is fall. The lighting is warm, with sunlight casting long shadows on the street. There are a few people in the background, and the overall atmosphere is vibrant and lively."
236
+ attention_mask = torch.ones(1, 1, 300, dtype=torch.int16) # 1x1xT
237
+ data_info = {"img_hw": torch.tensor([816.0, 1456.0]), "aspect_ratio": 0.57}
238
+ idx = 2500
239
+ caption_type = self.default_prompt
240
+ dataindex_info = {"index": 2500, "shard": "data_for_test_after_change/00000000.tar", "shardindex": 2500}
241
+ return img, txt_fea, attention_mask, data_info, idx, caption_type, dataindex_info
242
+
243
+ def __len__(self):
244
+ return self.ori_imgs_nums
245
+
246
+ def get_data_info(self, idx):
247
+ return {"height": self.height, "width": self.width, "version": "1.0", "key": "dummpy_key"}
248
+
249
+
250
+ if __name__ == "__main__":
251
+ from torch.utils.data import DataLoader
252
+
253
+ from diffusion.data.datasets.utils import ASPECT_RATIO_1024
254
+ from diffusion.data.transforms import get_transform
255
+
256
+ image_size = 256
257
+ transform = get_transform("default_train", image_size)
258
+ data_dir = ["data/debug_data_train/debug_data"]
259
+ for data_path in data_dir:
260
+ train_dataset = SanaWebDatasetMS(data_dir=data_path, resolution=image_size, transform=transform, max_length=300)
261
+ dataloader = DataLoader(train_dataset, batch_size=1, shuffle=False, num_workers=4)
262
+
263
+ for data in tqdm(dataloader):
264
+ break
265
+ print(dataloader.dataset.index_info)
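A minimal sketch of the aspect-ratio bucketing that SanaWebDatasetMS applies per image: each image is mapped to the predefined bucket whose ratio is closest to its own, then resized and center-cropped to that bucket. The dictionary below is a small excerpt of ASPECT_RATIO_1024 from diffusion/data/datasets/utils.py, and the 816x1456 input size is just an example:

# A few buckets excerpted from ASPECT_RATIO_1024 (values are [height, width]).
ASPECT_RATIO_1024_SUBSET = {
    "0.52": [704.0, 1344.0],
    "0.57": [768.0, 1344.0],
    "0.6": [768.0, 1280.0],
    "1.0": [1024.0, 1024.0],
}

def get_closest_ratio(height: float, width: float, ratios: dict):
    # Mirrors the helper defined in sana_data_multi_scale.py above.
    aspect_ratio = height / width
    closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - aspect_ratio))
    return ratios[closest_ratio], float(closest_ratio)

size, ratio = get_closest_ratio(816, 1456, ASPECT_RATIO_1024_SUBSET)
print(size, ratio)  # [768.0, 1344.0] 0.57 -> the bucket this image is trained at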
diffusion/data/datasets/utils.py ADDED
@@ -0,0 +1,506 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ # This file is modified from https://github.com/PixArt-alpha/PixArt-sigma
18
+ ASPECT_RATIO_4096 = {
19
+ "0.25": [2048.0, 8192.0],
20
+ "0.26": [2048.0, 7936.0],
21
+ "0.27": [2048.0, 7680.0],
22
+ "0.28": [2048.0, 7424.0],
23
+ "0.32": [2304.0, 7168.0],
24
+ "0.33": [2304.0, 6912.0],
25
+ "0.35": [2304.0, 6656.0],
26
+ "0.4": [2560.0, 6400.0],
27
+ "0.42": [2560.0, 6144.0],
28
+ "0.48": [2816.0, 5888.0],
29
+ "0.5": [2816.0, 5632.0],
30
+ "0.52": [2816.0, 5376.0],
31
+ "0.57": [3072.0, 5376.0],
32
+ "0.6": [3072.0, 5120.0],
33
+ "0.68": [3328.0, 4864.0],
34
+ "0.72": [3328.0, 4608.0],
35
+ "0.78": [3584.0, 4608.0],
36
+ "0.82": [3584.0, 4352.0],
37
+ "0.88": [3840.0, 4352.0],
38
+ "0.94": [3840.0, 4096.0],
39
+ "1.0": [4096.0, 4096.0],
40
+ "1.07": [4096.0, 3840.0],
41
+ "1.13": [4352.0, 3840.0],
42
+ "1.21": [4352.0, 3584.0],
43
+ "1.29": [4608.0, 3584.0],
44
+ "1.38": [4608.0, 3328.0],
45
+ "1.46": [4864.0, 3328.0],
46
+ "1.67": [5120.0, 3072.0],
47
+ "1.75": [5376.0, 3072.0],
48
+ "2.0": [5632.0, 2816.0],
49
+ "2.09": [5888.0, 2816.0],
50
+ "2.4": [6144.0, 2560.0],
51
+ "2.5": [6400.0, 2560.0],
52
+ "2.89": [6656.0, 2304.0],
53
+ "3.0": [6912.0, 2304.0],
54
+ "3.11": [7168.0, 2304.0],
55
+ "3.62": [7424.0, 2048.0],
56
+ "3.75": [7680.0, 2048.0],
57
+ "3.88": [7936.0, 2048.0],
58
+ "4.0": [8192.0, 2048.0],
59
+ }
60
+
61
+ ASPECT_RATIO_2880 = {
62
+ "0.25": [1408.0, 5760.0],
63
+ "0.26": [1408.0, 5568.0],
64
+ "0.27": [1408.0, 5376.0],
65
+ "0.28": [1408.0, 5184.0],
66
+ "0.32": [1600.0, 4992.0],
67
+ "0.33": [1600.0, 4800.0],
68
+ "0.34": [1600.0, 4672.0],
69
+ "0.4": [1792.0, 4480.0],
70
+ "0.42": [1792.0, 4288.0],
71
+ "0.47": [1920.0, 4096.0],
72
+ "0.49": [1920.0, 3904.0],
73
+ "0.51": [1920.0, 3776.0],
74
+ "0.55": [2112.0, 3840.0],
75
+ "0.59": [2112.0, 3584.0],
76
+ "0.68": [2304.0, 3392.0],
77
+ "0.72": [2304.0, 3200.0],
78
+ "0.78": [2496.0, 3200.0],
79
+ "0.83": [2496.0, 3008.0],
80
+ "0.89": [2688.0, 3008.0],
81
+ "0.93": [2688.0, 2880.0],
82
+ "1.0": [2880.0, 2880.0],
83
+ "1.07": [2880.0, 2688.0],
84
+ "1.12": [3008.0, 2688.0],
85
+ "1.21": [3008.0, 2496.0],
86
+ "1.28": [3200.0, 2496.0],
87
+ "1.39": [3200.0, 2304.0],
88
+ "1.47": [3392.0, 2304.0],
89
+ "1.7": [3584.0, 2112.0],
90
+ "1.82": [3840.0, 2112.0],
91
+ "2.03": [3904.0, 1920.0],
92
+ "2.13": [4096.0, 1920.0],
93
+ "2.39": [4288.0, 1792.0],
94
+ "2.5": [4480.0, 1792.0],
95
+ "2.92": [4672.0, 1600.0],
96
+ "3.0": [4800.0, 1600.0],
97
+ "3.12": [4992.0, 1600.0],
98
+ "3.68": [5184.0, 1408.0],
99
+ "3.82": [5376.0, 1408.0],
100
+ "3.95": [5568.0, 1408.0],
101
+ "4.0": [5760.0, 1408.0],
102
+ }
103
+
104
+ ASPECT_RATIO_2048 = {
105
+ "0.25": [1024.0, 4096.0],
106
+ "0.26": [1024.0, 3968.0],
107
+ "0.27": [1024.0, 3840.0],
108
+ "0.28": [1024.0, 3712.0],
109
+ "0.32": [1152.0, 3584.0],
110
+ "0.33": [1152.0, 3456.0],
111
+ "0.35": [1152.0, 3328.0],
112
+ "0.4": [1280.0, 3200.0],
113
+ "0.42": [1280.0, 3072.0],
114
+ "0.48": [1408.0, 2944.0],
115
+ "0.5": [1408.0, 2816.0],
116
+ "0.52": [1408.0, 2688.0],
117
+ "0.57": [1536.0, 2688.0],
118
+ "0.6": [1536.0, 2560.0],
119
+ "0.68": [1664.0, 2432.0],
120
+ "0.72": [1664.0, 2304.0],
121
+ "0.78": [1792.0, 2304.0],
122
+ "0.82": [1792.0, 2176.0],
123
+ "0.88": [1920.0, 2176.0],
124
+ "0.94": [1920.0, 2048.0],
125
+ "1.0": [2048.0, 2048.0],
126
+ "1.07": [2048.0, 1920.0],
127
+ "1.13": [2176.0, 1920.0],
128
+ "1.21": [2176.0, 1792.0],
129
+ "1.29": [2304.0, 1792.0],
130
+ "1.38": [2304.0, 1664.0],
131
+ "1.46": [2432.0, 1664.0],
132
+ "1.67": [2560.0, 1536.0],
133
+ "1.75": [2688.0, 1536.0],
134
+ "2.0": [2816.0, 1408.0],
135
+ "2.09": [2944.0, 1408.0],
136
+ "2.4": [3072.0, 1280.0],
137
+ "2.5": [3200.0, 1280.0],
138
+ "2.89": [3328.0, 1152.0],
139
+ "3.0": [3456.0, 1152.0],
140
+ "3.11": [3584.0, 1152.0],
141
+ "3.62": [3712.0, 1024.0],
142
+ "3.75": [3840.0, 1024.0],
143
+ "3.88": [3968.0, 1024.0],
144
+ "4.0": [4096.0, 1024.0],
145
+ }
146
+
147
+ ASPECT_RATIO_1024 = {
148
+ "0.25": [512.0, 2048.0],
149
+ "0.26": [512.0, 1984.0],
150
+ "0.27": [512.0, 1920.0],
151
+ "0.28": [512.0, 1856.0],
152
+ "0.32": [576.0, 1792.0],
153
+ "0.33": [576.0, 1728.0],
154
+ "0.35": [576.0, 1664.0],
155
+ "0.4": [640.0, 1600.0],
156
+ "0.42": [640.0, 1536.0],
157
+ "0.48": [704.0, 1472.0],
158
+ "0.5": [704.0, 1408.0],
159
+ "0.52": [704.0, 1344.0],
160
+ "0.57": [768.0, 1344.0],
161
+ "0.6": [768.0, 1280.0],
162
+ "0.68": [832.0, 1216.0],
163
+ "0.72": [832.0, 1152.0],
164
+ "0.78": [896.0, 1152.0],
165
+ "0.82": [896.0, 1088.0],
166
+ "0.88": [960.0, 1088.0],
167
+ "0.94": [960.0, 1024.0],
168
+ "1.0": [1024.0, 1024.0],
169
+ "1.07": [1024.0, 960.0],
170
+ "1.13": [1088.0, 960.0],
171
+ "1.21": [1088.0, 896.0],
172
+ "1.29": [1152.0, 896.0],
173
+ "1.38": [1152.0, 832.0],
174
+ "1.46": [1216.0, 832.0],
175
+ "1.67": [1280.0, 768.0],
176
+ "1.75": [1344.0, 768.0],
177
+ "2.0": [1408.0, 704.0],
178
+ "2.09": [1472.0, 704.0],
179
+ "2.4": [1536.0, 640.0],
180
+ "2.5": [1600.0, 640.0],
181
+ "2.89": [1664.0, 576.0],
182
+ "3.0": [1728.0, 576.0],
183
+ "3.11": [1792.0, 576.0],
184
+ "3.62": [1856.0, 512.0],
185
+ "3.75": [1920.0, 512.0],
186
+ "3.88": [1984.0, 512.0],
187
+ "4.0": [2048.0, 512.0],
188
+ }
189
+
190
+ ASPECT_RATIO_512 = {
191
+ "0.25": [256.0, 1024.0],
192
+ "0.26": [256.0, 992.0],
193
+ "0.27": [256.0, 960.0],
194
+ "0.28": [256.0, 928.0],
195
+ "0.32": [288.0, 896.0],
196
+ "0.33": [288.0, 864.0],
197
+ "0.35": [288.0, 832.0],
198
+ "0.4": [320.0, 800.0],
199
+ "0.42": [320.0, 768.0],
200
+ "0.48": [352.0, 736.0],
201
+ "0.5": [352.0, 704.0],
202
+ "0.52": [352.0, 672.0],
203
+ "0.57": [384.0, 672.0],
204
+ "0.6": [384.0, 640.0],
205
+ "0.68": [416.0, 608.0],
206
+ "0.72": [416.0, 576.0],
207
+ "0.78": [448.0, 576.0],
208
+ "0.82": [448.0, 544.0],
209
+ "0.88": [480.0, 544.0],
210
+ "0.94": [480.0, 512.0],
211
+ "1.0": [512.0, 512.0],
212
+ "1.07": [512.0, 480.0],
213
+ "1.13": [544.0, 480.0],
214
+ "1.21": [544.0, 448.0],
215
+ "1.29": [576.0, 448.0],
216
+ "1.38": [576.0, 416.0],
217
+ "1.46": [608.0, 416.0],
218
+ "1.67": [640.0, 384.0],
219
+ "1.75": [672.0, 384.0],
220
+ "2.0": [704.0, 352.0],
221
+ "2.09": [736.0, 352.0],
222
+ "2.4": [768.0, 320.0],
223
+ "2.5": [800.0, 320.0],
224
+ "2.89": [832.0, 288.0],
225
+ "3.0": [864.0, 288.0],
226
+ "3.11": [896.0, 288.0],
227
+ "3.62": [928.0, 256.0],
228
+ "3.75": [960.0, 256.0],
229
+ "3.88": [992.0, 256.0],
230
+ "4.0": [1024.0, 256.0],
231
+ }
232
+
233
+ ASPECT_RATIO_256 = {
234
+ "0.25": [128.0, 512.0],
235
+ "0.26": [128.0, 496.0],
236
+ "0.27": [128.0, 480.0],
237
+ "0.28": [128.0, 464.0],
238
+ "0.32": [144.0, 448.0],
239
+ "0.33": [144.0, 432.0],
240
+ "0.35": [144.0, 416.0],
241
+ "0.4": [160.0, 400.0],
242
+ "0.42": [160.0, 384.0],
243
+ "0.48": [176.0, 368.0],
244
+ "0.5": [176.0, 352.0],
245
+ "0.52": [176.0, 336.0],
246
+ "0.57": [192.0, 336.0],
247
+ "0.6": [192.0, 320.0],
248
+ "0.68": [208.0, 304.0],
249
+ "0.72": [208.0, 288.0],
250
+ "0.78": [224.0, 288.0],
251
+ "0.82": [224.0, 272.0],
252
+ "0.88": [240.0, 272.0],
253
+ "0.94": [240.0, 256.0],
254
+ "1.0": [256.0, 256.0],
255
+ "1.07": [256.0, 240.0],
256
+ "1.13": [272.0, 240.0],
257
+ "1.21": [272.0, 224.0],
258
+ "1.29": [288.0, 224.0],
259
+ "1.38": [288.0, 208.0],
260
+ "1.46": [304.0, 208.0],
261
+ "1.67": [320.0, 192.0],
262
+ "1.75": [336.0, 192.0],
263
+ "2.0": [352.0, 176.0],
264
+ "2.09": [368.0, 176.0],
265
+ "2.4": [384.0, 160.0],
266
+ "2.5": [400.0, 160.0],
267
+ "2.89": [416.0, 144.0],
268
+ "3.0": [432.0, 144.0],
269
+ "3.11": [448.0, 144.0],
270
+ "3.62": [464.0, 128.0],
271
+ "3.75": [480.0, 128.0],
272
+ "3.88": [496.0, 128.0],
273
+ "4.0": [512.0, 128.0],
274
+ }
275
+
276
+ ASPECT_RATIO_256_TEST = {
277
+ "0.25": [128.0, 512.0],
278
+ "0.28": [128.0, 464.0],
279
+ "0.32": [144.0, 448.0],
280
+ "0.33": [144.0, 432.0],
281
+ "0.35": [144.0, 416.0],
282
+ "0.4": [160.0, 400.0],
283
+ "0.42": [160.0, 384.0],
284
+ "0.48": [176.0, 368.0],
285
+ "0.5": [176.0, 352.0],
286
+ "0.52": [176.0, 336.0],
287
+ "0.57": [192.0, 336.0],
288
+ "0.6": [192.0, 320.0],
289
+ "0.68": [208.0, 304.0],
290
+ "0.72": [208.0, 288.0],
291
+ "0.78": [224.0, 288.0],
292
+ "0.82": [224.0, 272.0],
293
+ "0.88": [240.0, 272.0],
294
+ "0.94": [240.0, 256.0],
295
+ "1.0": [256.0, 256.0],
296
+ "1.07": [256.0, 240.0],
297
+ "1.13": [272.0, 240.0],
298
+ "1.21": [272.0, 224.0],
299
+ "1.29": [288.0, 224.0],
300
+ "1.38": [288.0, 208.0],
301
+ "1.46": [304.0, 208.0],
302
+ "1.67": [320.0, 192.0],
303
+ "1.75": [336.0, 192.0],
304
+ "2.0": [352.0, 176.0],
305
+ "2.09": [368.0, 176.0],
306
+ "2.4": [384.0, 160.0],
307
+ "2.5": [400.0, 160.0],
308
+ "3.0": [432.0, 144.0],
309
+ "4.0": [512.0, 128.0],
310
+ }
311
+
312
+ ASPECT_RATIO_512_TEST = {
313
+ "0.25": [256.0, 1024.0],
314
+ "0.28": [256.0, 928.0],
315
+ "0.32": [288.0, 896.0],
316
+ "0.33": [288.0, 864.0],
317
+ "0.35": [288.0, 832.0],
318
+ "0.4": [320.0, 800.0],
319
+ "0.42": [320.0, 768.0],
320
+ "0.48": [352.0, 736.0],
321
+ "0.5": [352.0, 704.0],
322
+ "0.52": [352.0, 672.0],
323
+ "0.57": [384.0, 672.0],
324
+ "0.6": [384.0, 640.0],
325
+ "0.68": [416.0, 608.0],
326
+ "0.72": [416.0, 576.0],
327
+ "0.78": [448.0, 576.0],
328
+ "0.82": [448.0, 544.0],
329
+ "0.88": [480.0, 544.0],
330
+ "0.94": [480.0, 512.0],
331
+ "1.0": [512.0, 512.0],
332
+ "1.07": [512.0, 480.0],
333
+ "1.13": [544.0, 480.0],
334
+ "1.21": [544.0, 448.0],
335
+ "1.29": [576.0, 448.0],
336
+ "1.38": [576.0, 416.0],
337
+ "1.46": [608.0, 416.0],
338
+ "1.67": [640.0, 384.0],
339
+ "1.75": [672.0, 384.0],
340
+ "2.0": [704.0, 352.0],
341
+ "2.09": [736.0, 352.0],
342
+ "2.4": [768.0, 320.0],
343
+ "2.5": [800.0, 320.0],
344
+ "3.0": [864.0, 288.0],
345
+ "4.0": [1024.0, 256.0],
346
+ }
347
+
348
+ ASPECT_RATIO_1024_TEST = {
349
+ "0.25": [512.0, 2048.0],
350
+ "0.28": [512.0, 1856.0],
351
+ "0.32": [576.0, 1792.0],
352
+ "0.33": [576.0, 1728.0],
353
+ "0.35": [576.0, 1664.0],
354
+ "0.4": [640.0, 1600.0],
355
+ "0.42": [640.0, 1536.0],
356
+ "0.48": [704.0, 1472.0],
357
+ "0.5": [704.0, 1408.0],
358
+ "0.52": [704.0, 1344.0],
359
+ "0.57": [768.0, 1344.0],
360
+ "0.6": [768.0, 1280.0],
361
+ "0.68": [832.0, 1216.0],
362
+ "0.72": [832.0, 1152.0],
363
+ "0.78": [896.0, 1152.0],
364
+ "0.82": [896.0, 1088.0],
365
+ "0.88": [960.0, 1088.0],
366
+ "0.94": [960.0, 1024.0],
367
+ "1.0": [1024.0, 1024.0],
368
+ "1.07": [1024.0, 960.0],
369
+ "1.13": [1088.0, 960.0],
370
+ "1.21": [1088.0, 896.0],
371
+ "1.29": [1152.0, 896.0],
372
+ "1.38": [1152.0, 832.0],
373
+ "1.46": [1216.0, 832.0],
374
+ "1.67": [1280.0, 768.0],
375
+ "1.75": [1344.0, 768.0],
376
+ "2.0": [1408.0, 704.0],
377
+ "2.09": [1472.0, 704.0],
378
+ "2.4": [1536.0, 640.0],
379
+ "2.5": [1600.0, 640.0],
380
+ "3.0": [1728.0, 576.0],
381
+ "4.0": [2048.0, 512.0],
382
+ }
383
+
384
+ ASPECT_RATIO_2048_TEST = {
385
+ "0.25": [1024.0, 4096.0],
386
+ "0.26": [1024.0, 3968.0],
387
+ "0.32": [1152.0, 3584.0],
388
+ "0.33": [1152.0, 3456.0],
389
+ "0.35": [1152.0, 3328.0],
390
+ "0.4": [1280.0, 3200.0],
391
+ "0.42": [1280.0, 3072.0],
392
+ "0.48": [1408.0, 2944.0],
393
+ "0.5": [1408.0, 2816.0],
394
+ "0.52": [1408.0, 2688.0],
395
+ "0.57": [1536.0, 2688.0],
396
+ "0.6": [1536.0, 2560.0],
397
+ "0.68": [1664.0, 2432.0],
398
+ "0.72": [1664.0, 2304.0],
399
+ "0.78": [1792.0, 2304.0],
400
+ "0.82": [1792.0, 2176.0],
401
+ "0.88": [1920.0, 2176.0],
402
+ "0.94": [1920.0, 2048.0],
403
+ "1.0": [2048.0, 2048.0],
404
+ "1.07": [2048.0, 1920.0],
405
+ "1.13": [2176.0, 1920.0],
406
+ "1.21": [2176.0, 1792.0],
407
+ "1.29": [2304.0, 1792.0],
408
+ "1.38": [2304.0, 1664.0],
409
+ "1.46": [2432.0, 1664.0],
410
+ "1.67": [2560.0, 1536.0],
411
+ "1.75": [2688.0, 1536.0],
412
+ "2.0": [2816.0, 1408.0],
413
+ "2.09": [2944.0, 1408.0],
414
+ "2.4": [3072.0, 1280.0],
415
+ "2.5": [3200.0, 1280.0],
416
+ "3.0": [3456.0, 1152.0],
417
+ "4.0": [4096.0, 1024.0],
418
+ }
419
+
420
+ ASPECT_RATIO_2880_TEST = {
421
+ "0.25": [2048.0, 8192.0],
422
+ "0.26": [2048.0, 7936.0],
423
+ "0.32": [2304.0, 7168.0],
424
+ "0.33": [2304.0, 6912.0],
425
+ "0.35": [2304.0, 6656.0],
426
+ "0.4": [2560.0, 6400.0],
427
+ "0.42": [2560.0, 6144.0],
428
+ "0.48": [2816.0, 5888.0],
429
+ "0.5": [2816.0, 5632.0],
430
+ "0.52": [2816.0, 5376.0],
431
+ "0.57": [3072.0, 5376.0],
432
+ "0.6": [3072.0, 5120.0],
433
+ "0.68": [3328.0, 4864.0],
434
+ "0.72": [3328.0, 4608.0],
435
+ "0.78": [3584.0, 4608.0],
436
+ "0.82": [3584.0, 4352.0],
437
+ "0.88": [3840.0, 4352.0],
438
+ "0.94": [3840.0, 4096.0],
439
+ "1.0": [4096.0, 4096.0],
440
+ "1.07": [4096.0, 3840.0],
441
+ "1.13": [4352.0, 3840.0],
442
+ "1.21": [4352.0, 3584.0],
443
+ "1.29": [4608.0, 3584.0],
444
+ "1.38": [4608.0, 3328.0],
445
+ "1.46": [4864.0, 3328.0],
446
+ "1.67": [5120.0, 3072.0],
447
+ "1.75": [5376.0, 3072.0],
448
+ "2.0": [5632.0, 2816.0],
449
+ "2.09": [5888.0, 2816.0],
450
+ "2.4": [6144.0, 2560.0],
451
+ "2.5": [6400.0, 2560.0],
452
+ "3.0": [6912.0, 2304.0],
453
+ "4.0": [8192.0, 2048.0],
454
+ }
455
+
456
+ ASPECT_RATIO_4096_TEST = {
457
+ "0.25": [2048.0, 8192.0],
458
+ "0.26": [2048.0, 7936.0],
459
+ "0.27": [2048.0, 7680.0],
460
+ "0.28": [2048.0, 7424.0],
461
+ "0.32": [2304.0, 7168.0],
462
+ "0.33": [2304.0, 6912.0],
463
+ "0.35": [2304.0, 6656.0],
464
+ "0.4": [2560.0, 6400.0],
465
+ "0.42": [2560.0, 6144.0],
466
+ "0.48": [2816.0, 5888.0],
467
+ "0.5": [2816.0, 5632.0],
468
+ "0.52": [2816.0, 5376.0],
469
+ "0.57": [3072.0, 5376.0],
470
+ "0.6": [3072.0, 5120.0],
471
+ "0.68": [3328.0, 4864.0],
472
+ "0.72": [3328.0, 4608.0],
473
+ "0.78": [3584.0, 4608.0],
474
+ "0.82": [3584.0, 4352.0],
475
+ "0.88": [3840.0, 4352.0],
476
+ "0.94": [3840.0, 4096.0],
477
+ "1.0": [4096.0, 4096.0],
478
+ "1.07": [4096.0, 3840.0],
479
+ "1.13": [4352.0, 3840.0],
480
+ "1.21": [4352.0, 3584.0],
481
+ "1.29": [4608.0, 3584.0],
482
+ "1.38": [4608.0, 3328.0],
483
+ "1.46": [4864.0, 3328.0],
484
+ "1.67": [5120.0, 3072.0],
485
+ "1.75": [5376.0, 3072.0],
486
+ "2.0": [5632.0, 2816.0],
487
+ "2.09": [5888.0, 2816.0],
488
+ "2.4": [6144.0, 2560.0],
489
+ "2.5": [6400.0, 2560.0],
490
+ "2.89": [6656.0, 2304.0],
491
+ "3.0": [6912.0, 2304.0],
492
+ "3.11": [7168.0, 2304.0],
493
+ "3.62": [7424.0, 2048.0],
494
+ "3.75": [7680.0, 2048.0],
495
+ "3.88": [7936.0, 2048.0],
496
+ "4.0": [8192.0, 2048.0],
497
+ }
498
+
499
+ ASPECT_RATIO_1280_TEST = {"1.0": [1280.0, 1280.0]}
500
+ ASPECT_RATIO_1536_TEST = {"1.0": [1536.0, 1536.0]}
501
+ ASPECT_RATIO_768_TEST = {"1.0": [768.0, 768.0]}
502
+
503
+
504
+ def get_chunks(lst, n):
505
+ for i in range(0, len(lst), n):
506
+ yield lst[i : i + n]
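A short usage sketch for get_chunks, which splits a list into fixed-size batches; the prompt list here is illustrative:

def get_chunks(lst, n):  # copied from above so the example is self-contained
    for i in range(0, len(lst), n):
        yield lst[i : i + n]

prompts = ["a red apple", "a wooden chair", "a city at night", "a snowy mountain", "a sailboat"]
print(list(get_chunks(prompts, 2)))
# [['a red apple', 'a wooden chair'], ['a city at night', 'a snowy mountain'], ['a sailboat']]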
diffusion/data/transforms.py ADDED
@@ -0,0 +1,46 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import torchvision.transforms as T
18
+
19
+ TRANSFORMS = dict()
20
+
21
+
22
+ def register_transform(transform):
23
+ name = transform.__name__
24
+ if name in TRANSFORMS:
25
+ raise RuntimeError(f"Transform {name} has already registered.")
26
+ TRANSFORMS.update({name: transform})
27
+
28
+
29
+ def get_transform(type, resolution):
30
+ transform = TRANSFORMS[type](resolution)
31
+ transform = T.Compose(transform)
32
+ transform.image_size = resolution
33
+ return transform
34
+
35
+
36
+ @register_transform
37
+ def default_train(n_px):
38
+ transform = [
39
+ T.Lambda(lambda img: img.convert("RGB")),
40
+ T.Resize(n_px), # Image.BICUBIC
41
+ T.CenterCrop(n_px),
42
+ # T.RandomHorizontalFlip(),
43
+ T.ToTensor(),
44
+ T.Normalize([0.5], [0.5]),
45
+ ]
46
+ return transform
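A short sketch of how the registry above can be used, assuming the package is importable; the flipped variant registered here is hypothetical and not part of this commit:

import torchvision.transforms as T

from diffusion.data.transforms import get_transform, register_transform

@register_transform
def default_train_hflip(n_px):  # hypothetical variant that adds horizontal flips
    return [
        T.Lambda(lambda img: img.convert("RGB")),
        T.Resize(n_px),
        T.CenterCrop(n_px),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize([0.5], [0.5]),
    ]

# get_transform looks the name up in TRANSFORMS and wraps the result in T.Compose.
transform = get_transform("default_train_hflip", 512)
print(transform.image_size)  # 512

Note that register_transform stores the function in TRANSFORMS but returns None, so the decorated module-level name itself becomes None; transforms should always be fetched through get_transform.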
diffusion/data/wids/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # Copyright (c) 2017-2019 NVIDIA CORPORATION. All rights reserved.
2
+ # This file is part of the WebDataset library.
3
+ # See the LICENSE file for licensing terms (BSD-style).
4
+ #
5
+ # flake8: noqa
6
+
7
+ from .wids import (
8
+ ChunkedSampler,
9
+ DistributedChunkedSampler,
10
+ DistributedLocalSampler,
11
+ DistributedRangedSampler,
12
+ ShardedSampler,
13
+ ShardListDataset,
14
+ ShardListDatasetMulti,
15
+ lru_json_load,
16
+ )
diffusion/data/wids/wids.py ADDED
@@ -0,0 +1,1051 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ # This file is modified from https://github.com/NVlabs/VILA/tree/main/llava/wids
18
+ import base64
19
+ import gzip
20
+ import hashlib
21
+ import io
22
+ import json
23
+ import math
24
+ import os
25
+ import os.path as osp
26
+ import random
27
+ import re
28
+ import sqlite3
29
+ import sys
30
+ import tempfile
31
+ import uuid
32
+ import warnings
33
+ from functools import lru_cache, partial
34
+ from typing import Any, BinaryIO, Dict, Optional, TypeVar, Union
35
+ from urllib.parse import quote, urlparse
36
+
37
+ import numpy as np
38
+ import torch
39
+ import torch.distributed as dist
40
+ from torch.utils.data.distributed import DistributedSampler
41
+
42
+ from .wids_dl import download_and_open
43
+ from .wids_lru import LRUCache
44
+ from .wids_mmtar import MMIndexedTar
45
+ from .wids_specs import load_dsdesc_and_resolve, urldir
46
+ from .wids_tar import TarFileReader, find_index_file
47
+
48
+ try:
49
+ from torch.utils.data import Dataset, Sampler
50
+ except ImportError:
51
+
52
+ class Dataset:
53
+ pass
54
+
55
+ class Sampler:
56
+ pass
57
+
58
+
59
+ T = TypeVar("T")
60
+
61
+ T_co = TypeVar("T_co", covariant=True)
62
+
63
+
64
+ def compute_file_md5sum(fname: Union[str, BinaryIO], chunksize: int = 1000000) -> str:
65
+ """Compute the md5sum of a file in chunks.
66
+
67
+ Parameters
68
+ ----------
69
+ fname : Union[str, BinaryIO]
70
+ Filename or file object
71
+ chunksize : int, optional
72
+ Chunk size in bytes, by default 1000000
73
+
74
+ Returns
75
+ -------
76
+ str
77
+ MD5 sum of the file
78
+
79
+ Examples
80
+ --------
81
+ >>> compute_file_md5sum("test.txt")
82
+ 'd41d8cd98f00b204e9800998ecf8427e'
83
+ """
84
+ md5 = hashlib.md5()
85
+ if isinstance(fname, str):
86
+ with open(fname, "rb") as f:
87
+ for chunk in iter(lambda: f.read(chunksize), b""):
88
+ md5.update(chunk)
89
+ else:
90
+ fname.seek(0)
91
+ for chunk in iter(lambda: fname.read(chunksize), b""):
92
+ md5.update(chunk)
93
+ return md5.hexdigest()
94
+
95
+
96
+ def compute_file_md5sum(fname: Union[str, BinaryIO], chunksize: int = 1000000) -> str:
97
+ """Compute the md5sum of a file in chunks."""
98
+ md5 = hashlib.md5()
99
+ if isinstance(fname, str):
100
+ with open(fname, "rb") as f:
101
+ for chunk in iter(lambda: f.read(chunksize), b""):
102
+ md5.update(chunk)
103
+ else:
104
+ fname.seek(0)
105
+ for chunk in iter(lambda: fname.read(chunksize), b""):
106
+ md5.update(chunk)
107
+ return md5.hexdigest()
108
+
109
+
110
+ def compute_num_samples(fname):
111
+ ds = IndexedTarSamples(fname)
112
+ return len(ds)
113
+
114
+
115
+ def splitname(fname):
116
+ """Returns the basename and extension of a filename"""
117
+ assert "." in fname, "Filename must have an extension"
118
+ # basename, extension = re.match(r"^((?:.*/)?.*?)(\..*)$", fname).groups()
119
+ basename, extension = os.path.splitext(fname)
120
+ return basename, extension
121
+
122
+
123
+ # NOTE(ligeng): changed to an ordered mapping for a more flexible dict
124
+ # TODO(ligeng): submit a PR to fix the mapping issue.
125
+ def group_by_key(names):
126
+ """Group the file names by key.
127
+
128
+ Args:
129
+ names: A list of file names.
130
+
131
+ Returns:
132
+ A list of lists of indices, where each sublist contains indices of files
133
+ with the same key.
134
+ """
135
+ groups = []
136
+ kmaps = {}
137
+ for i, fname in enumerate(names):
138
+ # Ignore files without an extension.
139
+ if "." not in fname:
140
+ print(f"Warning: Ignoring file {fname} (no '.')")
141
+ continue
142
+ if fname == ".":
143
+ print(f"Warning: Ignoring the '.' file.")
144
+ continue
145
+ key, ext = splitname(fname)
146
+ if key not in kmaps:
147
+ kmaps[key] = []
148
+ kmaps[key].append(i)
149
+ for k, v in kmaps.items():
150
+ groups.append(v)
151
+ return groups
152
+
153
+
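A tiny worked example of how group_by_key (together with splitname) turns tar member names into per-sample index groups; the file names are illustrative, and the import assumes the module is reachable as diffusion.data.wids.wids:

from diffusion.data.wids.wids import group_by_key

names = ["sample1.jpg", "sample1.json", "sample2.jpg", "sample2.json"]
print(group_by_key(names))  # [[0, 1], [2, 3]] -> indices of the files belonging to each key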
154
+ def default_decoder(sample: Dict[str, Any], format: Optional[Union[bool, str]] = True):
155
+ """A default decoder for webdataset.
156
+
157
+ This handles common file extensions: .txt, .cls, .cls2,
158
+ .jpg, .png, .json, .npy, .mp, .pt, .pth, .pickle, .pkl.
159
+ These are the most common extensions used in webdataset.
160
+ For other extensions, users can provide their own decoder.
161
+
162
+ Args:
163
+ sample: sample, modified in place
164
+ """
165
+ sample = dict(sample)
166
+ for key, stream in sample.items():
167
+ extensions = key.split(".")
168
+ if len(extensions) < 1:
169
+ continue
170
+ extension = extensions[-1]
171
+ if extension in ["gz"]:
172
+ decompressed = gzip.decompress(stream.read())
173
+ stream = io.BytesIO(decompressed)
174
+ if len(extensions) < 2:
175
+ sample[key] = stream
176
+ continue
177
+ extension = extensions[-2]
178
+ if key.startswith("__"):
179
+ continue
180
+ elif extension in ["txt", "text"]:
181
+ value = stream.read()
182
+ sample[key] = value.decode("utf-8")
183
+ elif extension in ["cls", "cls2"]:
184
+ value = stream.read()
185
+ sample[key] = int(value.decode("utf-8"))
186
+ elif extension in ["jpg", "png", "ppm", "pgm", "pbm", "pnm"]:
187
+ if format == "PIL":
188
+ import PIL.Image
189
+
190
+ sample[key] = PIL.Image.open(stream)
191
+ elif format == "numpy":
192
+ import numpy as np
193
+
194
+ sample[key] = np.asarray(PIL.Image.open(stream))
195
+ else:
196
+ raise ValueError(f"Unknown format: {format}")
197
+ elif extension == "json":
198
+ import json
199
+
200
+ value = stream.read()
201
+ sample[key] = json.loads(value)
202
+ elif extension == "npy":
203
+ import numpy as np
204
+
205
+ sample[key] = np.load(stream)
206
+ elif extension == "mp":
207
+ import msgpack
208
+
209
+ value = stream.read()
210
+ sample[key] = msgpack.unpackb(value, raw=False)
211
+ elif extension in ["pt", "pth"]:
212
+ import torch
213
+
214
+ sample[key] = torch.load(stream)
215
+ elif extension in ["pickle", "pkl"]:
216
+ import pickle
217
+
218
+ sample[key] = pickle.load(stream)
219
+ elif extension == "mp4":
220
+ # Write stream to a temporary file
221
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmpfile:
222
+ # tmpfile.write(stream.read())
223
+ # tmpfile_path = tmpfile.name
224
+
225
+ # sample[key] = tmpfile_path
226
+ sample[key] = io.BytesIO(stream.read())
227
+ return sample
228
+
229
+
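A small sketch of what default_decoder does to a raw sample whose values are byte streams; the keys follow the WebDataset convention used throughout this file, and the import path assumes the module is reachable as diffusion.data.wids.wids:

import io
import json

from diffusion.data.wids.wids import default_decoder

raw = {
    "__key__": "sample1",  # keys starting with "__" are passed through untouched
    ".txt": io.BytesIO(b"a photo of a cat"),
    ".json": io.BytesIO(json.dumps({"height": 512, "width": 512}).encode()),
}
decoded = default_decoder(raw, format="PIL")
print(decoded[".txt"])   # 'a photo of a cat'
print(decoded[".json"])  # {'height': 512, 'width': 512}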
230
+ def update_dict_with_extend(original_dict, update_dict):
231
+ for key, value in update_dict.items():
232
+ if key in original_dict and isinstance(original_dict[key], list) and isinstance(value, list):
233
+ original_dict[key].extend(value)
234
+ else:
235
+ original_dict[key] = value
236
+
237
+
238
+ open_itfs = {}
239
+
240
+
241
+ class IndexedTarSamples:
242
+ """A class that accesses samples in a tar file. The tar file must follow
243
+ WebDataset conventions. The tar file is indexed when the IndexedTarSamples
244
+ object is created. The samples are accessed by index using the __getitem__
245
+ method. The __getitem__ method returns a dictionary containing the files
246
+ for the sample. The key for each file is the extension of the file name.
247
+ The key "__key__" is reserved for the key of the sample (the basename of
248
+ each file without the extension). For example, if the tar file contains
249
+ the files "sample1.jpg" and "sample1.txt", then the sample with key
250
+ "sample1" will be returned as the dictionary {"jpg": ..., "txt": ...}.
251
+ """
252
+
253
+ def __init__(
254
+ self,
255
+ *,
256
+ path=None,
257
+ stream=None,
258
+ md5sum=None,
259
+ expected_size=None,
260
+ use_mmap=True,
261
+ index_file=find_index_file,
262
+ ):
263
+ assert path is not None or stream is not None
264
+
265
+ # Create TarFileReader object to read from tar_file
266
+ self.path = path
267
+ stream = self.stream = stream or open(path, "rb")
268
+
269
+ # verify the MD5 sum
270
+ if md5sum is not None:
271
+ stream.seek(0)
272
+ got = compute_file_md5sum(stream)
273
+ assert got == md5sum, f"MD5 sum mismatch: expected {md5sum}, got {got}"
274
+ stream.seek(0)
275
+
276
+ # use either the mmap or the stream based implementation
277
+ # NOTE(ligeng): https://stackoverflow.com/questions/11072705/twitter-trends-api-unicodedecodeerror-utf8-codec-cant-decode-byte-0x8b-in-po
278
+ # import gzip
279
+ # print("convert to gzip IO stream")
280
+ # stream = gzip.GzipFile(fileobj=stream)
281
+
282
+ if use_mmap:
283
+ self.reader = MMIndexedTar(stream)
284
+ else:
285
+ self.reader = TarFileReader(stream, index_file=index_file)
286
+
287
+ # Get list of all files in stream
288
+ all_files = self.reader.names()
289
+
290
+ # Group files by key into samples
291
+ self.samples = group_by_key(all_files)
292
+ # print("DEBUG:", list(all_files)[:20])
293
+ # print("DEBUG:", self.samples[:20])
294
+
295
+ # check that the number of samples is correct
296
+ if expected_size is not None:
297
+ assert len(self) == expected_size, f"Expected {expected_size} samples, got {len(self)}"
298
+
299
+ self.uuid = str(uuid.uuid4())
300
+
301
+ def close(self):
302
+ self.reader.close()
303
+ if not self.stream.closed:
304
+ self.stream.close()
305
+
306
+ def __len__(self):
307
+ return len(self.samples)
308
+
309
+ def __getitem__(self, idx):
310
+ # Get indexes of files for the sample at index idx
311
+ try:
312
+ indexes = self.samples[idx]
313
+ except IndexError as e:
314
+ print(f"[wids-debug] curr idx: {idx}, total sample length: {len(self.samples)} {e}")
315
+ raise e
316
+ sample = {}
317
+ key = None
318
+ for i in indexes:
319
+ # Get filename and data for the file at index i
320
+ fname, data = self.reader.get_file(i)
321
+ # Split filename into key and extension
322
+ k, ext = splitname(fname)
323
+ # Make sure all files in sample have same key
324
+ key = key or k
325
+ assert key == k
326
+ sample[ext] = data
327
+ # Add key to sample
328
+ sample["__key__"] = key
329
+ return sample
330
+
331
+ def __str__(self):
332
+ return f"<IndexedTarSamples-{id(self)} {self.path}>"
333
+
334
+ def __repr__(self):
335
+ return str(self)
336
+
337
+
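A brief usage sketch for IndexedTarSamples; the shard path is hypothetical, and any WebDataset-style tar whose members share basenames (e.g. 000.jpg / 000.json) will do:

from diffusion.data.wids.wids import IndexedTarSamples

its = IndexedTarSamples(path="shards/00000000.tar")  # hypothetical local shard
print(len(its))           # number of grouped samples in the shard
sample = its[0]
print(sample["__key__"])  # basename shared by the files of this sample
print([k for k in sample if not k.startswith("__")])  # e.g. ['.jpg', '.json']; values are raw bytes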
338
+ def hash_localname(dldir="/tmp/_wids_cache"):
339
+ os.makedirs(dldir, exist_ok=True)
340
+
341
+ connection = sqlite3.connect(os.path.join(dldir, "cache.db"))
342
+ cursor = connection.cursor()
343
+ cursor.execute("CREATE TABLE IF NOT EXISTS cache (url TEXT PRIMARY KEY, path TEXT, checksum TEXT)")
344
+ connection.commit()
345
+
346
+ def f(shard):
347
+ """Given a URL, return a local name for the shard."""
348
+ if shard.startswith("pipe:"):
349
+ # uuencode the entire URL string
350
+ hex32 = base64.urlsafe_b64encode(hashlib.sha256(shard.encode()).digest())[:32].decode()
351
+ return os.path.join(dldir, "pipe__" + hex32)
352
+ else:
353
+ # we hash the host and directory components into a 16 character string
354
+ dirname = urldir(shard)
355
+ hex16 = base64.urlsafe_b64encode(hashlib.sha256(dirname.encode()).digest())[:16].decode()
356
+ # the cache name is the concatenation of the hex16 string and the file name component of the URL
357
+ cachename = "data__" + hex16 + "__" + os.path.basename(urlparse(shard).path)
358
+ checksum = None
359
+ cursor.execute(
360
+ "INSERT OR REPLACE INTO cache VALUES (?, ?, ?)",
361
+ (shard, cachename, checksum),
362
+ )
363
+ connection.commit()
364
+ return os.path.join(dldir, cachename)
365
+
366
+ return f
367
+
368
+
369
+ def cache_localname(cachedir):
370
+ os.makedirs(cachedir, exist_ok=True)
371
+
372
+ def f(shard):
373
+ """Given a URL, return a local name for the shard."""
374
+ path = urlparse(shard).path
375
+ fname = os.path.basename(path)
376
+ return os.path.join(cachedir, fname)
377
+
378
+ return f
379
+
380
+
381
+ def default_localname(dldir="/tmp/_wids_cache"):
382
+ os.makedirs(dldir, exist_ok=True)
383
+
384
+ def f(shard):
385
+ """Given a URL, return a local name for the shard."""
386
+ cachename = quote(shard, safe="+-")
387
+ return os.path.join(dldir, cachename)
388
+
389
+ return f
390
+
391
+
392
+ class LRUShards:
393
+ """A class that manages a cache of shards. The cache is a LRU cache that
394
+ stores the local names of the shards as keys and the downloaded paths as
395
+ values. The shards are downloaded to a directory specified by dldir.
396
+ The local name of a shard is computed by the localname function, which
397
+ takes the shard URL as an argument. If keep is True, the downloaded files
398
+ are not deleted when they are no longer needed.
399
+ """
400
+
401
+ def __init__(self, lru_size, keep=False, localname=default_localname()):
402
+ self.localname = localname
403
+ # the cache contains the local name as the key and the downloaded path as the value
404
+ self.lru = LRUCache(lru_size, release_handler=self.release_handler)
405
+ # keep statistics
406
+ self.reset_stats()
407
+
408
+ def reset_stats(self):
409
+ self.accesses = 0
410
+ self.misses = 0
411
+
412
+ def __len__(self):
413
+ return len(self.lru)
414
+
415
+ def release_handler(self, key, value):
416
+ value.close()
417
+
418
+ def clear(self):
419
+ self.lru.clear()
420
+
421
+ def get_shard(self, url):
422
+ assert isinstance(url, str)
423
+ self.accesses += 1
424
+ if url not in self.lru:
425
+ local = self.localname(url)
426
+ with download_and_open(url, local) as stream:
427
+ itf = IndexedTarSamples(path=local, stream=stream)
428
+ self.lru[url] = itf
429
+ self.misses += 1
430
+ self.last_missed = True
431
+ else:
432
+ self.last_missed = False
433
+ return self.lru[url]
434
+
435
+
436
+ def interpret_transformations(transformations):
437
+ """Interpret the transformations argument.
438
+
439
+ This takes care of transformations specified as string shortcuts
440
+ and returns a list of callables.
441
+ """
442
+ if not isinstance(transformations, list):
443
+ transformations = [transformations]
444
+
445
+ result = []
446
+
447
+ for transformation in transformations:
448
+ if transformation == "PIL":
449
+ transformation = partial(default_decoder, format="PIL")
450
+ elif transformation == "numpy":
451
+ transformation = partial(default_decoder, format="numpy")
452
+ else:
453
+ assert callable(transformation)
454
+ result.append(transformation)
455
+
456
+ return result
457
+
458
+
459
+ def hash_dataset_name(input_string):
460
+ """Compute a hash of the input string and return the first 16 characters of the hash."""
461
+ # Compute SHA256 hash of the input string
462
+ hash_object = hashlib.sha256(input_string.encode())
463
+ hash_digest = hash_object.digest()
464
+
465
+ # Encode the hash in base64
466
+ base64_encoded_hash = base64.urlsafe_b64encode(hash_digest)
467
+
468
+ # Return the first 16 characters of the base64-encoded hash
469
+ return base64_encoded_hash[:16].decode("ascii")
470
+
471
+
472
+ @lru_cache(maxsize=16)
473
+ def lru_json_load(fpath):
474
+ with open(fpath) as fp:
475
+ return json.load(fp)
476
+
477
+
478
+ class ShardListDataset(Dataset[T]):
479
+ """An indexable dataset based on a list of shards.
480
+
481
+ The dataset is either given as a list of shards with optional options and name,
482
+ or as a URL pointing to a JSON descriptor file.
483
+
484
+ Datasets can reference other datasets via `source_url`.
485
+
486
+ Shard references within a dataset are resolve relative to an explicitly
487
+ given `base` property, or relative to the URL from which the dataset
488
+ descriptor was loaded.
489
+ """
490
+
491
+ def __init__(
492
+ self,
493
+ shards,
494
+ *,
495
+ cache_size=int(1e12),
496
+ cache_dir=None,
497
+ lru_size=10,
498
+ dataset_name=None,
499
+ localname=None,
500
+ transformations="PIL",
501
+ keep=False,
502
+ base=None,
503
+ options=None,
504
+ ):
505
+ """Create a ShardListDataset.
506
+
507
+ Args:
508
+ shards: a list of (filename, length) pairs or a URL pointing to a JSON descriptor file
509
+ cache_size: the number of shards to keep in the cache
510
+ lru_size: the number of shards to keep in the LRU cache
511
+ localname: a function that maps URLs to local filenames
512
+
513
+ Note that there are two caches: an on-disk directory, and an in-memory LRU cache.
514
+ """
515
+ if options is None:
516
+ options = {}
517
+ super().__init__()
518
+ # shards is a list of (filename, length) pairs. We'll need to
519
+ # keep track of the lengths and cumulative lengths to know how
520
+ # to map indices to shards and indices within shards.
521
+ if isinstance(shards, (str, io.IOBase)):
522
+ if base is None and isinstance(shards, str):
523
+ shards = osp.expanduser(shards)
524
+ base = urldir(shards)
525
+ self.base = base
526
+ self.spec = load_dsdesc_and_resolve(shards, options=options, base=base)
527
+ self.shards = self.spec.get("shardlist", [])
528
+ self.dataset_name = self.spec.get("name") or hash_dataset_name(str(shards))
529
+ else:
530
+ raise NotImplementedError("Only support taking path/url to JSON descriptor file.")
531
+ self.base = None
532
+ self.spec = options
533
+ self.shards = shards
534
+ self.dataset_name = dataset_name or hash_dataset_name(str(shards))
535
+
536
+ self.lengths = [shard["nsamples"] for shard in self.shards]
537
+ self.cum_lengths = np.cumsum(self.lengths)
538
+ self.total_length = self.cum_lengths[-1]
539
+
540
+ if cache_dir is not None:
541
+ # when a cache dir is explicitly given, we download files into
542
+ # that directory without any changes
543
+ self.cache_dir = cache_dir
544
+ self.localname = cache_localname(cache_dir)
545
+ elif localname is not None:
546
+ # when a localname function is given, we use that
547
+ self.cache_dir = None
548
+ self.localname = localname
549
+ else:
550
+ import getpass
551
+
552
+ # when no cache dir or localname are given, use the cache from the environment
553
+ self.cache_dir = os.environ.get("WIDS_CACHE", f"~/.cache/_wids_cache")
554
+ self.cache_dir = osp.expanduser(self.cache_dir)
555
+ self.localname = default_localname(self.cache_dir)
556
+
557
+ self.data_info = (
558
+ f"[WebShardedList] {str(shards)}, base: {self.base,}, name: {self.spec.get('name')}, "
559
+ f"nfiles: {str(len(self.shards))}"
560
+ )
561
+ if True or int(os.environ.get("WIDS_VERBOSE", 0)):
562
+ nbytes = sum(shard.get("filesize", 0) for shard in self.shards)
563
+ nsamples = sum(shard["nsamples"] for shard in self.shards)
564
+ self.data_info += f"nbytes: {str(nbytes)}, samples: {str(nsamples),}, cache: {self.cache_dir} "
565
+ # print(
566
+ # "[WebShardedList]",
567
+ # str(shards),
568
+ # "base:",
569
+ # self.base,
570
+ # "name:",
571
+ # self.spec.get("name"),
572
+ # "nfiles:",
573
+ # len(self.shards),
574
+ # "nbytes:",
575
+ # nbytes,
576
+ # "samples:",
577
+ # nsamples,
578
+ # "cache:",
579
+ # self.cache_dir,
580
+ # file=sys.stderr,
581
+ # )
582
+ self.transformations = interpret_transformations(transformations)
583
+
584
+ if lru_size > 200:
585
+ warnings.warn("LRU size is very large; consider reducing it to avoid running out of file descriptors")
586
+ self.cache = LRUShards(lru_size, localname=self.localname, keep=keep)
587
+
588
+ def add_transform(self, transform):
589
+ """Add a transformation to the dataset."""
590
+ self.transformations.append(transform)
591
+ return self
592
+
593
+ def __len__(self):
594
+ """Return the total number of samples in the dataset."""
595
+ return self.total_length
596
+
597
+ def get_stats(self):
598
+ """Return the number of cache accesses and misses."""
599
+ return self.cache.accesses, self.cache.misses
600
+
601
+ def check_cache_misses(self):
602
+ """Check if the cache miss rate is too high."""
603
+ accesses, misses = self.get_stats()
604
+ if accesses > 100 and misses / accesses > 0.3:
605
+ # output a warning only once
606
+ self.check_cache_misses = lambda: None
607
+ print(f"Warning: ShardListDataset has a cache miss rate of {misses * 100.0 / accesses:.1%}%")
608
+
609
+ def get_shard(self, index):
610
+ """Get the shard and index within the shard corresponding to the given index."""
611
+ # Find the shard corresponding to the given index.
612
+ shard_idx = np.searchsorted(self.cum_lengths, index, side="right")
613
+
614
+ # Figure out which index within the shard corresponds to the
615
+ # given index.
616
+ if shard_idx == 0:
617
+ inner_idx = index
618
+ else:
619
+ inner_idx = index - self.cum_lengths[shard_idx - 1]
620
+
621
+ # Get the shard and return the corresponding element.
622
+ desc = self.shards[shard_idx]
623
+ url = desc["url"]
624
+ if url.startswith(("https://", "http://", "gs://", "/", "~")):
625
+ # absolute path or url path
626
+ url = url
627
+ else:
628
+ # concat relative path
629
+ if self.base is None and "base_path" not in self.spec:
630
+ raise FileNotFoundError("passing a relative path in shardlist but no base found.")
631
+ base_path = self.spec["base_path"] if "base_path" in self.spec else self.base
632
+ url = osp.abspath(osp.join(osp.expanduser(base_path), url))
633
+
634
+ desc["url"] = url
635
+ try:
636
+ shard = self.cache.get_shard(url)
637
+ except UnicodeDecodeError as e:
638
+ print("UnicodeDecodeError:", desc)
639
+ raise e
640
+ return shard, inner_idx, desc
641
+
642
+ def __getitem__(self, index):
643
+ """Return the sample corresponding to the given index."""
644
+ shard, inner_idx, desc = self.get_shard(index)
645
+ sample = shard[inner_idx]
646
+
647
+ # Check if we're missing the cache too often.
648
+ self.check_cache_misses()
649
+
650
+ sample["__dataset__"] = desc.get("dataset")
651
+ sample["__index__"] = index
652
+ sample["__shard__"] = desc["url"]
653
+ sample["__shardindex__"] = inner_idx
654
+
655
+ # Apply transformations
656
+ for transform in self.transformations:
657
+ sample = transform(sample)
658
+
659
+ return sample
660
+
661
+ def close(self):
662
+ """Close the dataset."""
663
+ self.cache.clear()
664
+
665
+
666
+ class ShardListDatasetMulti(ShardListDataset):
667
+ """An indexable dataset based on a list of shards.
668
+
669
+ The dataset is either given as a list of shards with optional options and name,
670
+ or as a URL pointing to a JSON descriptor file.
671
+
672
+ Datasets can reference other datasets via `source_url`.
673
+
674
+ Shard references within a dataset are resolved relative to an explicitly
675
+ given `base` property, or relative to the URL from which the dataset
676
+ descriptor was loaded.
677
+ """
678
+
679
+ def __init__(
680
+ self,
681
+ shards,
682
+ *,
683
+ cache_size=int(1e12),
684
+ cache_dir=None,
685
+ lru_size=10,
686
+ dataset_name=None,
687
+ localname=None,
688
+ transformations="PIL",
689
+ keep=False,
690
+ base=None,
691
+ options=None,
692
+ sort_data_inseq=False,
693
+ num_replicas=None,
694
+ ):
695
+ """Create a ShardListDataset.
696
+
697
+ Args:
698
+ shards: a list of (filename, length) pairs or a URL pointing to a JSON descriptor file
699
+ cache_size: the number of shards to keep in the cache
700
+ lru_size: the number of shards to keep in the LRU cache
701
+ localname: a function that maps URLs to local filenames
702
+
703
+ Note that there are two caches: an on-disk directory, and an in-memory LRU cache.
704
+ """
705
+ if options is None:
706
+ options = {}
707
+ # shards is a list of (filename, length) pairs. We'll need to
708
+ # keep track of the lengths and cumulative lengths to know how
709
+ # to map indices to shards and indices within shards.
710
+ shards_lists = shards if isinstance(shards, list) else [shards]
711
+ bases = base if isinstance(base, list) else [base] * len(shards_lists)
712
+ self.spec = {}
713
+ self.shards = []
714
+ self.num_per_dir = {}
715
+ for base, shards in zip(bases, shards_lists):
716
+ if isinstance(shards, (str, io.IOBase)):
717
+ if base is None and isinstance(shards, str):
718
+ shards = osp.expanduser(shards)
719
+ base = urldir(shards)
720
+ self.base = base
721
+ _spec = load_dsdesc_and_resolve(shards, options=options, base=base)
722
+ update_dict_with_extend(self.spec, _spec)
723
+ self.num_per_dir[os.path.basename(os.path.dirname(shards))] = sum(
724
+ [shard["nsamples"] for shard in _spec.get("shardlist", [])]
725
+ )
726
+ else:
727
+ raise NotImplementedError("Only support taking path/url to JSON descriptor file.")
728
+ self.base = None
729
+ self.spec = options
730
+ self.shards = shards
731
+ self.dataset_name = dataset_name or hash_dataset_name(str(shards))
732
+
733
+ if sort_data_inseq and len(self.spec.get("shardlist", [])) > 0:
734
+ num_replicas = num_replicas or dist.get_world_size()
735
+ self.spec["shardlist"] = split_and_recombine(self.spec["shardlist"], num_replicas)
736
+
737
+ self.shards.extend(self.spec.get("shardlist", []))
738
+ self.dataset_name = self.spec.get("name") or hash_dataset_name(str(shards))
739
+
740
+ self.lengths = [shard["nsamples"] for shard in self.shards]
741
+ self.cum_lengths = np.cumsum(self.lengths)
742
+ self.total_length = self.cum_lengths[-1]
743
+
744
+ if cache_dir is not None:
745
+ # when a cache dir is explicitly given, we download files into
746
+ # that directory without any changes
747
+ self.cache_dir = cache_dir
748
+ self.localname = cache_localname(cache_dir)
749
+ elif localname is not None:
750
+ # when a localname function is given, we use that
751
+ self.cache_dir = None
752
+ self.localname = localname
753
+ else:
754
+ import getpass
755
+
756
+ # when no cache dir or localname are given, use the cache from the environment
757
+ self.cache_dir = os.environ.get("WIDS_CACHE", f"~/.cache/_wids_cache")
758
+ self.cache_dir = osp.expanduser(self.cache_dir)
759
+ self.localname = default_localname(self.cache_dir)
760
+
761
+ self.data_info = (
762
+ f"[WebShardedList] {str(shards)}, base: {self.base,}, name: {self.spec.get('name')}, "
763
+ f"nfiles: {str(len(self.shards))}"
764
+ )
765
+ if True or int(os.environ.get("WIDS_VERBOSE", 0)):
766
+ nbytes = sum(shard.get("filesize", 0) for shard in self.shards)
767
+ nsamples = sum(shard["nsamples"] for shard in self.shards)
768
+ self.data_info += f"nbytes: {str(nbytes)}, samples: {str(nsamples),}, cache: {self.cache_dir} "
769
+ self.transformations = interpret_transformations(transformations)
770
+
771
+ if lru_size > 200:
772
+ warnings.warn("LRU size is very large; consider reducing it to avoid running out of file descriptors")
773
+ self.cache = LRUShards(lru_size, localname=self.localname, keep=keep)
774
+
775
+
776
+ def split_and_recombine(lst, n):
777
+ from collections import OrderedDict
778
+
779
+ def extract_prefix(i):
780
+ return i["url"].split("/")[-2]
781
+
782
+ unique_parts = list(OrderedDict((extract_prefix(item), None) for item in lst).keys())
783
+ split_dict = {part: [] for part in unique_parts}
784
+
785
+ for part in unique_parts:
786
+ part_list = [item for item in lst if extract_prefix(item) == part]
787
+ chunk_size = max(1, len(part_list) // n)  # ensure chunk_size is at least 1
788
+ chunks = [part_list[i * chunk_size : (i + 1) * chunk_size] for i in range(n)]
789
+
790
+ # if the split is uneven, append the remaining elements to the last chunk
791
+ if len(part_list) % n != 0:
792
+ chunks[-1].extend(part_list[n * chunk_size :])
793
+
794
+ split_dict[part] = chunks
795
+
796
+ recombined_list = []
797
+ for i in range(n):
798
+ for part in unique_parts:
799
+ recombined_list.extend(split_dict[part][i])
800
+
801
+ return recombined_list
802
+
803
+
804
+ def lengths_to_ranges(lengths):
805
+ """Convert a list of lengths to a list of ranges."""
806
+ ranges = []
807
+ start = 0
808
+ for length in lengths:
809
+ ranges.append((start, start + length))
810
+ start += length
811
+ return ranges
812
+
813
+
814
+ def intersect_range(a, b):
815
+ """Return the intersection of the two half-open integer intervals."""
816
+ result = max(a[0], b[0]), min(a[1], b[1])
817
+ if result[0] >= result[1]:
818
+ return None
819
+ return result
820
+
821
+
822
+ def intersect_ranges(rangelist, r):
823
+ """Return the intersection of the half-open integer interval r with the list of half-open integer intervals."""
824
+ result = []
825
+ for a in rangelist:
826
+ x = intersect_range(a, r)
827
+ if x is not None:
828
+ result.append(x)
829
+ return result
830
+
831
+
832
+ def iterate_ranges(ranges, rng, indexshuffle=True, shardshuffle=True):
833
+ """Iterate over the ranges in a random order."""
834
+ shard_indexes = list(range(len(ranges)))
835
+ if shardshuffle:
836
+ rng.shuffle(shard_indexes)
837
+ for i in shard_indexes:
838
+ lo, hi = ranges[i]
839
+ sample_indexes = list(range(lo, hi))
840
+ if indexshuffle:
841
+ rng.shuffle(sample_indexes)
842
+ yield from sample_indexes
843
+
844
+
845
+ class ShardListSampler(Sampler):
846
+ """A sampler that samples consistent with a ShardListDataset.
847
+
848
+ This sampler is used to sample from a ShardListDataset in a way that
849
+ preserves locality.
850
+
851
+ This returns a permutation of the indexes by shard, then a permutation of
852
+ indexes within each shard. This ensures that the data is accessed in a
853
+ way that preserves locality.
854
+
855
+ Note that how this ends up splitting data between multiple workers depends
856
+ on the details of the DataLoader. Generally, it will likely load samples from the
857
+ same shard in each worker.
858
+
859
+ Other more sophisticated shard-aware samplers are possible and will likely
860
+ be added.
861
+ """
862
+
863
+ def __init__(self, dataset, *, lengths=None, seed=0, shufflefirst=False):
864
+ if lengths is None:
865
+ lengths = list(dataset.lengths)
866
+ self.ranges = lengths_to_ranges(lengths)
867
+ self.seed = seed
868
+ self.shufflefirst = shufflefirst
869
+ self.epoch = 0
870
+
871
+ def __iter__(self):
872
+ self.rng = random.Random(self.seed + 1289738273 * self.epoch)
873
+ shardshuffle = self.shufflefirst or self.epoch > 0
874
+ yield from iterate_ranges(self.ranges, self.rng, shardshuffle=shardshuffle)
875
+ self.epoch += 1
876
+
877
+
878
+ ShardedSampler = ShardListSampler
879
+
880
+
881
+ class ChunkedSampler(Sampler):
882
+ """A sampler that samples in chunks and then shuffles the samples within each chunk.
883
+
884
+ This preserves locality of reference while still shuffling the data.
885
+ """
886
+
887
+ def __init__(
888
+ self,
889
+ dataset,
890
+ *,
891
+ num_samples=None,
892
+ chunksize=2000,
893
+ seed=0,
894
+ shuffle=False,
895
+ shufflefirst=False,
896
+ ):
897
+ if isinstance(num_samples, int):
898
+ lo, hi = 0, num_samples
899
+ elif num_samples is None:
900
+ lo, hi = 0, len(dataset)
901
+ else:
902
+ lo, hi = num_samples
903
+ self.ranges = [(i, min(i + chunksize, hi)) for i in range(lo, hi, chunksize)]
904
+ self.seed = seed
905
+ self.shuffle = shuffle
906
+ self.shufflefirst = shufflefirst
907
+ self.epoch = 0
908
+
909
+ def set_epoch(self, epoch):
910
+ self.epoch = epoch
911
+
912
+ def __iter__(self):
913
+ self.rng = random.Random(self.seed + 1289738273 * self.epoch)
914
+ shardshuffle = self.shufflefirst or self.epoch > 0
915
+ yield from iterate_ranges(
916
+ self.ranges,
917
+ self.rng,
918
+ indexshuffle=self.shuffle,
919
+ shardshuffle=(self.shuffle and shardshuffle),
920
+ )
921
+ self.epoch += 1
922
+
923
+ def __len__(self):
924
+ return len(self.ranges)
925
+
926
+
927
+ def DistributedChunkedSampler(
928
+ dataset: Dataset,
929
+ *,
930
+ num_replicas: Optional[int] = None,
931
+ num_samples: Optional[int] = None,
932
+ rank: Optional[int] = None,
933
+ shuffle: bool = True,
934
+ shufflefirst: bool = False,
935
+ seed: int = 0,
936
+ drop_last: bool = None,
937
+ chunksize: int = 1000000,
938
+ ) -> ChunkedSampler:
939
+ """Return a ChunkedSampler for the current worker in distributed training.
940
+
941
+ Reverts to a simple ChunkedSampler if not running in distributed mode.
942
+
943
+ Since the split among workers takes place before the chunk shuffle,
944
+ workers end up with a fixed set of shards they need to download. The
945
+ more workers, the fewer shards are used by each worker.
946
+ """
947
+ if drop_last is not None:
948
+ warnings.warn("DistributedChunkedSampler does not support drop_last, thus it will be ignored")
949
+ if not dist.is_initialized():
950
+ warnings.warn("DistributedChunkedSampler is called without distributed initialized; assuming single process")
951
+ num_replicas = 1
952
+ rank = 0
953
+ else:
954
+ num_replicas = num_replicas or dist.get_world_size()
955
+ rank = rank or dist.get_rank()
956
+ assert rank >= 0 and rank < num_replicas
957
+
958
+ num_samples = num_samples or len(dataset)
959
+ worker_chunk = (num_samples + num_replicas - 1) // num_replicas
960
+ worker_start = rank * worker_chunk
961
+ worker_end = min(worker_start + worker_chunk, num_samples)
962
+ return ChunkedSampler(
963
+ dataset,
964
+ num_samples=(worker_start, worker_end),
965
+ chunksize=chunksize,
966
+ seed=seed,
967
+ shuffle=shuffle,
968
+ shufflefirst=shufflefirst,
969
+ )
970
+
971
+
972
+ class DistributedRangedSampler(Sampler):
973
+ """A sampler that samples in chunks and then shuffles the samples within each chunk.
974
+
975
+ This preserves locality of reference while still shuffling the data.
976
+ """
977
+
978
+ def __init__(
979
+ self,
980
+ dataset: Dataset,
981
+ num_replicas: Optional[int] = None,
982
+ num_samples: Optional[int] = None,
983
+ rank: Optional[int] = None,
984
+ drop_last: bool = None,
985
+ ):
986
+ if drop_last is not None:
987
+ warnings.warn("DistributedChunkedSampler does not support drop_last, thus it will be ignored")
988
+ if not dist.is_initialized():
989
+ warnings.warn(
990
+ "DistributedChunkedSampler is called without distributed initialized; assuming single process"
991
+ )
992
+ num_replicas = 1
993
+ rank = 0
994
+ else:
995
+ num_replicas = num_replicas or dist.get_world_size()
996
+ rank = rank or dist.get_rank()
997
+ assert rank >= 0 and rank < num_replicas
998
+ num_samples = num_samples or len(dataset)
999
+ self.worker_chunk = num_samples // num_replicas
1000
+ self.worker_start = rank * self.worker_chunk
1001
+ self.worker_end = min((rank + 1) * self.worker_chunk, num_samples)
1002
+ self.ranges = range(self.worker_start, self.worker_end)
1003
+ self.epoch = 0
1004
+ self.step_start = 0
1005
+
1006
+ def set_epoch(self, epoch):
1007
+ self.epoch = epoch
1008
+
1009
+ def __len__(self):
1010
+ return len(self.ranges)
1011
+
1012
+ def set_start(self, start):
1013
+ self.step_start = start
1014
+
1015
+ def __iter__(self):
1016
+ yield from self.ranges[self.step_start :]
1017
+ self.epoch += 1
1018
+
1019
+
1020
+ class DistributedLocalSampler(DistributedSampler):
1021
+ def __iter__(self):
1022
+ if self.shuffle:
1023
+ # deterministically shuffle based on epoch and seed
1024
+ g = torch.Generator()
1025
+ g.manual_seed(self.seed + self.epoch)
1026
+ indices = torch.randperm(len(self.dataset), generator=g).tolist() # type: ignore[arg-type]
1027
+ else:
1028
+ indices = list(range(len(self.dataset))) # type: ignore[arg-type]
1029
+
1030
+ if not self.drop_last:
1031
+ # add extra samples to make it evenly divisible
1032
+ padding_size = self.total_size - len(indices)
1033
+ if padding_size <= len(indices):
1034
+ indices += indices[:padding_size]
1035
+ else:
1036
+ indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size]
1037
+ else:
1038
+ # remove tail of data to make it evenly divisible.
1039
+ indices = indices[: self.total_size]
1040
+ assert len(indices) == self.total_size
1041
+
1042
+ # subsample
1043
+ # indices = indices[self.rank:self.total_size:self.num_replicas]
1044
+ chunk_size = self.total_size // self.num_replicas
1045
+ begin_idx = chunk_size * self.rank
1046
+ stop_idx = chunk_size * (self.rank + 1)
1047
+ indices = indices[begin_idx:stop_idx]
1048
+
1049
+ # print("[SamplerIndices: ]", indices)
1050
+ assert len(indices) == self.num_samples
1051
+ return iter(indices)
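The classes above form the indexable sharded-dataset API added by this commit. Below is a minimal usage sketch; the descriptor path, cache directory, and the `diffusion.data.wids` import path are assumptions for illustration, not code from this commit.

```python
# Hedged sketch: the JSON path, cache dir, and import location are illustrative.
from diffusion.data.wids import ShardListDataset, ChunkedSampler  # assumed re-exports

dataset = ShardListDataset(
    "data/dataset.json",          # wids descriptor: {"wids_version": 1, "shardlist": [...]}
    cache_dir="/tmp/wids_cache",  # downloaded shards are kept in this directory
    lru_size=8,                   # number of shards simultaneously kept open in memory
)

# ChunkedSampler shuffles within fixed-size chunks, so consecutive samples tend to
# come from the same shard and both the on-disk cache and the LRU cache stay warm.
sampler = ChunkedSampler(dataset, chunksize=2000, shuffle=True)
for idx in sampler:
    sample = dataset[idx]
    # `sample` is a dict of decoded fields plus bookkeeping keys such as
    # "__shard__", "__shardindex__", and "__index__".
    break
```

In distributed training, `DistributedChunkedSampler` plays the same role but first splits the index range across ranks, so each worker only ever touches a fixed subset of shards.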
diffusion/data/wids/wids_dl.py ADDED
@@ -0,0 +1,174 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ # This file is copied from https://github.com/NVlabs/VILA/tree/main/llava/wids
18
+ import fcntl
19
+ import os
20
+ import shutil
21
+ import sys
22
+ import time
23
+ from collections import deque
24
+ from datetime import datetime
25
+ from urllib.parse import urlparse
26
+
27
+ recent_downloads = deque(maxlen=1000)
28
+
29
+ open_objects = {}
30
+ max_open_objects = 100
31
+
32
+
33
+ class ULockFile:
34
+ """A simple locking class. We don't need any of the third
35
+ party libraries since we rely on POSIX semantics for linking
36
+ below anyway."""
37
+
38
+ def __init__(self, path):
39
+ self.lockfile_path = path
40
+ self.lockfile = None
41
+
42
+ def __enter__(self):
43
+ self.lockfile = open(self.lockfile_path, "w")
44
+ fcntl.flock(self.lockfile.fileno(), fcntl.LOCK_EX)
45
+ return self
46
+
47
+ def __exit__(self, exc_type, exc_val, exc_tb):
48
+ fcntl.flock(self.lockfile.fileno(), fcntl.LOCK_UN)
49
+ self.lockfile.close()
50
+ self.lockfile = None
51
+ try:
52
+ os.unlink(self.lockfile_path)
53
+ except FileNotFoundError:
54
+ pass
55
+
56
+
57
+ def pipe_download(remote, local):
58
+ """Perform a download for a pipe: url."""
59
+ assert remote.startswith("pipe:")
60
+ cmd = remote[5:]
61
+ cmd = cmd.format(local=local)
62
+ assert os.system(cmd) == 0, "Command failed: %s" % cmd
63
+
64
+
65
+ def copy_file(remote, local):
66
+ remote = urlparse(remote)
67
+ assert remote.scheme in ["file", ""]
68
+ # use absolute path
69
+ remote = os.path.abspath(remote.path)
70
+ local = urlparse(local)
71
+ assert local.scheme in ["file", ""]
72
+ local = os.path.abspath(local.path)
73
+ if remote == local:
74
+ return
75
+ # check if the local file exists
76
+ shutil.copyfile(remote, local)
77
+
78
+
79
+ verbose_cmd = int(os.environ.get("WIDS_VERBOSE_CMD", "0"))
80
+
81
+
82
+ def vcmd(flag, verbose_flag=""):
83
+ return verbose_flag if verbose_cmd else flag
84
+
85
+
86
+ default_cmds = {
87
+ "posixpath": copy_file,
88
+ "file": copy_file,
89
+ "pipe": pipe_download,
90
+ "http": "curl " + vcmd("-s") + " -L {url} -o {local}",
91
+ "https": "curl " + vcmd("-s") + " -L {url} -o {local}",
92
+ "ftp": "curl " + vcmd("-s") + " -L {url} -o {local}",
93
+ "ftps": "curl " + vcmd("-s") + " -L {url} -o {local}",
94
+ "gs": "gsutil " + vcmd("-q") + " cp {url} {local}",
95
+ "s3": "aws s3 cp {url} {local}",
96
+ }
97
+
98
+
99
+ # TODO(ligeng): change HTTPS download to python requests library
100
+
101
+
102
+ def download_file_no_log(remote, local, handlers=default_cmds):
103
+ """Download a file from a remote url to a local path.
104
+ The remote url can be a pipe: url, in which case the remainder of
105
+ the url is treated as a command template that is executed to perform the download.
106
+ """
107
+
108
+ if remote.startswith("pipe:"):
109
+ schema = "pipe"
110
+ else:
111
+ schema = urlparse(remote).scheme
112
+ if schema is None or schema == "":
113
+ schema = "posixpath"
114
+ # get the handler
115
+ handler = handlers.get(schema)
116
+ if handler is None:
117
+ raise ValueError("Unknown schema: %s" % schema)
118
+ # call the handler
119
+ if callable(handler):
120
+ handler(remote, local)
121
+ else:
122
+ assert isinstance(handler, str)
123
+ cmd = handler.format(url=remote, local=local)
124
+ assert os.system(cmd) == 0, "Command failed: %s" % cmd
125
+ return local
126
+
127
+
128
+ def download_file(remote, local, handlers=default_cmds, verbose=False):
129
+ start = time.time()
130
+ try:
131
+ return download_file_no_log(remote, local, handlers=handlers)
132
+ finally:
133
+ recent_downloads.append((remote, local, time.time(), time.time() - start))
134
+ if verbose:
135
+ print(
136
+ "downloaded",
137
+ remote,
138
+ "to",
139
+ local,
140
+ "in",
141
+ time.time() - start,
142
+ "seconds",
143
+ file=sys.stderr,
144
+ )
145
+
146
+
147
+ def download_and_open(remote, local, mode="rb", handlers=default_cmds, verbose=False):
148
+ with ULockFile(local + ".lock"):
149
+ if os.path.exists(remote):
150
+ # print("enter1", remote, local, mode)
151
+ result = open(remote, mode)
152
+ else:
153
+ # print("enter2", remote, local, mode)
154
+ if not os.path.exists(local):
155
+ if verbose:
156
+ print("downloading", remote, "to", local, file=sys.stderr)
157
+ download_file(remote, local, handlers=handlers)
158
+ else:
159
+ if verbose:
160
+ print("using cached", local, file=sys.stderr)
161
+ result = open(local, mode)
162
+
163
+ # input()
164
+
165
+ if open_objects is not None:
166
+ for k, v in list(open_objects.items()):
167
+ if v.closed:
168
+ del open_objects[k]
169
+ if len(open_objects) > max_open_objects:
170
+ raise RuntimeError("Too many open objects")
171
+ current_time = datetime.now().strftime("%Y%m%d%H%M%S")
172
+ key = tuple(str(x) for x in [remote, local, mode, current_time])
173
+ open_objects[key] = result
174
+ return result
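A small sketch of how the download helpers in this file fit together; the URL and local path are placeholders, not paths used by the repository.

```python
# Hedged sketch: URL and local path are placeholders.
url = "https://example.com/shards/shard-000000.tar"
local = "/tmp/wids_cache/shard-000000.tar"

# download_file dispatches on the URL scheme (curl for http/https/ftp, gsutil for gs://,
# a plain copy for local files, or a shell template for pipe: URLs).
download_file(url, local, verbose=True)

# download_and_open additionally serializes concurrent downloads with a .lock file,
# reuses an existing local copy when present, and returns an open file object.
with download_and_open(url, local, mode="rb") as f:
    first_block = f.read(512)
```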
diffusion/data/wids/wids_lru.py ADDED
@@ -0,0 +1,81 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ # This file is copied from https://github.com/NVlabs/VILA/tree/main/llava/wids
18
+ from collections import OrderedDict
19
+
20
+
21
+ class LRUCache:
22
+ def __init__(self, capacity: int, release_handler=None):
23
+ """Initialize a new LRU cache with the given capacity."""
24
+ self.capacity = capacity
25
+ self.cache = OrderedDict()
26
+ self.release_handler = release_handler
27
+
28
+ def __getitem__(self, key):
29
+ """Return the value associated with the given key, or None."""
30
+ if key not in self.cache:
31
+ return None
32
+ self.cache.move_to_end(key)
33
+ return self.cache[key]
34
+
35
+ def __setitem__(self, key, value):
36
+ """Associate the given value with the given key."""
37
+ if key in self.cache:
38
+ self.cache.move_to_end(key)
39
+ self.cache[key] = value
40
+ if len(self.cache) > self.capacity:
41
+ key, value = self.cache.popitem(last=False)
42
+ if self.release_handler is not None:
43
+ self.release_handler(key, value)
44
+
45
+ def __delitem__(self, key):
46
+ """Remove the given key from the cache."""
47
+ if key in self.cache:
48
+ if self.release_handler is not None:
49
+ value = self.cache[key]
50
+ self.release_handler(key, value)
51
+ del self.cache[key]
52
+
53
+ def __len__(self):
54
+ """Return the number of entries in the cache."""
55
+ return len(self.cache)
56
+
57
+ def __contains__(self, key):
58
+ """Return whether the cache contains the given key."""
59
+ return key in self.cache
60
+
61
+ def items(self):
62
+ """Return an iterator over the keys of the cache."""
63
+ return self.cache.items()
64
+
65
+ def keys(self):
66
+ """Return an iterator over the keys of the cache."""
67
+ return self.cache.keys()
68
+
69
+ def values(self):
70
+ """Return an iterator over the values of the cache."""
71
+ return self.cache.values()
72
+
73
+ def clear(self):
74
+ for key in list(self.keys()):
75
+ value = self.cache[key]
76
+ if self.release_handler is not None:
77
+ self.release_handler(key, value)
78
+ del self[key]
79
+
80
+ def __del__(self):
81
+ self.clear()
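A minimal sketch of the LRUCache behavior above, including the eviction callback; the keys and values are illustrative.

```python
# Hedged sketch of LRUCache eviction; keys and values are illustrative.
def on_evict(key, value):
    print("releasing", key)

cache = LRUCache(2, release_handler=on_evict)
cache["a"] = 1
cache["b"] = 2
cache["a"]        # touching "a" marks it most recently used
cache["c"] = 3    # capacity exceeded: "b" is evicted and on_evict("b", 2) runs
assert "a" in cache and "b" not in cache
```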
diffusion/data/wids/wids_mmtar.py ADDED
@@ -0,0 +1,168 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ # This file is copied from https://github.com/NVlabs/VILA/tree/main/llava/wids
18
+ import collections
19
+ import fcntl
20
+ import io
21
+ import mmap
22
+ import os
23
+ import struct
24
+
25
+ TarHeader = collections.namedtuple(
26
+ "TarHeader",
27
+ [
28
+ "name",
29
+ "mode",
30
+ "uid",
31
+ "gid",
32
+ "size",
33
+ "mtime",
34
+ "chksum",
35
+ "typeflag",
36
+ "linkname",
37
+ "magic",
38
+ "version",
39
+ "uname",
40
+ "gname",
41
+ "devmajor",
42
+ "devminor",
43
+ "prefix",
44
+ ],
45
+ )
46
+
47
+
48
+ def parse_tar_header(header_bytes):
49
+ header = struct.unpack("!100s8s8s8s12s12s8s1s100s6s2s32s32s8s8s155s", header_bytes)
50
+ return TarHeader(*header)
51
+
52
+
53
+ def next_header(offset, header):
54
+ block_size = 512
55
+ size = header.size.decode("utf-8").strip("\x00")
56
+ if size == "":
57
+ return -1
58
+ size = int(size, 8)
59
+ # compute the file size rounded up to the next block size if it is a partial block
60
+ padded_file_size = (size + block_size - 1) // block_size * block_size
61
+ return offset + block_size + padded_file_size
62
+
63
+
64
+ # TODO(ligeng): support gzip stream
65
+ class MMIndexedTar:
66
+ def __init__(self, fname, index_file=None, verbose=True, cleanup_callback=None):
67
+ self.verbose = verbose
68
+ self.cleanup_callback = cleanup_callback
69
+ if isinstance(fname, str):
70
+ self.stream = open(fname, "rb")
71
+ self.fname = fname
72
+ elif isinstance(fname, io.IOBase):
73
+ self.stream = fname
74
+ self.fname = None
75
+ self.mmapped_file = mmap.mmap(self.stream.fileno(), 0, access=mmap.ACCESS_READ)
76
+ if cleanup_callback:
77
+ cleanup_callback(fname, self.stream.fileno(), "start")
78
+ self._build_index()
79
+
80
+ def close(self, dispose=False):
81
+ if self.cleanup_callback:
82
+ self.cleanup_callback(self.fname, self.stream.fileno(), "end")
83
+ self.mmapped_file.close()
84
+ self.stream.close()
85
+
86
+ def _build_index(self):
87
+ self.by_name = {}
88
+ self.by_index = []
89
+ offset = 0
90
+ while offset >= 0 and offset < len(self.mmapped_file):
91
+ header = parse_tar_header(self.mmapped_file[offset : offset + 500])
92
+ name = header.name.decode("utf-8").strip("\x00")
93
+ typeflag = header.typeflag.decode("utf-8").strip("\x00")
94
+ if name != "" and name != "././@PaxHeader" and typeflag in ["0", ""]:
95
+ try:
96
+ size = int(header.size.decode("utf-8")[:-1], 8)
97
+ except ValueError as exn:
98
+ print(header)
99
+ raise exn
100
+ self.by_name[name] = offset
101
+ self.by_index.append((name, offset, size))
102
+ offset = next_header(offset, header)
103
+
104
+ def names(self):
105
+ return self.by_name.keys()
106
+
107
+ def get_at_offset(self, offset):
108
+ header = parse_tar_header(self.mmapped_file[offset : offset + 500])
109
+ name = header.name.decode("utf-8").strip("\x00")
110
+ start = offset + 512
111
+ end = start + int(header.size.decode("utf-8")[:-1], 8)
112
+ return name, self.mmapped_file[start:end]
113
+
114
+ def get_at_index(self, index):
115
+ name, offset, size = self.by_index[index]
116
+ return self.get_at_offset(offset)
117
+
118
+ def get_by_name(self, name):
119
+ offset = self.by_name[name]
120
+ return self.get_at_offset(offset)
121
+
122
+ def __iter__(self):
123
+ for name, offset, size in self.by_index:
124
+ yield name, self.mmapped_file[offset + 512 : offset + 512 + size]
125
+
126
+ def __getitem__(self, key):
127
+ if isinstance(key, int):
128
+ return self.get_at_index(key)
129
+ else:
130
+ return self.get_by_name(key)
131
+
132
+ def __len__(self):
133
+ return len(self.by_index)
134
+
135
+ def get_file(self, i):
136
+ fname, data = self.get_at_index(i)
137
+ return fname, io.BytesIO(data)
138
+
139
+
140
+ def keep_while_reading(fname, fd, phase, delay=0.0):
141
+ """This is a possible cleanup callback for cleanup_callback of MIndexedTar.
142
+
143
+ It assumes that as long as there are some readers for a file,
144
+ more readers may be trying to open it.
145
+
146
+ Note that on Linux, unlinking the file doesn't matter after
147
+ it has been mmapped. The contents will only be deleted when
148
+ all readers close the file. The unlinking merely makes the file
149
+ unavailable to new readers, since the downloader checks first
150
+ whether the file exists.
151
+ """
152
+ assert delay == 0.0, "delay not implemented"
153
+ if fd < 0 or fname is None:
154
+ return
155
+ if phase == "start":
156
+ fcntl.flock(fd, fcntl.LOCK_SH)
157
+ elif phase == "end":
158
+ try:
159
+ fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
160
+ os.unlink(fname)
161
+ except FileNotFoundError:
162
+ # someone else deleted it already
163
+ pass
164
+ except BlockingIOError:
165
+ # we couldn't get an exclusive lock, so someone else is still reading
166
+ pass
167
+ else:
168
+ raise ValueError(f"Unknown phase {phase}")
diffusion/data/wids/wids_specs.py ADDED
@@ -0,0 +1,192 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ # This file is copied from https://github.com/NVlabs/VILA/tree/main/llava/wids
18
+ import io
19
+ import json
20
+ import os
21
+ import tempfile
22
+ from urllib.parse import urlparse, urlunparse
23
+
24
+ from .wids_dl import download_and_open
25
+
26
+
27
+ def urldir(url):
28
+ """Return the directory part of a url."""
29
+ parsed_url = urlparse(url)
30
+ path = parsed_url.path
31
+ directory = os.path.dirname(path)
32
+ return parsed_url._replace(path=directory).geturl()
33
+
34
+
35
+ def urlmerge(base, url):
36
+ """Merge a base URL and a relative URL.
37
+
38
+ The function fills in any missing part of the url from the base,
39
+ except for params, query, and fragment, which are taken only from the 'url'.
40
+ For the pathname component, it merges the paths like os.path.join:
41
+ an absolute path in 'url' overrides the base path, otherwise the paths are merged.
42
+
43
+ Parameters:
44
+ base (str): The base URL.
45
+ url (str): The URL to merge with the base.
46
+
47
+ Returns:
48
+ str: The merged URL.
49
+ """
50
+ # Parse the base and the relative URL
51
+ parsed_base = urlparse(base)
52
+ parsed_url = urlparse(url)
53
+
54
+ # Merge paths using os.path.join
55
+ # If the url path is absolute, it overrides the base path
56
+ if parsed_url.path.startswith("/"):
57
+ merged_path = parsed_url.path
58
+ else:
59
+ merged_path = os.path.normpath(os.path.join(parsed_base.path, parsed_url.path))
60
+
61
+ # Construct the merged URL
62
+ merged_url = urlunparse(
63
+ (
64
+ parsed_url.scheme or parsed_base.scheme,
65
+ parsed_url.netloc or parsed_base.netloc,
66
+ merged_path,
67
+ parsed_url.params, # Use params from the url only
68
+ parsed_url.query, # Use query from the url only
69
+ parsed_url.fragment, # Use fragment from the url only
70
+ )
71
+ )
72
+
73
+ return merged_url
74
+
75
+
76
+ def check_shards(l):
77
+ """Check that a list of shards is well-formed.
78
+
79
+ This checks that the list is a list of dictionaries, and that
80
+ each dictionary has a "url" and a "nsamples" key.
81
+ """
82
+ assert isinstance(l, list)
83
+ for shard in l:
84
+ assert isinstance(shard, dict)
85
+ assert "url" in shard
86
+ assert "nsamples" in shard
87
+ return l
88
+
89
+
90
+ def set_all(l, k, v):
91
+ """Set a key to a value in a list of dictionaries."""
92
+ if v is None:
93
+ return
94
+ for x in l:
95
+ if k not in x:
96
+ x[k] = v
97
+
98
+
99
+ def load_remote_dsdesc_raw(source):
100
+ """Load a remote or local dataset description in JSON format."""
101
+ if isinstance(source, str):
102
+ with tempfile.TemporaryDirectory() as tmpdir:
103
+ dlname = os.path.join(tmpdir, "dataset.json")
104
+ with download_and_open(source, dlname) as f:
105
+ dsdesc = json.load(f)
106
+ elif isinstance(source, io.IOBase):
107
+ dsdesc = json.load(source)
108
+ else:
109
+ # FIXME: use gopen
110
+ import requests
111
+
112
+ jsondata = requests.get(source).text
113
+ dsdesc = json.loads(jsondata)
114
+ return dsdesc
115
+
116
+
117
+ def rebase_shardlist(shardlist, base):
118
+ """Rebase the URLs in a shardlist."""
119
+ if base is None:
120
+ return shardlist
121
+ for shard in shardlist:
122
+ shard["url"] = urlmerge(base, shard["url"])
123
+ return shardlist
124
+
125
+
126
+ def resolve_dsdesc(dsdesc, *, options=None, base=None):
127
+ """Resolve a dataset description.
128
+
129
+ This rebases the shards as necessary and loads any remote references.
130
+
131
+ Dataset descriptions are JSON files. They must have the following format;
132
+
133
+ {
134
+ "wids_version": 1,
135
+ # optional immediate shardlist
136
+ "shardlist": [
137
+ {"url": "http://example.com/file.tar", "nsamples": 1000},
138
+ ...
139
+ ],
140
+ # sub-datasets
141
+ "datasets": [
142
+ {"source_url": "http://example.com/dataset.json"},
143
+ {"shardlist": [
144
+ {"url": "http://example.com/file.tar", "nsamples": 1000},
145
+ ...
146
+ ]}
147
+ ...
148
+ ]
149
+ }
150
+ """
151
+ if options is None:
152
+ options = {}
153
+ assert isinstance(dsdesc, dict)
154
+ dsdesc = dict(dsdesc, **options)
155
+ shardlist = rebase_shardlist(dsdesc.get("shardlist", []), base)
156
+ assert shardlist is not None
157
+ set_all(shardlist, "weight", dsdesc.get("weight"))
158
+ set_all(shardlist, "name", dsdesc.get("name"))
159
+ check_shards(shardlist)
160
+ assert "wids_version" in dsdesc, "No wids_version in dataset description"
161
+ assert dsdesc["wids_version"] == 1, "Unknown wids_version"
162
+ for component in dsdesc.get("datasets", []):
163
+ # we use the weight from the reference to the dataset,
164
+ # regardless of remote loading
165
+ weight = component.get("weight")
166
+ # follow any source_url dsdescs through remote loading
167
+ source_url = None
168
+ if "source_url" in component:
169
+ source_url = component["source_url"]
170
+ component = load_remote_dsdesc_raw(source_url)
171
+ assert "source_url" not in component, "double indirection in dataset description"
172
+ assert "shardlist" in component, "no shardlist in dataset description"
173
+ # if the component has a base, use it to rebase the shardlist
174
+ # otherwise use the base from the source_url, if any
175
+ subbase = component.get("base", urldir(source_url) if source_url else None)
176
+ if subbase is not None:
177
+ rebase_shardlist(component["shardlist"], subbase)
178
+ l = check_shards(component["shardlist"])
179
+ set_all(l, "weight", weight)
180
+ set_all(l, "source_url", source_url)
181
+ set_all(l, "dataset", component.get("name"))
182
+ shardlist.extend(l)
183
+ assert len(shardlist) > 0, "No shards found"
184
+ dsdesc["shardlist"] = shardlist
185
+ return dsdesc
186
+
187
+
188
+ def load_dsdesc_and_resolve(source, *, options=None, base=None):
189
+ if options is None:
190
+ options = {}
191
+ dsdesc = load_remote_dsdesc_raw(source)
192
+ return resolve_dsdesc(dsdesc, base=base, options=options)
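A minimal descriptor matching the format documented in `resolve_dsdesc`, together with how it is loaded; the filenames and base URL are placeholders.

```python
import json

# Hedged sketch: filenames and the base URL are placeholders.
desc = {
    "wids_version": 1,
    "shardlist": [
        {"url": "shards/shard-000000.tar", "nsamples": 1000},
        {"url": "shards/shard-000001.tar", "nsamples": 1000},
    ],
}
with open("dataset.json", "w") as f:
    json.dump(desc, f)

# Relative shard URLs are rebased against `base` (or, in ShardListDataset,
# against the directory the descriptor was loaded from).
spec = load_dsdesc_and_resolve("dataset.json", base="https://example.com/data/")
print(spec["shardlist"][0]["url"])  # -> https://example.com/data/shards/shard-000000.tar
```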
diffusion/data/wids/wids_tar.py ADDED
@@ -0,0 +1,98 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ # This file is copied from https://github.com/NVlabs/VILA/tree/main/llava/wids
18
+ import io
19
+ import os
20
+ import os.path
21
+ import pickle
22
+ import re
23
+ import tarfile
24
+
25
+ import numpy as np
26
+
27
+
28
+ def find_index_file(file):
29
+ prefix, last_ext = os.path.splitext(file)
30
+ if re.match("._[0-9]+_$", last_ext):
31
+ return prefix + ".index"
32
+ else:
33
+ return file + ".index"
34
+
35
+
36
+ class TarFileReader:
37
+ def __init__(self, file, index_file=find_index_file, verbose=True):
38
+ self.verbose = verbose
39
+ if callable(index_file):
40
+ index_file = index_file(file)
41
+ self.index_file = index_file
42
+
43
+ # Open the tar file and keep it open
44
+ if isinstance(file, str):
45
+ self.tar_file = tarfile.open(file, "r")
46
+ else:
47
+ self.tar_file = tarfile.open(fileobj=file, mode="r")
48
+
49
+ # Create the index
50
+ self._create_tar_index()
51
+
52
+ def _create_tar_index(self):
53
+ if self.index_file is not None and os.path.exists(self.index_file):
54
+ if self.verbose:
55
+ print("Loading tar index from", self.index_file)
56
+ with open(self.index_file, "rb") as stream:
57
+ self.fnames, self.index = pickle.load(stream)
58
+ return
59
+ # Create an empty list for the index
60
+ self.fnames = []
61
+ self.index = []
62
+
63
+ if self.verbose:
64
+ print("Creating tar index for", self.tar_file.name, "at", self.index_file)
65
+ # Iterate over the members of the tar file
66
+ for member in self.tar_file:
67
+ # If the member is a file, add it to the index
68
+ if member.isfile():
69
+ # Get the file's offset
70
+ offset = self.tar_file.fileobj.tell()
71
+ self.fnames.append(member.name)
72
+ self.index.append([offset, member.size])
73
+ if self.verbose:
74
+ print("Done creating tar index for", self.tar_file.name, "at", self.index_file)
75
+ self.index = np.array(self.index)
76
+ if self.index_file is not None:
77
+ if os.path.exists(self.index_file + ".temp"):
78
+ os.unlink(self.index_file + ".temp")
79
+ with open(self.index_file + ".temp", "wb") as stream:
80
+ pickle.dump((self.fnames, self.index), stream)
81
+ os.rename(self.index_file + ".temp", self.index_file)
82
+
83
+ def names(self):
84
+ return self.fnames
85
+
86
+ def __len__(self):
87
+ return len(self.index)
88
+
89
+ def get_file(self, i):
90
+ name = self.fnames[i]
91
+ offset, size = self.index[i]
92
+ self.tar_file.fileobj.seek(offset)
93
+ file_bytes = self.tar_file.fileobj.read(size)
94
+ return name, io.BytesIO(file_bytes)
95
+
96
+ def close(self):
97
+ # Close the tar file
98
+ self.tar_file.close()
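A usage sketch for the tarfile-based reader above, which caches its index next to the archive; the filename is a placeholder.

```python
# Hedged sketch: the tar filename is a placeholder.
reader = TarFileReader("shard-000000.tar")  # builds and pickles shard-000000.tar.index on first use
print(reader.names()[:3])                   # member names in archive order
name, stream = reader.get_file(0)           # (member name, io.BytesIO with the member's bytes)
reader.close()
```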
diffusion/dpm_solver.py ADDED
@@ -0,0 +1,69 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import torch
18
+
19
+ from .model import gaussian_diffusion as gd
20
+ from .model.dpm_solver import DPM_Solver, NoiseScheduleFlow, NoiseScheduleVP, model_wrapper
21
+
22
+
23
+ def DPMS(
24
+ model,
25
+ condition,
26
+ uncondition,
27
+ cfg_scale,
28
+ pag_scale=1.0,
29
+ pag_applied_layers=None,
30
+ model_type="noise", # or "x_start" or "v" or "score", "flow"
31
+ noise_schedule="linear",
32
+ guidance_type="classifier-free",
33
+ model_kwargs=None,
34
+ diffusion_steps=1000,
35
+ schedule="VP",
36
+ interval_guidance=None,
37
+ ):
38
+ if pag_applied_layers is None:
39
+ pag_applied_layers = []
40
+ if model_kwargs is None:
41
+ model_kwargs = {}
42
+ if interval_guidance is None:
43
+ interval_guidance = [0, 1.0]
44
+ betas = torch.tensor(gd.get_named_beta_schedule(noise_schedule, diffusion_steps))
45
+
46
+ ## 1. Define the noise schedule.
47
+ if schedule == "VP":
48
+ noise_schedule = NoiseScheduleVP(schedule="discrete", betas=betas)
49
+ elif schedule == "FLOW":
50
+ noise_schedule = NoiseScheduleFlow(schedule="discrete_flow")
51
+
52
+ ## 2. Convert your discrete-time `model` to the continuous-time
53
+ ## noise prediction model. Here is an example for a diffusion model
54
+ ## `model` with the noise prediction type ("noise") .
55
+ model_fn = model_wrapper(
56
+ model,
57
+ noise_schedule,
58
+ model_type=model_type,
59
+ model_kwargs=model_kwargs,
60
+ guidance_type=guidance_type,
61
+ pag_scale=pag_scale,
62
+ pag_applied_layers=pag_applied_layers,
63
+ condition=condition,
64
+ unconditional_condition=uncondition,
65
+ guidance_scale=cfg_scale,
66
+ interval_guidance=interval_guidance,
67
+ )
68
+ ## 3. Define dpm-solver and sample by multistep DPM-Solver.
69
+ return DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
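A hedged sketch of driving this wrapper at inference time. The model entry point, the prompt embeddings, the starting latents, and the `DPM_Solver.sample(...)` keyword arguments are assumptions based on the upstream multistep DPM-Solver API, not code from this file.

```python
# Assumed to exist elsewhere: `model` (the diffusion transformer), `y` / `null_y`
# (conditional / unconditional text embeddings), and initial noise `latents`.
dpm_solver = DPMS(
    model.forward_with_dpmsolver,    # assumed noise-prediction entry point
    condition=y,
    uncondition=null_y,
    cfg_scale=4.5,
    model_kwargs={"data_info": {}},  # illustrative only; pass model_type="flow", schedule="FLOW"
)                                    # for a flow-matching checkpoint
samples = dpm_solver.sample(         # signature assumed from the dpm-solver library
    latents,
    steps=20,
    order=2,
    skip_type="time_uniform",
    method="multistep",
)
```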
diffusion/flow_euler_sampler.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import os
18
+
19
+ import torch
20
+ from diffusers import FlowMatchEulerDiscreteScheduler
21
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
22
+ from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import retrieve_timesteps
23
+ from tqdm import tqdm
24
+
25
+
26
+ class FlowEuler:
27
+ def __init__(self, model_fn, condition, uncondition, cfg_scale, model_kwargs):
28
+ self.model = model_fn
29
+ self.condition = condition
30
+ self.uncondition = uncondition
31
+ self.cfg_scale = cfg_scale
32
+ self.model_kwargs = model_kwargs
33
+ # repo_id = "stabilityai/stable-diffusion-3-medium-diffusers"
34
+ self.scheduler = FlowMatchEulerDiscreteScheduler(shift=3.0)
35
+
36
+ def sample(self, latents, steps=28):
37
+ device = self.condition.device
38
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, steps, device, None)
39
+ do_classifier_free_guidance = True
40
+
41
+ prompt_embeds = self.condition
42
+ if do_classifier_free_guidance:
43
+ prompt_embeds = torch.cat([self.uncondition, self.condition], dim=0)
44
+
45
+ for i, t in tqdm(list(enumerate(timesteps)), disable=os.getenv("DPM_TQDM", "False") == "True"):
46
+
47
+ # expand the latents if we are doing classifier free guidance
48
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
49
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
50
+ timestep = t.expand(latent_model_input.shape[0])
51
+
52
+ noise_pred = self.model(
53
+ latent_model_input,
54
+ timestep,
55
+ prompt_embeds,
56
+ **self.model_kwargs,
57
+ )
58
+
59
+ if isinstance(noise_pred, Transformer2DModelOutput):
60
+ noise_pred = noise_pred[0]
61
+
62
+ # perform guidance
63
+ if do_classifier_free_guidance:
64
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
65
+ noise_pred = noise_pred_uncond + self.cfg_scale * (noise_pred_text - noise_pred_uncond)
66
+
67
+ # compute the previous noisy sample x_t -> x_t-1
68
+ latents_dtype = latents.dtype
69
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
70
+
71
+ if latents.dtype != latents_dtype:
72
+ latents = latents.to(latents_dtype)
73
+
74
+ return latents
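A short sketch of the Flow-Euler sampling loop above; the model callable, the embeddings, and the latent shape are assumptions for illustration.

```python
import torch

# Assumed to exist: `model` (callable as model(latent, timestep, prompt_embeds, **kwargs))
# and `y` / `null_y` (conditional / unconditional prompt embeddings).
flow_solver = FlowEuler(
    model,
    condition=y,
    uncondition=null_y,
    cfg_scale=4.5,
    model_kwargs={},
)
latents = torch.randn(1, 32, 32, 32, device=y.device)  # latent shape is illustrative
samples = flow_solver.sample(latents, steps=28)
```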
diffusion/iddpm.py ADDED
@@ -0,0 +1,76 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ # Modified from OpenAI's diffusion repos
18
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
19
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
20
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
21
+ from diffusion.model.respace import SpacedDiffusion, space_timesteps
22
+
23
+ from .model import gaussian_diffusion as gd
24
+
25
+
26
+ def Scheduler(
27
+ timestep_respacing,
28
+ noise_schedule="linear",
29
+ use_kl=False,
30
+ sigma_small=False,
31
+ predict_xstart=False,
32
+ predict_v=False,
33
+ learn_sigma=True,
34
+ pred_sigma=True,
35
+ rescale_learned_sigmas=False,
36
+ diffusion_steps=1000,
37
+ snr=False,
38
+ return_startx=False,
39
+ flow_shift=1.0,
40
+ ):
41
+ betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
42
+ if use_kl:
43
+ loss_type = gd.LossType.RESCALED_KL
44
+ elif rescale_learned_sigmas:
45
+ loss_type = gd.LossType.RESCALED_MSE
46
+ else:
47
+ loss_type = gd.LossType.MSE
48
+ if timestep_respacing is None or timestep_respacing == "":
49
+ timestep_respacing = [diffusion_steps]
50
+ if predict_xstart:
51
+ model_mean_type = gd.ModelMeanType.START_X
52
+ elif predict_v:
53
+ model_mean_type = gd.ModelMeanType.VELOCITY
54
+ else:
55
+ model_mean_type = gd.ModelMeanType.EPSILON
56
+ return SpacedDiffusion(
57
+ use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
58
+ betas=betas,
59
+ model_mean_type=model_mean_type,
60
+ model_var_type=(
61
+ (
62
+ (gd.ModelVarType.FIXED_LARGE if not sigma_small else gd.ModelVarType.FIXED_SMALL)
63
+ if not learn_sigma
64
+ else gd.ModelVarType.LEARNED_RANGE
65
+ )
66
+ if pred_sigma
67
+ else None
68
+ ),
69
+ loss_type=loss_type,
70
+ snr=snr,
71
+ return_startx=return_startx,
72
+ # rescale_timesteps=rescale_timesteps,
73
+ flow="flow" in noise_schedule,
74
+ flow_shift=flow_shift,
75
+ diffusion_steps=diffusion_steps,
76
+ )
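A construction sketch for the scheduler factory above; the argument values are illustrative rather than the repository's training configuration.

```python
# Hedged sketch: argument values are illustrative.
# Full 1000-step diffusion for training (empty respacing means "use every step").
train_diffusion = Scheduler(
    timestep_respacing="",
    noise_schedule="linear",
    learn_sigma=True,
    pred_sigma=True,
)

# Respaced 50-step diffusion for evaluation/sampling.
eval_diffusion = Scheduler(timestep_respacing="50", noise_schedule="linear")
```

Both calls return a `SpacedDiffusion` object; flow-matching behavior is selected by passing a noise schedule whose name contains "flow" together with `flow_shift`.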
diffusion/lcm_scheduler.py ADDED
@@ -0,0 +1,457 @@
1
+ # Copyright 2023 Stanford University Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
16
+ # and https://github.com/hojonathanho/diffusion
17
+
18
+ import math
19
+ from dataclasses import dataclass
20
+ from typing import List, Optional, Tuple, Union
21
+
22
+ import numpy as np
23
+ import torch
24
+ from diffusers import ConfigMixin, SchedulerMixin
25
+ from diffusers.configuration_utils import register_to_config
26
+ from diffusers.utils import BaseOutput
27
+
28
+
29
+ @dataclass
30
+ # Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
31
+ class LCMSchedulerOutput(BaseOutput):
32
+ """
33
+ Output class for the scheduler's `step` function output.
34
+ Args:
35
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
36
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
37
+ denoising loop.
38
+ pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
39
+ The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
40
+ `pred_original_sample` can be used to preview progress or for guidance.
41
+ """
42
+
43
+ prev_sample: torch.FloatTensor
44
+ denoised: Optional[torch.FloatTensor] = None
45
+
46
+
47
+ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
48
+ def betas_for_alpha_bar(
49
+ num_diffusion_timesteps,
50
+ max_beta=0.999,
51
+ alpha_transform_type="cosine",
52
+ ):
53
+ """
54
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
55
+ (1-beta) over time from t = [0,1].
56
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
57
+ to that part of the diffusion process.
58
+ Args:
59
+ num_diffusion_timesteps (`int`): the number of betas to produce.
60
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
61
+ prevent singularities.
62
+ alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
63
+ Choose from `cosine` or `exp`
64
+ Returns:
65
+ betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
66
+ """
67
+ if alpha_transform_type == "cosine":
68
+
69
+ def alpha_bar_fn(t):
70
+ return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
71
+
72
+ elif alpha_transform_type == "exp":
73
+
74
+ def alpha_bar_fn(t):
75
+ return math.exp(t * -12.0)
76
+
77
+ else:
78
+ raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
79
+
80
+ betas = []
81
+ for i in range(num_diffusion_timesteps):
82
+ t1 = i / num_diffusion_timesteps
83
+ t2 = (i + 1) / num_diffusion_timesteps
84
+ betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
85
+ return torch.tensor(betas, dtype=torch.float32)
86
+
87
+
88
+ def rescale_zero_terminal_snr(betas):
89
+ """
90
+ Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
91
+ Args:
92
+ betas (`torch.FloatTensor`):
93
+ the betas that the scheduler is being initialized with.
94
+ Returns:
95
+ `torch.FloatTensor`: rescaled betas with zero terminal SNR
96
+ """
97
+ # Convert betas to alphas_bar_sqrt
98
+ alphas = 1.0 - betas
99
+ alphas_cumprod = torch.cumprod(alphas, dim=0)
100
+ alphas_bar_sqrt = alphas_cumprod.sqrt()
101
+
102
+ # Store old values.
103
+ alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
104
+ alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
105
+
106
+ # Shift so the last timestep is zero.
107
+ alphas_bar_sqrt -= alphas_bar_sqrt_T
108
+
109
+ # Scale so the first timestep is back to the old value.
110
+ alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
111
+
112
+ # Convert alphas_bar_sqrt to betas
113
+ alphas_bar = alphas_bar_sqrt**2 # Revert sqrt
114
+ alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod
115
+ alphas = torch.cat([alphas_bar[0:1], alphas])
116
+ betas = 1 - alphas
117
+
118
+ return betas
119
+
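+ # Sketch of the intended effect (assuming a simple linear schedule for illustration):
+ # after rescaling, the cumulative alpha at the final timestep is zero, i.e. the last
+ # step carries pure noise (zero terminal SNR).
+ #
+ #   betas = torch.linspace(0.0001, 0.02, 1000)
+ #   alphas_cumprod = torch.cumprod(1.0 - rescale_zero_terminal_snr(betas), dim=0)
+ #   assert abs(float(alphas_cumprod[-1])) < 1e-6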
120
+
121
+ class LCMScheduler(SchedulerMixin, ConfigMixin):
122
+ """
123
+ `LCMScheduler` implements the one-step and multi-step sampling procedure of Latent Consistency Models, extending
124
+ the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with non-Markovian guidance.
125
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
126
+ methods the library implements for all schedulers such as loading and saving.
127
+ Args:
128
+ num_train_timesteps (`int`, defaults to 1000):
129
+ The number of diffusion steps to train the model.
130
+ beta_start (`float`, defaults to 0.0001):
131
+ The starting `beta` value of inference.
132
+ beta_end (`float`, defaults to 0.02):
133
+ The final `beta` value.
134
+ beta_schedule (`str`, defaults to `"linear"`):
135
+ The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
136
+ `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
137
+ trained_betas (`np.ndarray`, *optional*):
138
+ Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
139
+ clip_sample (`bool`, defaults to `True`):
140
+ Clip the predicted sample for numerical stability.
141
+ clip_sample_range (`float`, defaults to 1.0):
142
+ The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
143
+ set_alpha_to_one (`bool`, defaults to `True`):
144
+ Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
145
+ there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
146
+ otherwise it uses the alpha value at step 0.
147
+ steps_offset (`int`, defaults to 0):
148
+ An offset added to the inference steps. You can use a combination of `offset=1` and
149
+ `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
150
+ Diffusion.
151
+ prediction_type (`str`, defaults to `epsilon`, *optional*):
152
+ Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
153
+ `sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
154
+ Video](https://imagen.research.google/video/paper.pdf) paper).
155
+ thresholding (`bool`, defaults to `False`):
156
+ Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
157
+ as Stable Diffusion.
158
+ dynamic_thresholding_ratio (`float`, defaults to 0.995):
159
+ The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
160
+ sample_max_value (`float`, defaults to 1.0):
161
+ The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
162
+ timestep_spacing (`str`, defaults to `"leading"`):
163
+ The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
164
+ Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
165
+ rescale_betas_zero_snr (`bool`, defaults to `False`):
166
+ Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
167
+ dark samples instead of limiting it to samples with medium brightness. Loosely related to
168
+ [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
169
+ """
170
+
171
+ # _compatibles = [e.name for e in KarrasDiffusionSchedulers]
172
+ order = 1
173
+
174
+ @register_to_config
175
+ def __init__(
176
+ self,
177
+ num_train_timesteps: int = 1000,
178
+ beta_start: float = 0.0001,
179
+ beta_end: float = 0.02,
180
+ beta_schedule: str = "linear",
181
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
182
+ clip_sample: bool = True,
183
+ set_alpha_to_one: bool = True,
184
+ steps_offset: int = 0,
185
+ prediction_type: str = "epsilon",
186
+ thresholding: bool = False,
187
+ dynamic_thresholding_ratio: float = 0.995,
188
+ clip_sample_range: float = 1.0,
189
+ sample_max_value: float = 1.0,
190
+ timestep_spacing: str = "leading",
191
+ rescale_betas_zero_snr: bool = False,
192
+ ):
193
+ if trained_betas is not None:
194
+ self.betas = torch.tensor(trained_betas, dtype=torch.float32)
195
+ elif beta_schedule == "linear":
196
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
197
+ elif beta_schedule == "scaled_linear":
198
+ # this schedule is very specific to the latent diffusion model.
199
+ self.betas = (
200
+ torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
201
+ )
202
+ elif beta_schedule == "squaredcos_cap_v2":
203
+ # Glide cosine schedule
204
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
205
+ else:
206
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
207
+
208
+ # Rescale for zero SNR
209
+ if rescale_betas_zero_snr:
210
+ self.betas = rescale_zero_terminal_snr(self.betas)
211
+
212
+ self.alphas = 1.0 - self.betas
213
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
214
+
215
+ # At every step in ddim, we are looking into the previous alphas_cumprod
216
+ # For the final step, there is no previous alphas_cumprod because we are already at 0
217
+ # `set_alpha_to_one` decides whether we set this parameter simply to one or
218
+ # whether we use the final alpha of the "non-previous" one.
219
+ self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
220
+
221
+ # standard deviation of the initial noise distribution
222
+ self.init_noise_sigma = 1.0
223
+
224
+ # setable values
225
+ self.num_inference_steps = None
226
+ self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
227
+
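+ # A construction sketch (hypothetical values; the configs shipped with this repo may differ):
+ #
+ #   scheduler = LCMScheduler(
+ #       num_train_timesteps=1000,
+ #       beta_start=0.00085,
+ #       beta_end=0.012,
+ #       beta_schedule="scaled_linear",
+ #       prediction_type="epsilon",
+ #   )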
228
+ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
229
+ """
230
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
231
+ current timestep.
232
+ Args:
233
+ sample (`torch.FloatTensor`):
234
+ The input sample.
235
+ timestep (`int`, *optional*):
236
+ The current timestep in the diffusion chain.
237
+ Returns:
238
+ `torch.FloatTensor`:
239
+ A scaled input sample.
240
+ """
241
+ return sample
242
+
243
+ def _get_variance(self, timestep, prev_timestep):
244
+ alpha_prod_t = self.alphas_cumprod[timestep]
245
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
246
+ beta_prod_t = 1 - alpha_prod_t
247
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
248
+
249
+ variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
250
+
251
+ return variance
252
+
253
+ # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
254
+ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
255
+ """
256
+ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
257
+ prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
258
+ s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
259
+ pixels from saturation at each step. We find that dynamic thresholding results in significantly better
260
+ photorealism as well as better image-text alignment, especially when using very large guidance weights."
261
+ https://arxiv.org/abs/2205.11487
262
+ """
263
+ dtype = sample.dtype
264
+ batch_size, channels, height, width = sample.shape
265
+
266
+ if dtype not in (torch.float32, torch.float64):
267
+ sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half
268
+
269
+ # Flatten sample for doing quantile calculation along each image
270
+ sample = sample.reshape(batch_size, channels * height * width)
271
+
272
+ abs_sample = sample.abs() # "a certain percentile absolute pixel value"
273
+
274
+ s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
275
+ s = torch.clamp(
276
+ s, min=1, max=self.config.sample_max_value
277
+ ) # When clamped to min=1, equivalent to standard clipping to [-1, 1]
278
+
279
+ s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0
280
+ sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
281
+
282
+ sample = sample.reshape(batch_size, channels, height, width)
283
+ sample = sample.to(dtype)
284
+
285
+ return sample
286
+
287
+ def set_timesteps(self, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None):
288
+ """
289
+ Sets the discrete timesteps used for the diffusion chain (to be run before inference).
290
+ Args:
291
+ num_inference_steps (`int`):
292
+ The number of diffusion steps used when generating samples with a pre-trained model.
+ lcm_origin_steps (`int`):
+ The number of timesteps in the LCM training (distillation) schedule, from which the inference
+ schedule is subsampled.
+ device (`str` or `torch.device`, *optional*):
+ The device to which the timesteps are moved.
293
+ """
294
+
295
+ if num_inference_steps > self.config.num_train_timesteps:
296
+ raise ValueError(
297
+ f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
298
+ f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
299
+ f" maximal {self.config.num_train_timesteps} timesteps."
300
+ )
301
+
302
+ self.num_inference_steps = num_inference_steps
303
+
304
+ # LCM Timesteps Setting: # Linear Spacing
305
+ c = self.config.num_train_timesteps // lcm_origin_steps
306
+ lcm_origin_timesteps = np.asarray(list(range(1, lcm_origin_steps + 1))) * c - 1 # LCM Training Steps Schedule
307
+ skipping_step = len(lcm_origin_timesteps) // num_inference_steps
308
+ timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps] # LCM Inference Steps Schedule
309
+
310
+ self.timesteps = torch.from_numpy(timesteps.copy()).to(device)
311
+
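+ # Example of the resulting schedule (with the defaults above): num_train_timesteps=1000 and
+ # lcm_origin_steps=50 give the training schedule [19, 39, ..., 999]; num_inference_steps=4
+ # subsamples it from the end as [999, 759, 519, 279].
+ #
+ #   scheduler.set_timesteps(num_inference_steps=4, lcm_origin_steps=50)
+ #   print(scheduler.timesteps)  # tensor([999, 759, 519, 279])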
312
+ def get_scalings_for_boundary_condition_discrete(self, t):
313
+ self.sigma_data = 0.5 # Default: 0.5
314
+
315
+ # Dividing t by 0.1 makes c_skip act almost like a delta function at t=0.
316
+ c_skip = self.sigma_data**2 / ((t / 0.1) ** 2 + self.sigma_data**2)
317
+ c_out = (t / 0.1) / ((t / 0.1) ** 2 + self.sigma_data**2) ** 0.5
318
+ return c_skip, c_out
319
+
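+ # Boundary-condition sketch: as t -> 0, c_skip -> 1 and c_out -> 0, so the output reduces to
+ # the input sample; for large t, c_skip ~ 0 and c_out ~ 1, so the predicted x_0 dominates.
+ #
+ #   c_skip, c_out = scheduler.get_scalings_for_boundary_condition_discrete(0)
+ #   # c_skip == 1.0, c_out == 0.0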
320
+ def step(
321
+ self,
322
+ model_output: torch.FloatTensor,
323
+ timeindex: int,
324
+ timestep: int,
325
+ sample: torch.FloatTensor,
326
+ eta: float = 0.0,
327
+ use_clipped_model_output: bool = False,
328
+ generator=None,
329
+ variance_noise: Optional[torch.FloatTensor] = None,
330
+ return_dict: bool = True,
331
+ ) -> Union[LCMSchedulerOutput, Tuple]:
332
+ """
333
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
334
+ process from the learned model outputs (most often the predicted noise).
335
+ Args:
336
+ model_output (`torch.FloatTensor`):
337
+ The direct output from learned diffusion model.
338
+ timeindex (`int`):
+ The index of the current timestep in the scheduler's `timesteps` schedule.
340
+ timestep (`int`):
+ The current discrete timestep in the diffusion chain.
340
+ sample (`torch.FloatTensor`):
341
+ A current instance of a sample created by the diffusion process.
342
+ eta (`float`):
343
+ The weight of noise for added noise in diffusion step.
344
+ use_clipped_model_output (`bool`, defaults to `False`):
345
+ If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
346
+ because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
347
+ clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
348
+ `use_clipped_model_output` has no effect.
349
+ generator (`torch.Generator`, *optional*):
350
+ A random number generator.
351
+ variance_noise (`torch.FloatTensor`):
352
+ Alternative to generating noise with `generator` by directly providing the noise for the variance
353
+ itself. Useful for methods such as [`CycleDiffusion`].
354
+ return_dict (`bool`, *optional*, defaults to `True`):
355
+ Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.
356
+ Returns:
357
+ [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:
358
+ If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a
359
+ tuple is returned where the first element is the sample tensor.
360
+ """
361
+ if self.num_inference_steps is None:
362
+ raise ValueError(
363
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
364
+ )
365
+
366
+ # 1. get previous step value
367
+ prev_timeindex = timeindex + 1
368
+ if prev_timeindex < len(self.timesteps):
369
+ prev_timestep = self.timesteps[prev_timeindex]
370
+ else:
371
+ prev_timestep = timestep
372
+
373
+ # 2. compute alphas, betas
374
+ alpha_prod_t = self.alphas_cumprod[timestep]
375
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
376
+
377
+ beta_prod_t = 1 - alpha_prod_t
378
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
379
+
380
+ # 3. Get scalings for boundary conditions
381
+ c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)
382
+
383
+ # 4. Different Parameterization:
384
+ parameterization = self.config.prediction_type
385
+
386
+ if parameterization == "epsilon": # noise-prediction
387
+ pred_x0 = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
388
+
389
+ elif parameterization == "sample": # x-prediction
390
+ pred_x0 = model_output
391
+
392
+ elif parameterization == "v_prediction": # v-prediction
393
+ pred_x0 = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
394
+
395
+ # 5. Denoise model output using boundary conditions
396
+ denoised = c_out * pred_x0 + c_skip * sample
397
+
398
+ # 6. Sample z ~ N(0, I) for multi-step inference
399
+ # Noise is not used for one-step sampling.
400
+ if len(self.timesteps) > 1:
401
+ noise = torch.randn(model_output.shape).to(model_output.device)
402
+ prev_sample = alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
403
+ else:
404
+ prev_sample = denoised
405
+
406
+ if not return_dict:
407
+ return (prev_sample, denoised)
408
+
409
+ return LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised)
410
+
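+ # A minimal multi-step sampling loop sketch (`denoiser` is a placeholder for the actual
+ # model used in this repo; its call signature is assumed, not prescribed):
+ #
+ #   scheduler.set_timesteps(num_inference_steps=4, lcm_origin_steps=50, device="cuda")
+ #   latents = torch.randn(1, 4, 64, 64, device="cuda")
+ #   for i, t in enumerate(scheduler.timesteps):
+ #       noise_pred = denoiser(latents, t)  # hypothetical denoiser call
+ #       latents, denoised = scheduler.step(noise_pred, i, t, latents, return_dict=False)
+ #   # `denoised` is the final prediction; decode it with the VAE in practice.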
411
+ # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
412
+ def add_noise(
413
+ self,
414
+ original_samples: torch.FloatTensor,
415
+ noise: torch.FloatTensor,
416
+ timesteps: torch.IntTensor,
417
+ ) -> torch.FloatTensor:
418
+ # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
419
+ alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
420
+ timesteps = timesteps.to(original_samples.device)
421
+
422
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
423
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
424
+ while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
425
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
426
+
427
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
428
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
429
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
430
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
431
+
432
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
433
+ return noisy_samples
434
+
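+ # Closed form implemented above: x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise,
+ # e.g. (sketch): noisy = scheduler.add_noise(x0, torch.randn_like(x0), torch.tensor([999]))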
435
+ # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
436
+ def get_velocity(
437
+ self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor
438
+ ) -> torch.FloatTensor:
439
+ # Make sure alphas_cumprod and timestep have same device and dtype as sample
440
+ alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype)
441
+ timesteps = timesteps.to(sample.device)
442
+
443
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
444
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
445
+ while len(sqrt_alpha_prod.shape) < len(sample.shape):
446
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
447
+
448
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
449
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
450
+ while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
451
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
452
+
453
+ velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
454
+ return velocity
455
+
456
+ def __len__(self):
457
+ return self.config.num_train_timesteps
diffusion/model/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .nets import *