fountai committed on
Commit
fca8815
1 Parent(s): 4d207eb
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitignore +167 -0
  2. LICENSE +201 -0
  3. app.py +1 -1
  4. cog.yaml +24 -0
  5. flux +0 -1
  6. image_datasets/canny_dataset.py +59 -0
  7. image_datasets/dataset.py +45 -0
  8. main.py +180 -0
  9. models_licence/LICENSE-FLUX1-dev +42 -0
  10. predict.py +134 -0
  11. src/flux/__init__.py +11 -0
  12. src/flux/__main__.py +4 -0
  13. src/flux/annotator/canny/__init__.py +6 -0
  14. src/flux/annotator/ckpts/ckpts.txt +1 -0
  15. src/flux/annotator/dwpose/__init__.py +68 -0
  16. src/flux/annotator/dwpose/onnxdet.py +125 -0
  17. src/flux/annotator/dwpose/onnxpose.py +360 -0
  18. src/flux/annotator/dwpose/util.py +297 -0
  19. src/flux/annotator/dwpose/wholebody.py +48 -0
  20. src/flux/annotator/hed/__init__.py +95 -0
  21. src/flux/annotator/midas/LICENSE +21 -0
  22. src/flux/annotator/midas/__init__.py +42 -0
  23. src/flux/annotator/midas/api.py +168 -0
  24. src/flux/annotator/midas/midas/__init__.py +0 -0
  25. src/flux/annotator/midas/midas/base_model.py +16 -0
  26. src/flux/annotator/midas/midas/blocks.py +342 -0
  27. src/flux/annotator/midas/midas/dpt_depth.py +109 -0
  28. src/flux/annotator/midas/midas/midas_net.py +76 -0
  29. src/flux/annotator/midas/midas/midas_net_custom.py +128 -0
  30. src/flux/annotator/midas/midas/transforms.py +234 -0
  31. src/flux/annotator/midas/midas/vit.py +491 -0
  32. src/flux/annotator/midas/utils.py +189 -0
  33. src/flux/annotator/mlsd/LICENSE +201 -0
  34. src/flux/annotator/mlsd/__init__.py +40 -0
  35. src/flux/annotator/mlsd/models/mbv2_mlsd_large.py +292 -0
  36. src/flux/annotator/mlsd/models/mbv2_mlsd_tiny.py +275 -0
  37. src/flux/annotator/mlsd/utils.py +580 -0
  38. src/flux/annotator/tile/__init__.py +26 -0
  39. src/flux/annotator/tile/guided_filter.py +280 -0
  40. src/flux/annotator/util.py +38 -0
  41. src/flux/api.py +194 -0
  42. src/flux/cli.py +254 -0
  43. src/flux/controlnet.py +222 -0
  44. src/flux/math.py +30 -0
  45. src/flux/model.py +228 -0
  46. src/flux/modules/autoencoder.py +312 -0
  47. src/flux/modules/conditioner.py +38 -0
  48. src/flux/modules/layers.py +567 -0
  49. src/flux/sampling.py +242 -0
  50. src/flux/util.py +383 -0
.gitignore ADDED
@@ -0,0 +1,167 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+Makefile
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+weights/
+
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache/
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+.DS_Store
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 from PIL import Image
 import os
-from flux.src.flux.xflux_pipeline import XFluxPipeline
+from src.flux.xflux_pipeline import XFluxPipeline
 import random
 import spaces
 
cog.yaml ADDED
@@ -0,0 +1,24 @@
+# Configuration for Cog ⚙️
+# Reference: https://cog.run/yaml
+
+build:
+  gpu: true
+  cuda: "12.1"
+  python_version: "3.11"
+  python_packages:
+    - "accelerate==0.30.1"
+    - "deepspeed==0.14.4"
+    - "einops==0.8.0"
+    - "transformers==4.43.3"
+    - "huggingface-hub==0.24.5"
+    - "einops==0.8.0"
+    - "pandas==2.2.2"
+    - "opencv-python==4.10.0.84"
+    - "pillow==10.4.0"
+    - "optimum-quanto==0.2.4"
+    - "sentencepiece==0.2.0"
+  run:
+    - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" && chmod +x /usr/local/bin/pget
+
+# predict.py defines how predictions are run on your model
+predict: "predict.py:Predictor"
flux DELETED
@@ -1 +0,0 @@
-Subproject commit 9e1dd391b2316b1cfc20e523e2885fd30134a2e4
image_datasets/canny_dataset.py ADDED
@@ -0,0 +1,59 @@
+import os
+import pandas as pd
+import numpy as np
+from PIL import Image
+import torch
+from torch.utils.data import Dataset, DataLoader
+import json
+import random
+import cv2
+
+
+def canny_processor(image, low_threshold=100, high_threshold=200):
+    image = np.array(image)
+    image = cv2.Canny(image, low_threshold, high_threshold)
+    image = image[:, :, None]
+    image = np.concatenate([image, image, image], axis=2)
+    canny_image = Image.fromarray(image)
+    return canny_image
+
+
+def c_crop(image):
+    width, height = image.size
+    new_size = min(width, height)
+    left = (width - new_size) / 2
+    top = (height - new_size) / 2
+    right = (width + new_size) / 2
+    bottom = (height + new_size) / 2
+    return image.crop((left, top, right, bottom))
+
+class CustomImageDataset(Dataset):
+    def __init__(self, img_dir, img_size=512):
+        self.images = [os.path.join(img_dir, i) for i in os.listdir(img_dir) if '.jpg' in i or '.png' in i]
+        self.images.sort()
+        self.img_size = img_size
+
+    def __len__(self):
+        return len(self.images)
+
+    def __getitem__(self, idx):
+        try:
+            img = Image.open(self.images[idx])
+            img = c_crop(img)
+            img = img.resize((self.img_size, self.img_size))
+            hint = canny_processor(img)
+            img = torch.from_numpy((np.array(img) / 127.5) - 1)
+            img = img.permute(2, 0, 1)
+            hint = torch.from_numpy((np.array(hint) / 127.5) - 1)
+            hint = hint.permute(2, 0, 1)
+            json_path = self.images[idx].split('.')[0] + '.json'
+            prompt = json.load(open(json_path))['caption']
+            return img, hint, prompt
+        except Exception as e:
+            print(e)
+            return self.__getitem__(random.randint(0, len(self.images) - 1))
+
+
+def loader(train_batch_size, num_workers, **args):
+    dataset = CustomImageDataset(**args)
+    return DataLoader(dataset, batch_size=train_batch_size, num_workers=num_workers, shuffle=True)
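For orientation, a minimal sketch of how this loader might be driven; the directory path and batch settings are placeholders, and the paired image/JSON layout (each .jpg/.png with a sibling .json containing a "caption" key) is the one implied by __getitem__ above.

# Hypothetical usage of image_datasets/canny_dataset.py; "data/train" is a placeholder.
from image_datasets.canny_dataset import loader

train_loader = loader(train_batch_size=4, num_workers=2, img_dir="data/train", img_size=512)

for img, hint, prompt in train_loader:
    # img and hint are (B, 3, 512, 512) float tensors scaled to [-1, 1];
    # hint is the Canny edge map of img, prompt is the batch of captions.
    print(img.shape, hint.shape, prompt[0])
    break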
image_datasets/dataset.py ADDED
@@ -0,0 +1,45 @@
+import os
+import pandas as pd
+import numpy as np
+from PIL import Image
+import torch
+from torch.utils.data import Dataset, DataLoader
+import json
+import random
+
+def c_crop(image):
+    width, height = image.size
+    new_size = min(width, height)
+    left = (width - new_size) / 2
+    top = (height - new_size) / 2
+    right = (width + new_size) / 2
+    bottom = (height + new_size) / 2
+    return image.crop((left, top, right, bottom))
+
+class CustomImageDataset(Dataset):
+    def __init__(self, img_dir, img_size=512):
+        self.images = [os.path.join(img_dir, i) for i in os.listdir(img_dir) if '.jpg' in i or '.png' in i]
+        self.images.sort()
+        self.img_size = img_size
+
+    def __len__(self):
+        return len(self.images)
+
+    def __getitem__(self, idx):
+        try:
+            img = Image.open(self.images[idx])
+            img = c_crop(img)
+            img = img.resize((self.img_size, self.img_size))
+            img = torch.from_numpy((np.array(img) / 127.5) - 1)
+            img = img.permute(2, 0, 1)
+            json_path = self.images[idx].split('.')[0] + '.json'
+            prompt = json.load(open(json_path))['caption']
+            return img, prompt
+        except Exception as e:
+            print(e)
+            return self.__getitem__(random.randint(0, len(self.images) - 1))
+
+
+def loader(train_batch_size, num_workers, **args):
+    dataset = CustomImageDataset(**args)
+    return DataLoader(dataset, batch_size=train_batch_size, num_workers=num_workers, shuffle=True)
main.py ADDED
@@ -0,0 +1,180 @@
+import argparse
+from PIL import Image
+import os
+
+from src.flux.xflux_pipeline import XFluxPipeline
+
+
+def create_argparser():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--prompt", type=str, required=True,
+        help="The input text prompt"
+    )
+    parser.add_argument(
+        "--neg_prompt", type=str, default="",
+        help="The input text negative prompt"
+    )
+    parser.add_argument(
+        "--img_prompt", type=str, default=None,
+        help="Path to input image prompt"
+    )
+    parser.add_argument(
+        "--neg_img_prompt", type=str, default=None,
+        help="Path to input negative image prompt"
+    )
+    parser.add_argument(
+        "--ip_scale", type=float, default=1.0,
+        help="Strength of input image prompt"
+    )
+    parser.add_argument(
+        "--neg_ip_scale", type=float, default=1.0,
+        help="Strength of negative input image prompt"
+    )
+    parser.add_argument(
+        "--local_path", type=str, default=None,
+        help="Local path to the model checkpoint (Controlnet)"
+    )
+    parser.add_argument(
+        "--repo_id", type=str, default=None,
+        help="A HuggingFace repo id to download model (Controlnet)"
+    )
+    parser.add_argument(
+        "--name", type=str, default=None,
+        help="A filename to download from HuggingFace"
+    )
+    parser.add_argument(
+        "--ip_repo_id", type=str, default=None,
+        help="A HuggingFace repo id to download model (IP-Adapter)"
+    )
+    parser.add_argument(
+        "--ip_name", type=str, default=None,
+        help="An IP-Adapter filename to download from HuggingFace"
+    )
+    parser.add_argument(
+        "--ip_local_path", type=str, default=None,
+        help="Local path to the model checkpoint (IP-Adapter)"
+    )
+    parser.add_argument(
+        "--lora_repo_id", type=str, default=None,
+        help="A HuggingFace repo id to download model (LoRA)"
+    )
+    parser.add_argument(
+        "--lora_name", type=str, default=None,
+        help="A LoRA filename to download from HuggingFace"
+    )
+    parser.add_argument(
+        "--lora_local_path", type=str, default=None,
+        help="Local path to the model checkpoint (LoRA)"
+    )
+    parser.add_argument(
+        "--device", type=str, default="cuda",
+        help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)"
+    )
+    parser.add_argument(
+        "--offload", action='store_true', help="Offload model to CPU when not in use"
+    )
+    parser.add_argument(
+        "--use_ip", action='store_true', help="Load IP model"
+    )
+    parser.add_argument(
+        "--use_lora", action='store_true', help="Load Lora model"
+    )
+    parser.add_argument(
+        "--use_controlnet", action='store_true', help="Load Controlnet model"
+    )
+    parser.add_argument(
+        "--num_images_per_prompt", type=int, default=1,
+        help="The number of images to generate per prompt"
+    )
+    parser.add_argument(
+        "--image", type=str, default=None, help="Path to image"
+    )
+    parser.add_argument(
+        "--lora_weight", type=float, default=0.9, help="Lora model strength (from 0 to 1.0)"
+    )
+    parser.add_argument(
+        "--control_type", type=str, default="canny",
+        choices=("canny", "openpose", "depth", "hed", "hough", "tile"),
+        help="Name of controlnet condition, example: canny"
+    )
+    parser.add_argument(
+        "--model_type", type=str, default="flux-dev",
+        choices=("flux-dev", "flux-dev-fp8", "flux-schnell"),
+        help="Model type to use (flux-dev, flux-dev-fp8, flux-schnell)"
+    )
+    parser.add_argument(
+        "--width", type=int, default=1024, help="The width for generated image"
+    )
+    parser.add_argument(
+        "--height", type=int, default=1024, help="The height for generated image"
+    )
+    parser.add_argument(
+        "--num_steps", type=int, default=25, help="The num_steps for diffusion process"
+    )
+    parser.add_argument(
+        "--guidance", type=float, default=4, help="The guidance for diffusion process"
+    )
+    parser.add_argument(
+        "--seed", type=int, default=123456789, help="A seed for reproducible inference"
+    )
+    parser.add_argument(
+        "--true_gs", type=float, default=3.5, help="true guidance"
+    )
+    parser.add_argument(
+        "--timestep_to_start_cfg", type=int, default=5, help="timestep to start true guidance"
+    )
+    parser.add_argument(
+        "--save_path", type=str, default='results', help="Path to save"
+    )
+    return parser
+
+
+def main(args):
+    if args.image:
+        image = Image.open(args.image)
+    else:
+        image = None
+
+    xflux_pipeline = XFluxPipeline(args.model_type, args.device, args.offload)
+    if args.use_ip:
+        print('load ip-adapter:', args.ip_local_path, args.ip_repo_id, args.ip_name)
+        xflux_pipeline.set_ip(args.ip_local_path, args.ip_repo_id, args.ip_name)
+    if args.use_lora:
+        print('load lora:', args.lora_local_path, args.lora_repo_id, args.lora_name)
+        xflux_pipeline.set_lora(args.lora_local_path, args.lora_repo_id, args.lora_name, args.lora_weight)
+    if args.use_controlnet:
+        print('load controlnet:', args.local_path, args.repo_id, args.name)
+        xflux_pipeline.set_controlnet(args.control_type, args.local_path, args.repo_id, args.name)
+
+    image_prompt = Image.open(args.img_prompt) if args.img_prompt else None
+    neg_image_prompt = Image.open(args.neg_img_prompt) if args.neg_img_prompt else None
+
+    for _ in range(args.num_images_per_prompt):
+        result = xflux_pipeline(
+            prompt=args.prompt,
+            controlnet_image=image,
+            width=args.width,
+            height=args.height,
+            guidance=args.guidance,
+            num_steps=args.num_steps,
+            seed=args.seed,
+            true_gs=args.true_gs,
+            neg_prompt=args.neg_prompt,
+            timestep_to_start_cfg=args.timestep_to_start_cfg,
+            image_prompt=image_prompt,
+            neg_image_prompt=neg_image_prompt,
+            ip_scale=args.ip_scale,
+            neg_ip_scale=args.neg_ip_scale,
+        )
+        if not os.path.exists(args.save_path):
+            os.mkdir(args.save_path)
+        ind = len(os.listdir(args.save_path))
+        result.save(os.path.join(args.save_path, f"result_{ind}.png"))
+        args.seed = args.seed + 1
+
+
+if __name__ == "__main__":
+    args = create_argparser().parse_args()
+    main(args)
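The same ControlNet path can be sketched programmatically. Everything below mirrors the calls main.py makes; the ControlNet repo, input image, and prompt are placeholder values, and XFluxPipeline's full signature lives outside this diff.

# Hypothetical programmatic equivalent of:
#   python3 main.py --use_controlnet --control_type canny --image example.png --prompt "..."
from PIL import Image
from src.flux.xflux_pipeline import XFluxPipeline

pipeline = XFluxPipeline("flux-dev", "cuda", False)
# Arguments follow set_controlnet(control_type, local_path, repo_id, name); repo/file are placeholders.
pipeline.set_controlnet("canny", None, "XLabs-AI/flux-controlnet-canny", "controlnet.safetensors")

result = pipeline(
    prompt="a handsome viking man with white hair, cinematic",
    controlnet_image=Image.open("example.png"),
    width=1024, height=1024,
    guidance=4, num_steps=25, seed=123456789,
    true_gs=3.5, neg_prompt="", timestep_to_start_cfg=5,
    image_prompt=None, neg_image_prompt=None,
    ip_scale=1.0, neg_ip_scale=1.0,
)
result.save("results/result_0.png")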
models_licence/LICENSE-FLUX1-dev ADDED
@@ -0,0 +1,42 @@
1
+ FLUX.1 [dev] Non-Commercial License
2
+ Black Forest Labs, Inc. (“we” or “our” or “Company”) is pleased to make available the weights, parameters and inference code for the FLUX.1 [dev] Model (as defined below) freely available for your non-commercial and non-production use as set forth in this FLUX.1 [dev] Non-Commercial License (“License”). The “FLUX.1 [dev] Model” means the FLUX.1 [dev] text-to-image AI model and its elements which includes algorithms, software, checkpoints, parameters, source code (inference code, evaluation code, and if applicable, fine-tuning code) and any other materials associated with the FLUX.1 [dev] AI model made available by Company under this License, including if any, the technical documentation, manuals and instructions for the use and operation thereof (collectively, “FLUX.1 [dev] Model”).
3
+ By downloading, accessing, use, Distributing (as defined below), or creating a Derivative (as defined below) of the FLUX.1 [dev] Model, you agree to the terms of this License. If you do not agree to this License, then you do not have any rights to access, use, Distribute or create a Derivative of the FLUX.1 [dev] Model and you must immediately cease using the FLUX.1 [dev] Model. If you are agreeing to be bound by the terms of this License on behalf of your employer or other entity, you represent and warrant to us that you have full legal authority to bind your employer or such entity to this License. If you do not have the requisite authority, you may not accept the License or access the FLUX.1 [dev] Model on behalf of your employer or other entity.
4
+ 1. Definitions. Capitalized terms used in this License but not defined herein have the following meanings:
5
+ a. “Derivative” means any (i) modified version of the FLUX.1 [dev] Model (including but not limited to any customized or fine-tuned version thereof), (ii) work based on the FLUX.1 [dev] Model, or (iii) any other derivative work thereof. For the avoidance of doubt, Outputs are not considered Derivatives under this License.
6
+ b. “Distribution” or “Distribute” or “Distributing” means providing or making available, by any means, a copy of the FLUX.1 [dev] Models and/or the Derivatives as the case may be.
7
+ c. “Non-Commercial Purpose” means any of the following uses, but only so far as you do not receive any direct or indirect payment arising from the use of the model or its output: (i) personal use for research, experiment, and testing for the benefit of public knowledge, personal study, private entertainment, hobby projects, or otherwise not directly or indirectly connected to any commercial activities, business operations, or employment responsibilities; (ii) use by commercial or for-profit entities for testing, evaluation, or non-commercial research and development in a non-production environment, (iii) use by any charitable organization for charitable purposes, or for testing or evaluation. For clarity, use for revenue-generating activity or direct interactions with or impacts on end users, or use to train, fine tune or distill other models for commercial use is not a Non-Commercial purpose.
8
+ d. “Outputs” means any content generated by the operation of the FLUX.1 [dev] Models or the Derivatives from a prompt (i.e., text instructions) provided by users. For the avoidance of doubt, Outputs do not include any components of a FLUX.1 [dev] Models, such as any fine-tuned versions of the FLUX.1 [dev] Models, the weights, or parameters.
9
+ e. “you” or “your” means the individual or entity entering into this License with Company.
10
+ 2. License Grant.
11
+ a. License. Subject to your compliance with this License, Company grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license to access, use, create Derivatives of, and Distribute the FLUX.1 [dev] Models solely for your Non-Commercial Purposes. The foregoing license is personal to you, and you may not assign or sublicense this License or any other rights or obligations under this License without Company’s prior written consent; any such assignment or sublicense will be void and will automatically and immediately terminate this License. Any restrictions set forth herein in regarding the FLUX.1 [dev] Model also applies to any Derivative you create or that are created on your behalf.
12
+ b. Non-Commercial Use Only. You may only access, use, Distribute, or creative Derivatives of or the FLUX.1 [dev] Model or Derivatives for Non-Commercial Purposes. If You want to use a FLUX.1 [dev] Model a Derivative for any purpose that is not expressly authorized under this License, such as for a commercial activity, you must request a license from Company, which Company may grant to you in Company’s sole discretion and which additional use may be subject to a fee, royalty or other revenue share. Please contact Company at the following e-mail address if you want to discuss such a license: info@blackforestlabs.ai.
13
+ c. Reserved Rights. The grant of rights expressly set forth in this License are the complete grant of rights to you in the FLUX.1 [dev] Model, and no other licenses are granted, whether by waiver, estoppel, implication, equity or otherwise. Company and its licensors reserve all rights not expressly granted by this License.
14
+ d. Outputs. We claim no ownership rights in and to the Outputs. You are solely responsible for the Outputs you generate and their subsequent uses in accordance with this License. You may use Output for any purpose (including for commercial purposes), except as expressly prohibited herein. You may not use the Output to train, fine-tune or distill a model that is competitive with the FLUX.1 [dev] Model.
15
+ 3. Distribution. Subject to this License, you may Distribute copies of the FLUX.1 [dev] Model and/or Derivatives made by you, under the following conditions:
16
+ a. you must make available a copy of this License to third-party recipients of the FLUX.1 [dev] Models and/or Derivatives you Distribute, and specify that any rights to use the FLUX.1 [dev] Models and/or Derivatives shall be directly granted by Company to said third-party recipients pursuant to this License;
17
+ b. you must make prominently display the following notice alongside the Distribution of the FLUX.1 [dev] Model or Derivative (such as via a “Notice” text file distributed as part of such FLUX.1 [dev] Model or Derivative) (the “Attribution Notice”):
18
+ “The FLUX.1 [dev] Model is licensed by Black Forest Labs. Inc. under the FLUX.1 [dev] Non-Commercial License. Copyright Black Forest Labs. Inc.
19
+ IN NO EVENT SHALL BLACK FOREST LABS, INC. BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH USE OF THIS MODEL.”
20
+ c. in the case of Distribution of Derivatives made by you, you must also include in the Attribution Notice a statement that you have modified the applicable FLUX.1 [dev] Model; and
21
+ d. in the case of Distribution of Derivatives made by you, any terms and conditions you impose on any third-party recipients relating to Derivatives made by or for you shall neither limit such third-party recipients’ use of the FLUX.1 [dev] Model or any Derivatives made by or for Company in accordance with this License nor conflict with any of its terms and conditions.
22
+ e. In the case of Distribution of Derivatives made by you, you must not misrepresent or imply, through any means, that the Derivatives made by or for you and/or any modified version of the FLUX.1 [dev] Model you Distribute under your name and responsibility is an official product of the Company or has been endorsed, approved or validated by the Company, unless you are authorized by Company to do so in writing.
23
+ 4. Restrictions. You will not, and will not permit, assist or cause any third party to
24
+ a. use, modify, copy, reproduce, create Derivatives of, or Distribute the FLUX.1 [dev] Model (or any Derivative thereof, or any data produced by the FLUX.1 [dev] Model), in whole or in part, for (i) any commercial or production purposes, (ii) military purposes, (iii) purposes of surveillance, including any research or development relating to surveillance, (iv) biometric processing, (v) in any manner that infringes, misappropriates, or otherwise violates any third-party rights, or (vi) in any manner that violates any applicable law and violating any privacy or security laws, rules, regulations, directives, or governmental requirements (including the General Data Privacy Regulation (Regulation (EU) 2016/679), the California Consumer Privacy Act, and any and all laws governing the processing of biometric information), as well as all amendments and successor laws to any of the foregoing;
25
+ b. alter or remove copyright and other proprietary notices which appear on or in any portion of the FLUX.1 [dev] Model;
26
+ c. utilize any equipment, device, software, or other means to circumvent or remove any security or protection used by Company in connection with the FLUX.1 [dev] Model, or to circumvent or remove any usage restrictions, or to enable functionality disabled by FLUX.1 [dev] Model; or
27
+ d. offer or impose any terms on the FLUX.1 [dev] Model that alter, restrict, or are inconsistent with the terms of this License.
28
+ e. violate any applicable U.S. and non-U.S. export control and trade sanctions laws (“Export Laws”) in connection with your use or Distribution of any FLUX.1 [dev] Model;
29
+ f. directly or indirectly Distribute, export, or otherwise transfer FLUX.1 [dev] Model (a) to any individual, entity, or country prohibited by Export Laws; (b) to anyone on U.S. or non-U.S. government restricted parties lists; or (c) for any purpose prohibited by Export Laws, including nuclear, chemical or biological weapons, or missile technology applications; 3) use or download FLUX.1 [dev] Model if you or they are (a) located in a comprehensively sanctioned jurisdiction, (b) currently listed on any U.S. or non-U.S. restricted parties list, or (c) for any purpose prohibited by Export Laws; and (4) will not disguise your location through IP proxying or other methods.
30
+ 5. DISCLAIMERS. THE FLUX.1 [dev] MODEL IS PROVIDED “AS IS” AND “WITH ALL FAULTS” WITH NO WARRANTY OF ANY KIND, EXPRESS OR IMPLIED. COMPANY EXPRESSLY DISCLAIMS ALL REPRESENTATIONS AND WARRANTIES, EXPRESS OR IMPLIED, WHETHER BY STATUTE, CUSTOM, USAGE OR OTHERWISE AS TO ANY MATTERS RELATED TO THE FLUX.1 [dev] MODEL, INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, SATISFACTORY QUALITY, OR NON-INFRINGEMENT. COMPANY MAKES NO WARRANTIES OR REPRESENTATIONS THAT THE FLUX.1 [dev] MODEL WILL BE ERROR FREE OR FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS, OR PRODUCE ANY PARTICULAR RESULTS.
31
+ 6. LIMITATION OF LIABILITY. TO THE FULLEST EXTENT PERMITTED BY LAW, IN NO EVENT WILL COMPANY BE LIABLE TO YOU OR YOUR EMPLOYEES, AFFILIATES, USERS, OFFICERS OR DIRECTORS (A) UNDER ANY THEORY OF LIABILITY, WHETHER BASED IN CONTRACT, TORT, NEGLIGENCE, STRICT LIABILITY, WARRANTY, OR OTHERWISE UNDER THIS LICENSE, OR (B) FOR ANY INDIRECT, CONSEQUENTIAL, EXEMPLARY, INCIDENTAL, PUNITIVE OR SPECIAL DAMAGES OR LOST PROFITS, EVEN IF COMPANY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. THE FLUX.1 [dev] MODEL, ITS CONSTITUENT COMPONENTS, AND ANY OUTPUT (COLLECTIVELY, “MODEL MATERIALS”) ARE NOT DESIGNED OR INTENDED FOR USE IN ANY APPLICATION OR SITUATION WHERE FAILURE OR FAULT OF THE MODEL MATERIALS COULD REASONABLY BE ANTICIPATED TO LEAD TO SERIOUS INJURY OF ANY PERSON, INCLUDING POTENTIAL DISCRIMINATION OR VIOLATION OF AN INDIVIDUAL’S PRIVACY RIGHTS, OR TO SEVERE PHYSICAL, PROPERTY, OR ENVIRONMENTAL DAMAGE (EACH, A “HIGH-RISK USE”). IF YOU ELECT TO USE ANY OF THE MODEL MATERIALS FOR A HIGH-RISK USE, YOU DO SO AT YOUR OWN RISK. YOU AGREE TO DESIGN AND IMPLEMENT APPROPRIATE DECISION-MAKING AND RISK-MITIGATION PROCEDURES AND POLICIES IN CONNECTION WITH A HIGH-RISK USE SUCH THAT EVEN IF THERE IS A FAILURE OR FAULT IN ANY OF THE MODEL MATERIALS, THE SAFETY OF PERSONS OR PROPERTY AFFECTED BY THE ACTIVITY STAYS AT A LEVEL THAT IS REASONABLE, APPROPRIATE, AND LAWFUL FOR THE FIELD OF THE HIGH-RISK USE.
32
+ 7. INDEMNIFICATION
33
+
34
+ You will indemnify, defend and hold harmless Company and our subsidiaries and affiliates, and each of our respective shareholders, directors, officers, employees, agents, successors, and assigns (collectively, the “Company Parties”) from and against any losses, liabilities, damages, fines, penalties, and expenses (including reasonable attorneys’ fees) incurred by any Company Party in connection with any claim, demand, allegation, lawsuit, proceeding, or investigation (collectively, “Claims”) arising out of or related to (a) your access to or use of the FLUX.1 [dev] Model (as well as any Output, results or data generated from such access or use), including any High-Risk Use (defined below); (b) your violation of this License; or (c) your violation, misappropriation or infringement of any rights of another (including intellectual property or other proprietary rights and privacy rights). You will promptly notify the Company Parties of any such Claims, and cooperate with Company Parties in defending such Claims. You will also grant the Company Parties sole control of the defense or settlement, at Company’s sole option, of any Claims. This indemnity is in addition to, and not in lieu of, any other indemnities or remedies set forth in a written agreement between you and Company or the other Company Parties.
35
+ 8. Termination; Survival.
36
+ a. This License will automatically terminate upon any breach by you of the terms of this License.
37
+ b. We may terminate this License, in whole or in part, at any time upon notice (including electronic) to you.
38
+ c. If You initiate any legal action or proceedings against Company or any other entity (including a cross-claim or counterclaim in a lawsuit), alleging that the FLUX.1 [dev] Model or any Derivative, or any part thereof, infringe upon intellectual property or other rights owned or licensable by you, then any licenses granted to you under this License will immediately terminate as of the date such legal action or claim is filed or initiated.
39
+ d. Upon termination of this License, you must cease all use, access or Distribution of the FLUX.1 [dev] Model and any Derivatives. The following sections survive termination of this License 2(c), 2(d), 4-11.
40
+ 9. Third Party Materials. The FLUX.1 [dev] Model may contain third-party software or other components (including free and open source software) (all of the foregoing, “Third Party Materials”), which are subject to the license terms of the respective third-party licensors. Your dealings or correspondence with third parties and your use of or interaction with any Third Party Materials are solely between you and the third party. Company does not control or endorse, and makes no representations or warranties regarding, any Third Party Materials, and your access to and use of such Third Party Materials are at your own risk.
41
+ 10. Trademarks. You have not been granted any trademark license as part of this License and may not use any name or mark associated with Company without the prior written permission of Company, except to the extent necessary to make the reference required in the Attribution Notice as specified above or as is reasonably necessary in describing the FLUX.1 [dev] Model and its creators.
42
+ 11. General. This License will be governed and construed under the laws of the State of Delaware without regard to conflicts of law provisions. If any provision or part of a provision of this License is unlawful, void or unenforceable, that provision or part of the provision is deemed severed from this License, and will not affect the validity and enforceability of any remaining provisions. The failure of Company to exercise or enforce any right or provision of this License will not operate as a waiver of such right or provision. This License does not confer any third-party beneficiary rights upon any other person or entity. This License, together with the Documentation, contains the entire understanding between you and Company regarding the subject matter of this License, and supersedes all other written or oral agreements and understandings between you and Company regarding such subject matter. No change or addition to any provision of this License will be binding unless it is in writing and signed by an authorized representative of both you and Company.
predict.py ADDED
@@ -0,0 +1,134 @@
+# Prediction interface for Cog ⚙️
+# https://cog.run/python
+
+from cog import BasePredictor, Input, Path
+import os
+import time
+import torch
+import subprocess
+from PIL import Image
+from typing import List
+from image_datasets.canny_dataset import canny_processor, c_crop
+from src.flux.util import load_ae, load_clip, load_t5, load_flow_model, load_controlnet, load_safetensors
+
+OUTPUT_DIR = "controlnet_results"
+MODEL_CACHE = "checkpoints"
+CONTROLNET_URL = "https://huggingface.co/XLabs-AI/flux-controlnet-canny/resolve/main/controlnet.safetensors"
+T5_URL = "https://weights.replicate.delivery/default/black-forest-labs/FLUX.1-dev/t5-cache.tar"
+CLIP_URL = "https://weights.replicate.delivery/default/black-forest-labs/FLUX.1-dev/clip-cache.tar"
+HF_TOKEN = "hf_..."  # Your HuggingFace token
+
+def download_weights(url, dest):
+    start = time.time()
+    print("downloading url: ", url)
+    print("downloading to: ", dest)
+    subprocess.check_call(["pget", "-xf", url, dest], close_fds=False)
+    print("downloading took: ", time.time() - start)
+
+def get_models(name: str, device: torch.device, offload: bool, is_schnell: bool):
+    t5 = load_t5(device, max_length=256 if is_schnell else 512)
+    clip = load_clip(device)
+    model = load_flow_model(name, device="cpu" if offload else device)
+    ae = load_ae(name, device="cpu" if offload else device)
+    controlnet = load_controlnet(name, device).to(torch.bfloat16)
+    return model, ae, t5, clip, controlnet
+
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        """Load the model into memory to make running multiple predictions efficient"""
+        t1 = time.time()
+        os.system(f"huggingface-cli login --token {HF_TOKEN}")
+        name = "flux-dev"
+        self.offload = False
+        checkpoint = "controlnet.safetensors"
+
+        print("Checking ControlNet weights")
+        checkpoint = "controlnet.safetensors"
+        if not os.path.exists(checkpoint):
+            os.system(f"wget {CONTROLNET_URL}")
+        print("Checking T5 weights")
+        if not os.path.exists(MODEL_CACHE + "/models--google--t5-v1_1-xxl"):
+            download_weights(T5_URL, MODEL_CACHE)
+        print("Checking CLIP weights")
+        if not os.path.exists(MODEL_CACHE + "/models--openai--clip-vit-large-patch14"):
+            download_weights(CLIP_URL, MODEL_CACHE)
+
+        self.is_schnell = False
+        device = "cuda"
+        self.torch_device = torch.device(device)
+        model, ae, t5, clip, controlnet = get_models(
+            name,
+            device=self.torch_device,
+            offload=self.offload,
+            is_schnell=self.is_schnell,
+        )
+        self.ae = ae
+        self.t5 = t5
+        self.clip = clip
+        self.controlnet = controlnet
+        self.model = model.to(self.torch_device)
+        if '.safetensors' in checkpoint:
+            checkpoint1 = load_safetensors(checkpoint)
+        else:
+            checkpoint1 = torch.load(checkpoint, map_location='cpu')
+
+        controlnet.load_state_dict(checkpoint1, strict=False)
+        t2 = time.time()
+        print(f"Setup time: {t2 - t1}")
+
+    def preprocess_canny_image(self, image_path: str, width: int = 512, height: int = 512):
+        image = Image.open(image_path)
+        image = c_crop(image)
+        image = image.resize((width, height))
+        image = canny_processor(image)
+        return image
+
+    def predict(
+        self,
+        prompt: str = Input(description="Input prompt", default="a handsome viking man with white hair, cinematic, MM full HD"),
+        image: Path = Input(description="Input image", default=None),
+        num_inference_steps: int = Input(description="Number of inference steps", ge=1, le=64, default=28),
+        cfg: float = Input(description="CFG", ge=0, le=10, default=3.5),
+        seed: int = Input(description="Random seed", default=None)
+    ) -> List[Path]:
+        """Run a single prediction on the model"""
+        if seed is None:
+            seed = int.from_bytes(os.urandom(2), "big")
+        print(f"Using seed: {seed}")
+
+        # clean output dir
+        output_dir = "controlnet_results"
+        os.system(f"rm -rf {output_dir}")
+
+        input_image = str(image)
+        img = Image.open(input_image)
+        width, height = img.size
+        # Resize input image if it's too large
+        max_image_size = 1536
+        scale = min(max_image_size / width, max_image_size / height, 1)
+        if scale < 1:
+            width = int(width * scale)
+            height = int(height * scale)
+            print(f"Scaling image down to {width}x{height}")
+            img = img.resize((width, height), resample=Image.Resampling.LANCZOS)
+            input_image = "/tmp/resized_image.png"
+            img.save(input_image)
+
+        subprocess.check_call(
+            ["python3", "main.py",
+             "--local_path", "controlnet.safetensors",
+             "--image", input_image,
+             "--use_controlnet",
+             "--control_type", "canny",
+             "--prompt", prompt,
+             "--width", str(width),
+             "--height", str(height),
+             "--num_steps", str(num_inference_steps),
+             "--guidance", str(cfg),
+             "--seed", str(seed)
+             ], close_fds=False)
+
+        # Find the first file that begins with "controlnet_result_"
+        for file in os.listdir(output_dir):
+            if file.startswith("controlnet_result_"):
+                return [Path(os.path.join(output_dir, file))]
src/flux/__init__.py ADDED
@@ -0,0 +1,11 @@
+try:
+    from ._version import version as __version__  # type: ignore
+    from ._version import version_tuple
+except ImportError:
+    __version__ = "unknown (no version information available)"
+    version_tuple = (0, 0, "unknown", "noinfo")
+
+from pathlib import Path
+
+PACKAGE = __package__.replace("_", "-")
+PACKAGE_ROOT = Path(__file__).parent
src/flux/__main__.py ADDED
@@ -0,0 +1,4 @@
+from .cli import app
+
+if __name__ == "__main__":
+    app()
src/flux/annotator/canny/__init__.py ADDED
@@ -0,0 +1,6 @@
+import cv2
+
+
+class CannyDetector:
+    def __call__(self, img, low_threshold, high_threshold):
+        return cv2.Canny(img, low_threshold, high_threshold)
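A minimal usage sketch; the image path is a placeholder, and the 100/200 thresholds are the same defaults used by canny_processor in the training dataset above.

# Hypothetical usage of the Canny annotator; "example.png" is a placeholder.
import numpy as np
from PIL import Image
from src.flux.annotator.canny import CannyDetector

detector = CannyDetector()
img = np.array(Image.open("example.png").convert("RGB"))      # uint8 HxWx3 array
edges = detector(img, low_threshold=100, high_threshold=200)  # uint8 HxW edge map
Image.fromarray(edges).save("example_canny.png")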
src/flux/annotator/ckpts/ckpts.txt ADDED
@@ -0,0 +1 @@
+Weights here.
src/flux/annotator/dwpose/__init__.py ADDED
@@ -0,0 +1,68 @@
+# Openpose
+# Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
+# 2nd Edited by https://github.com/Hzzone/pytorch-openpose
+# 3rd Edited by ControlNet
+# 4th Edited by ControlNet (added face and correct hands)
+
+import os
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+
+import torch
+import numpy as np
+from . import util
+from .wholebody import Wholebody
+
+def draw_pose(pose, H, W):
+    bodies = pose['bodies']
+    faces = pose['faces']
+    hands = pose['hands']
+    candidate = bodies['candidate']
+    subset = bodies['subset']
+    canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)
+
+    canvas = util.draw_bodypose(canvas, candidate, subset)
+
+    canvas = util.draw_handpose(canvas, hands)
+
+    canvas = util.draw_facepose(canvas, faces)
+
+    return canvas
+
+
+class DWposeDetector:
+    def __init__(self, device):
+
+        self.pose_estimation = Wholebody(device)
+
+    def __call__(self, oriImg):
+        oriImg = oriImg.copy()
+        H, W, C = oriImg.shape
+        with torch.no_grad():
+            candidate, subset = self.pose_estimation(oriImg)
+            nums, keys, locs = candidate.shape
+            candidate[..., 0] /= float(W)
+            candidate[..., 1] /= float(H)
+            body = candidate[:, :18].copy()
+            body = body.reshape(nums * 18, locs)
+            score = subset[:, :18]
+            for i in range(len(score)):
+                for j in range(len(score[i])):
+                    if score[i][j] > 0.3:
+                        score[i][j] = int(18 * i + j)
+                    else:
+                        score[i][j] = -1
+
+            un_visible = subset < 0.3
+            candidate[un_visible] = -1
+
+            foot = candidate[:, 18:24]
+
+            faces = candidate[:, 24:92]
+
+            hands = candidate[:, 92:113]
+            hands = np.vstack([hands, candidate[:, 113:]])
+
+            bodies = dict(candidate=body, subset=score)
+            pose = dict(bodies=bodies, hands=hands, faces=faces)
+
+            return draw_pose(pose, H, W)
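A usage sketch under the assumption that the ONNX weights wired up in wholebody.py are already in place; the image path is a placeholder.

# Hypothetical usage of the DWpose annotator; "person.png" is a placeholder.
import numpy as np
from PIL import Image
from src.flux.annotator.dwpose import DWposeDetector

detector = DWposeDetector(device="cuda")
img = np.array(Image.open("person.png").convert("RGB"))  # HxWx3 uint8 input
pose_map = detector(img)                                 # HxWx3 uint8 rendered skeleton
Image.fromarray(pose_map).save("person_pose.png")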
src/flux/annotator/dwpose/onnxdet.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+
4
+ import onnxruntime
5
+
6
+ def nms(boxes, scores, nms_thr):
7
+ """Single class NMS implemented in Numpy."""
8
+ x1 = boxes[:, 0]
9
+ y1 = boxes[:, 1]
10
+ x2 = boxes[:, 2]
11
+ y2 = boxes[:, 3]
12
+
13
+ areas = (x2 - x1 + 1) * (y2 - y1 + 1)
14
+ order = scores.argsort()[::-1]
15
+
16
+ keep = []
17
+ while order.size > 0:
18
+ i = order[0]
19
+ keep.append(i)
20
+ xx1 = np.maximum(x1[i], x1[order[1:]])
21
+ yy1 = np.maximum(y1[i], y1[order[1:]])
22
+ xx2 = np.minimum(x2[i], x2[order[1:]])
23
+ yy2 = np.minimum(y2[i], y2[order[1:]])
24
+
25
+ w = np.maximum(0.0, xx2 - xx1 + 1)
26
+ h = np.maximum(0.0, yy2 - yy1 + 1)
27
+ inter = w * h
28
+ ovr = inter / (areas[i] + areas[order[1:]] - inter)
29
+
30
+ inds = np.where(ovr <= nms_thr)[0]
31
+ order = order[inds + 1]
32
+
33
+ return keep
34
+
35
+ def multiclass_nms(boxes, scores, nms_thr, score_thr):
36
+ """Multiclass NMS implemented in Numpy. Class-aware version."""
37
+ final_dets = []
38
+ num_classes = scores.shape[1]
39
+ for cls_ind in range(num_classes):
40
+ cls_scores = scores[:, cls_ind]
41
+ valid_score_mask = cls_scores > score_thr
42
+ if valid_score_mask.sum() == 0:
43
+ continue
44
+ else:
45
+ valid_scores = cls_scores[valid_score_mask]
46
+ valid_boxes = boxes[valid_score_mask]
47
+ keep = nms(valid_boxes, valid_scores, nms_thr)
48
+ if len(keep) > 0:
49
+ cls_inds = np.ones((len(keep), 1)) * cls_ind
50
+ dets = np.concatenate(
51
+ [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1
52
+ )
53
+ final_dets.append(dets)
54
+ if len(final_dets) == 0:
55
+ return None
56
+ return np.concatenate(final_dets, 0)
57
+
58
+ def demo_postprocess(outputs, img_size, p6=False):
59
+ grids = []
60
+ expanded_strides = []
61
+ strides = [8, 16, 32] if not p6 else [8, 16, 32, 64]
62
+
63
+ hsizes = [img_size[0] // stride for stride in strides]
64
+ wsizes = [img_size[1] // stride for stride in strides]
65
+
66
+ for hsize, wsize, stride in zip(hsizes, wsizes, strides):
67
+ xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
68
+ grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
69
+ grids.append(grid)
70
+ shape = grid.shape[:2]
71
+ expanded_strides.append(np.full((*shape, 1), stride))
72
+
73
+ grids = np.concatenate(grids, 1)
74
+ expanded_strides = np.concatenate(expanded_strides, 1)
75
+ outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
76
+ outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
77
+
78
+ return outputs
79
+
80
+ def preprocess(img, input_size, swap=(2, 0, 1)):
81
+ if len(img.shape) == 3:
82
+ padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
83
+ else:
84
+ padded_img = np.ones(input_size, dtype=np.uint8) * 114
85
+
86
+ r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
87
+ resized_img = cv2.resize(
88
+ img,
89
+ (int(img.shape[1] * r), int(img.shape[0] * r)),
90
+ interpolation=cv2.INTER_LINEAR,
91
+ ).astype(np.uint8)
92
+ padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
93
+
94
+ padded_img = padded_img.transpose(swap)
95
+ padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
96
+ return padded_img, r
97
+
98
+ def inference_detector(session, oriImg):
99
+ input_shape = (640,640)
100
+ img, ratio = preprocess(oriImg, input_shape)
101
+
102
+ ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]}
103
+ output = session.run(None, ort_inputs)
104
+ predictions = demo_postprocess(output[0], input_shape)[0]
105
+
106
+ boxes = predictions[:, :4]
107
+ scores = predictions[:, 4:5] * predictions[:, 5:]
108
+
109
+ boxes_xyxy = np.ones_like(boxes)
110
+ boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2.
111
+ boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2.
112
+ boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2.
113
+ boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2.
114
+ boxes_xyxy /= ratio
115
+ dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
116
+ if dets is not None:
117
+ final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5]
118
+ isscore = final_scores>0.3
119
+ iscat = final_cls_inds == 0
120
+ isbbox = [ i and j for (i, j) in zip(isscore, iscat)]
121
+ final_boxes = final_boxes[isbbox]
122
+ else:
123
+ final_boxes = np.array([])
124
+
125
+ return final_boxes
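The file above wraps a YOLOX person detector driven purely through onnxruntime. A hedged sketch of calling it directly (not part of this commit's diff), assuming the yolox_l.onnx weights are already available locally:

import cv2
import onnxruntime as ort
from src.flux.annotator.dwpose.onnxdet import inference_detector  # assumes src/ is on PYTHONPATH

session = ort.InferenceSession("yolox_l.onnx", providers=["CPUExecutionProvider"])
image = cv2.imread("people.jpg")                   # hypothetical input file
person_boxes = inference_detector(session, image)  # (N, 4) xyxy boxes for the person class, score > 0.3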
src/flux/annotator/dwpose/onnxpose.py ADDED
@@ -0,0 +1,360 @@
1
+ from typing import List, Tuple
2
+
3
+ import cv2
4
+ import numpy as np
5
+ import onnxruntime as ort
6
+
7
+ def preprocess(
8
+ img: np.ndarray, out_bbox, input_size: Tuple[int, int] = (192, 256)
9
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
10
+ """Do preprocessing for RTMPose model inference.
11
+
12
+ Args:
13
+ img (np.ndarray): Input image in shape.
14
+ input_size (tuple): Input image size in shape (w, h).
15
+
16
+ Returns:
17
+ tuple:
18
+ - resized_img (np.ndarray): Preprocessed image.
19
+ - center (np.ndarray): Center of image.
20
+ - scale (np.ndarray): Scale of image.
21
+ """
22
+ # get shape of image
23
+ img_shape = img.shape[:2]
24
+ out_img, out_center, out_scale = [], [], []
25
+ if len(out_bbox) == 0:
26
+ out_bbox = [[0, 0, img_shape[1], img_shape[0]]]
27
+ for i in range(len(out_bbox)):
28
+ x0 = out_bbox[i][0]
29
+ y0 = out_bbox[i][1]
30
+ x1 = out_bbox[i][2]
31
+ y1 = out_bbox[i][3]
32
+ bbox = np.array([x0, y0, x1, y1])
33
+
34
+ # get center and scale
35
+ center, scale = bbox_xyxy2cs(bbox, padding=1.25)
36
+
37
+ # do affine transformation
38
+ resized_img, scale = top_down_affine(input_size, scale, center, img)
39
+
40
+ # normalize image
41
+ mean = np.array([123.675, 116.28, 103.53])
42
+ std = np.array([58.395, 57.12, 57.375])
43
+ resized_img = (resized_img - mean) / std
44
+
45
+ out_img.append(resized_img)
46
+ out_center.append(center)
47
+ out_scale.append(scale)
48
+
49
+ return out_img, out_center, out_scale
50
+
51
+
52
+ def inference(sess: ort.InferenceSession, img: np.ndarray) -> np.ndarray:
53
+ """Inference RTMPose model.
54
+
55
+ Args:
56
+ sess (ort.InferenceSession): ONNXRuntime session.
57
+ img (np.ndarray): Input image in shape.
58
+
59
+ Returns:
60
+ outputs (np.ndarray): Output of RTMPose model.
61
+ """
62
+ all_out = []
63
+ # build input
64
+ for i in range(len(img)):
65
+ input = [img[i].transpose(2, 0, 1)]
66
+
67
+ # build output
68
+ sess_input = {sess.get_inputs()[0].name: input}
69
+ sess_output = []
70
+ for out in sess.get_outputs():
71
+ sess_output.append(out.name)
72
+
73
+ # run model
74
+ outputs = sess.run(sess_output, sess_input)
75
+ all_out.append(outputs)
76
+
77
+ return all_out
78
+
79
+
80
+ def postprocess(outputs: List[np.ndarray],
81
+ model_input_size: Tuple[int, int],
82
+ center: Tuple[int, int],
83
+ scale: Tuple[int, int],
84
+ simcc_split_ratio: float = 2.0
85
+ ) -> Tuple[np.ndarray, np.ndarray]:
86
+ """Postprocess for RTMPose model output.
87
+
88
+ Args:
89
+ outputs (np.ndarray): Output of RTMPose model.
90
+ model_input_size (tuple): RTMPose model Input image size.
91
+ center (tuple): Center of bbox in shape (x, y).
92
+ scale (tuple): Scale of bbox in shape (w, h).
93
+ simcc_split_ratio (float): Split ratio of simcc.
94
+
95
+ Returns:
96
+ tuple:
97
+ - keypoints (np.ndarray): Rescaled keypoints.
98
+ - scores (np.ndarray): Model predict scores.
99
+ """
100
+ all_key = []
101
+ all_score = []
102
+ for i in range(len(outputs)):
103
+ # use simcc to decode
104
+ simcc_x, simcc_y = outputs[i]
105
+ keypoints, scores = decode(simcc_x, simcc_y, simcc_split_ratio)
106
+
107
+ # rescale keypoints
108
+ keypoints = keypoints / model_input_size * scale[i] + center[i] - scale[i] / 2
109
+ all_key.append(keypoints[0])
110
+ all_score.append(scores[0])
111
+
112
+ return np.array(all_key), np.array(all_score)
113
+
114
+
115
+ def bbox_xyxy2cs(bbox: np.ndarray,
116
+ padding: float = 1.) -> Tuple[np.ndarray, np.ndarray]:
117
+ """Transform the bbox format from (x1,y1,x2,y2) into (center, scale)
118
+
119
+ Args:
120
+ bbox (ndarray): Bounding box(es) in shape (4,) or (n, 4), formatted
121
+ as (left, top, right, bottom)
122
+ padding (float): BBox padding factor that will be multiplied to scale.
123
+ Default: 1.0
124
+
125
+ Returns:
126
+ tuple: A tuple containing center and scale.
127
+ - np.ndarray[float32]: Center (x, y) of the bbox in shape (2,) or
128
+ (n, 2)
129
+ - np.ndarray[float32]: Scale (w, h) of the bbox in shape (2,) or
130
+ (n, 2)
131
+ """
132
+ # convert single bbox from (4, ) to (1, 4)
133
+ dim = bbox.ndim
134
+ if dim == 1:
135
+ bbox = bbox[None, :]
136
+
137
+ # get bbox center and scale
138
+ x1, y1, x2, y2 = np.hsplit(bbox, [1, 2, 3])
139
+ center = np.hstack([x1 + x2, y1 + y2]) * 0.5
140
+ scale = np.hstack([x2 - x1, y2 - y1]) * padding
141
+
142
+ if dim == 1:
143
+ center = center[0]
144
+ scale = scale[0]
145
+
146
+ return center, scale
147
+
148
+
149
+ def _fix_aspect_ratio(bbox_scale: np.ndarray,
150
+ aspect_ratio: float) -> np.ndarray:
151
+ """Extend the scale to match the given aspect ratio.
152
+
153
+ Args:
154
+ scale (np.ndarray): The image scale (w, h) in shape (2, )
155
+ aspect_ratio (float): The ratio of ``w/h``
156
+
157
+ Returns:
158
+ np.ndarray: The reshaped image scale in (2, )
159
+ """
160
+ w, h = np.hsplit(bbox_scale, [1])
161
+ bbox_scale = np.where(w > h * aspect_ratio,
162
+ np.hstack([w, w / aspect_ratio]),
163
+ np.hstack([h * aspect_ratio, h]))
164
+ return bbox_scale
165
+
166
+
167
+ def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray:
168
+ """Rotate a point by an angle.
169
+
170
+ Args:
171
+ pt (np.ndarray): 2D point coordinates (x, y) in shape (2, )
172
+ angle_rad (float): rotation angle in radian
173
+
174
+ Returns:
175
+ np.ndarray: Rotated point in shape (2, )
176
+ """
177
+ sn, cs = np.sin(angle_rad), np.cos(angle_rad)
178
+ rot_mat = np.array([[cs, -sn], [sn, cs]])
179
+ return rot_mat @ pt
180
+
181
+
182
+ def _get_3rd_point(a: np.ndarray, b: np.ndarray) -> np.ndarray:
183
+ """To calculate the affine matrix, three pairs of points are required. This
184
+ function is used to get the 3rd point, given 2D points a & b.
185
+
186
+ The 3rd point is defined by rotating vector `a - b` by 90 degrees
187
+ anticlockwise, using b as the rotation center.
188
+
189
+ Args:
190
+ a (np.ndarray): The 1st point (x,y) in shape (2, )
191
+ b (np.ndarray): The 2nd point (x,y) in shape (2, )
192
+
193
+ Returns:
194
+ np.ndarray: The 3rd point.
195
+ """
196
+ direction = a - b
197
+ c = b + np.r_[-direction[1], direction[0]]
198
+ return c
199
+
200
+
201
+ def get_warp_matrix(center: np.ndarray,
202
+ scale: np.ndarray,
203
+ rot: float,
204
+ output_size: Tuple[int, int],
205
+ shift: Tuple[float, float] = (0., 0.),
206
+ inv: bool = False) -> np.ndarray:
207
+ """Calculate the affine transformation matrix that can warp the bbox area
208
+ in the input image to the output size.
209
+
210
+ Args:
211
+ center (np.ndarray[2, ]): Center of the bounding box (x, y).
212
+ scale (np.ndarray[2, ]): Scale of the bounding box
213
+ wrt [width, height].
214
+ rot (float): Rotation angle (degree).
215
+ output_size (np.ndarray[2, ] | list(2,)): Size of the
216
+ destination heatmaps.
217
+ shift (0-100%): Shift translation ratio wrt the width/height.
218
+ Default (0., 0.).
219
+ inv (bool): Option to inverse the affine transform direction.
220
+ (inv=False: src->dst or inv=True: dst->src)
221
+
222
+ Returns:
223
+ np.ndarray: A 2x3 transformation matrix
224
+ """
225
+ shift = np.array(shift)
226
+ src_w = scale[0]
227
+ dst_w = output_size[0]
228
+ dst_h = output_size[1]
229
+
230
+ # compute transformation matrix
231
+ rot_rad = np.deg2rad(rot)
232
+ src_dir = _rotate_point(np.array([0., src_w * -0.5]), rot_rad)
233
+ dst_dir = np.array([0., dst_w * -0.5])
234
+
235
+ # get four corners of the src rectangle in the original image
236
+ src = np.zeros((3, 2), dtype=np.float32)
237
+ src[0, :] = center + scale * shift
238
+ src[1, :] = center + src_dir + scale * shift
239
+ src[2, :] = _get_3rd_point(src[0, :], src[1, :])
240
+
241
+ # get four corners of the dst rectangle in the input image
242
+ dst = np.zeros((3, 2), dtype=np.float32)
243
+ dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
244
+ dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
245
+ dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
246
+
247
+ if inv:
248
+ warp_mat = cv2.getAffineTransform(np.float32(dst), np.float32(src))
249
+ else:
250
+ warp_mat = cv2.getAffineTransform(np.float32(src), np.float32(dst))
251
+
252
+ return warp_mat
253
+
254
+
255
+ def top_down_affine(input_size: dict, bbox_scale: dict, bbox_center: dict,
256
+ img: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
257
+ """Get the bbox image as the model input by affine transform.
258
+
259
+ Args:
260
+ input_size (dict): The input size of the model.
261
+ bbox_scale (dict): The bbox scale of the img.
262
+ bbox_center (dict): The bbox center of the img.
263
+ img (np.ndarray): The original image.
264
+
265
+ Returns:
266
+ tuple: A tuple containing center and scale.
267
+ - np.ndarray[float32]: img after affine transform.
268
+ - np.ndarray[float32]: bbox scale after affine transform.
269
+ """
270
+ w, h = input_size
271
+ warp_size = (int(w), int(h))
272
+
273
+ # reshape bbox to fixed aspect ratio
274
+ bbox_scale = _fix_aspect_ratio(bbox_scale, aspect_ratio=w / h)
275
+
276
+ # get the affine matrix
277
+ center = bbox_center
278
+ scale = bbox_scale
279
+ rot = 0
280
+ warp_mat = get_warp_matrix(center, scale, rot, output_size=(w, h))
281
+
282
+ # do affine transform
283
+ img = cv2.warpAffine(img, warp_mat, warp_size, flags=cv2.INTER_LINEAR)
284
+
285
+ return img, bbox_scale
286
+
287
+
288
+ def get_simcc_maximum(simcc_x: np.ndarray,
289
+ simcc_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
290
+ """Get maximum response location and value from simcc representations.
291
+
292
+ Note:
293
+ instance number: N
294
+ num_keypoints: K
295
+ heatmap height: H
296
+ heatmap width: W
297
+
298
+ Args:
299
+ simcc_x (np.ndarray): x-axis SimCC in shape (K, Wx) or (N, K, Wx)
300
+ simcc_y (np.ndarray): y-axis SimCC in shape (K, Wy) or (N, K, Wy)
301
+
302
+ Returns:
303
+ tuple:
304
+ - locs (np.ndarray): locations of maximum heatmap responses in shape
305
+ (K, 2) or (N, K, 2)
306
+ - vals (np.ndarray): values of maximum heatmap responses in shape
307
+ (K,) or (N, K)
308
+ """
309
+ N, K, Wx = simcc_x.shape
310
+ simcc_x = simcc_x.reshape(N * K, -1)
311
+ simcc_y = simcc_y.reshape(N * K, -1)
312
+
313
+ # get maximum value locations
314
+ x_locs = np.argmax(simcc_x, axis=1)
315
+ y_locs = np.argmax(simcc_y, axis=1)
316
+ locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
317
+ max_val_x = np.amax(simcc_x, axis=1)
318
+ max_val_y = np.amax(simcc_y, axis=1)
319
+
320
+ # get maximum value across x and y axis
321
+ mask = max_val_x > max_val_y
322
+ max_val_x[mask] = max_val_y[mask]
323
+ vals = max_val_x
324
+ locs[vals <= 0.] = -1
325
+
326
+ # reshape
327
+ locs = locs.reshape(N, K, 2)
328
+ vals = vals.reshape(N, K)
329
+
330
+ return locs, vals
331
+
332
+
333
+ def decode(simcc_x: np.ndarray, simcc_y: np.ndarray,
334
+ simcc_split_ratio) -> Tuple[np.ndarray, np.ndarray]:
335
+ """Modulate simcc distribution with Gaussian.
336
+
337
+ Args:
338
+ simcc_x (np.ndarray[K, Wx]): model predicted simcc in x.
339
+ simcc_y (np.ndarray[K, Wy]): model predicted simcc in y.
340
+ simcc_split_ratio (int): The split ratio of simcc.
341
+
342
+ Returns:
343
+ tuple: A tuple containing center and scale.
344
+ - np.ndarray[float32]: keypoints in shape (K, 2) or (n, K, 2)
345
+ - np.ndarray[float32]: scores in shape (K,) or (n, K)
346
+ """
347
+ keypoints, scores = get_simcc_maximum(simcc_x, simcc_y)
348
+ keypoints /= simcc_split_ratio
349
+
350
+ return keypoints, scores
351
+
352
+
353
+ def inference_pose(session, out_bbox, oriImg):
354
+ h, w = session.get_inputs()[0].shape[2:]
355
+ model_input_size = (w, h)
356
+ resized_img, center, scale = preprocess(oriImg, out_bbox, model_input_size)
357
+ outputs = inference(session, resized_img)
358
+ keypoints, scores = postprocess(outputs, model_input_size, center, scale)
359
+
360
+ return keypoints, scores
src/flux/annotator/dwpose/util.py ADDED
@@ -0,0 +1,297 @@
1
+ import math
2
+ import numpy as np
3
+ import matplotlib
4
+ import cv2
5
+
6
+
7
+ eps = 0.01
8
+
9
+
10
+ def smart_resize(x, s):
11
+ Ht, Wt = s
12
+ if x.ndim == 2:
13
+ Ho, Wo = x.shape
14
+ Co = 1
15
+ else:
16
+ Ho, Wo, Co = x.shape
17
+ if Co == 3 or Co == 1:
18
+ k = float(Ht + Wt) / float(Ho + Wo)
19
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
20
+ else:
21
+ return np.stack([smart_resize(x[:, :, i], s) for i in range(Co)], axis=2)
22
+
23
+
24
+ def smart_resize_k(x, fx, fy):
25
+ if x.ndim == 2:
26
+ Ho, Wo = x.shape
27
+ Co = 1
28
+ else:
29
+ Ho, Wo, Co = x.shape
30
+ Ht, Wt = Ho * fy, Wo * fx
31
+ if Co == 3 or Co == 1:
32
+ k = float(Ht + Wt) / float(Ho + Wo)
33
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
34
+ else:
35
+ return np.stack([smart_resize_k(x[:, :, i], fx, fy) for i in range(Co)], axis=2)
36
+
37
+
38
+ def padRightDownCorner(img, stride, padValue):
39
+ h = img.shape[0]
40
+ w = img.shape[1]
41
+
42
+ pad = 4 * [None]
43
+ pad[0] = 0 # up
44
+ pad[1] = 0 # left
45
+ pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
46
+ pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
47
+
48
+ img_padded = img
49
+ pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1))
50
+ img_padded = np.concatenate((pad_up, img_padded), axis=0)
51
+ pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1))
52
+ img_padded = np.concatenate((pad_left, img_padded), axis=1)
53
+ pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1))
54
+ img_padded = np.concatenate((img_padded, pad_down), axis=0)
55
+ pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1))
56
+ img_padded = np.concatenate((img_padded, pad_right), axis=1)
57
+
58
+ return img_padded, pad
59
+
60
+
61
+ def transfer(model, model_weights):
62
+ transfered_model_weights = {}
63
+ for weights_name in model.state_dict().keys():
64
+ transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
65
+ return transfered_model_weights
66
+
67
+
68
+ def draw_bodypose(canvas, candidate, subset):
69
+ H, W, C = canvas.shape
70
+ candidate = np.array(candidate)
71
+ subset = np.array(subset)
72
+
73
+ stickwidth = 4
74
+
75
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
76
+ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
77
+ [1, 16], [16, 18], [3, 17], [6, 18]]
78
+
79
+ colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
80
+ [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
81
+ [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
82
+
83
+ for i in range(17):
84
+ for n in range(len(subset)):
85
+ index = subset[n][np.array(limbSeq[i]) - 1]
86
+ if -1 in index:
87
+ continue
88
+ Y = candidate[index.astype(int), 0] * float(W)
89
+ X = candidate[index.astype(int), 1] * float(H)
90
+ mX = np.mean(X)
91
+ mY = np.mean(Y)
92
+ length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
93
+ angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
94
+ polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
95
+ cv2.fillConvexPoly(canvas, polygon, colors[i])
96
+
97
+ canvas = (canvas * 0.6).astype(np.uint8)
98
+
99
+ for i in range(18):
100
+ for n in range(len(subset)):
101
+ index = int(subset[n][i])
102
+ if index == -1:
103
+ continue
104
+ x, y = candidate[index][0:2]
105
+ x = int(x * W)
106
+ y = int(y * H)
107
+ cv2.circle(canvas, (int(x), int(y)), 4, colors[i], thickness=-1)
108
+
109
+ return canvas
110
+
111
+
112
+ def draw_handpose(canvas, all_hand_peaks):
113
+ H, W, C = canvas.shape
114
+
115
+ edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
116
+ [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
117
+
118
+ for peaks in all_hand_peaks:
119
+ peaks = np.array(peaks)
120
+
121
+ for ie, e in enumerate(edges):
122
+ x1, y1 = peaks[e[0]]
123
+ x2, y2 = peaks[e[1]]
124
+ x1 = int(x1 * W)
125
+ y1 = int(y1 * H)
126
+ x2 = int(x2 * W)
127
+ y2 = int(y2 * H)
128
+ if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
129
+ cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, thickness=2)
130
+
131
+ for i, keypoint in enumerate(peaks):
132
+ x, y = keypoint
133
+ x = int(x * W)
134
+ y = int(y * H)
135
+ if x > eps and y > eps:
136
+ cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
137
+ return canvas
138
+
139
+
140
+ def draw_facepose(canvas, all_lmks):
141
+ H, W, C = canvas.shape
142
+ for lmks in all_lmks:
143
+ lmks = np.array(lmks)
144
+ for lmk in lmks:
145
+ x, y = lmk
146
+ x = int(x * W)
147
+ y = int(y * H)
148
+ if x > eps and y > eps:
149
+ cv2.circle(canvas, (x, y), 3, (255, 255, 255), thickness=-1)
150
+ return canvas
151
+
152
+
153
+ # detect hand according to body pose keypoints
154
+ # please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
155
+ def handDetect(candidate, subset, oriImg):
156
+ # right hand: wrist 4, elbow 3, shoulder 2
157
+ # left hand: wrist 7, elbow 6, shoulder 5
158
+ ratioWristElbow = 0.33
159
+ detect_result = []
160
+ image_height, image_width = oriImg.shape[0:2]
161
+ for person in subset.astype(int):
162
+ # if any of three not detected
163
+ has_left = np.sum(person[[5, 6, 7]] == -1) == 0
164
+ has_right = np.sum(person[[2, 3, 4]] == -1) == 0
165
+ if not (has_left or has_right):
166
+ continue
167
+ hands = []
168
+ #left hand
169
+ if has_left:
170
+ left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]]
171
+ x1, y1 = candidate[left_shoulder_index][:2]
172
+ x2, y2 = candidate[left_elbow_index][:2]
173
+ x3, y3 = candidate[left_wrist_index][:2]
174
+ hands.append([x1, y1, x2, y2, x3, y3, True])
175
+ # right hand
176
+ if has_right:
177
+ right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]]
178
+ x1, y1 = candidate[right_shoulder_index][:2]
179
+ x2, y2 = candidate[right_elbow_index][:2]
180
+ x3, y3 = candidate[right_wrist_index][:2]
181
+ hands.append([x1, y1, x2, y2, x3, y3, False])
182
+
183
+ for x1, y1, x2, y2, x3, y3, is_left in hands:
184
+ # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
185
+ # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
186
+ # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
187
+ # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
188
+ # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
189
+ # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
190
+ x = x3 + ratioWristElbow * (x3 - x2)
191
+ y = y3 + ratioWristElbow * (y3 - y2)
192
+ distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
193
+ distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
194
+ width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
195
+ # x-y refers to the center --> offset to topLeft point
196
+ # handRectangle.x -= handRectangle.width / 2.f;
197
+ # handRectangle.y -= handRectangle.height / 2.f;
198
+ x -= width / 2
199
+ y -= width / 2 # width = height
200
+ # overflow the image
201
+ if x < 0: x = 0
202
+ if y < 0: y = 0
203
+ width1 = width
204
+ width2 = width
205
+ if x + width > image_width: width1 = image_width - x
206
+ if y + width > image_height: width2 = image_height - y
207
+ width = min(width1, width2)
208
+ # the max hand box value is 20 pixels
209
+ if width >= 20:
210
+ detect_result.append([int(x), int(y), int(width), is_left])
211
+
212
+ '''
213
+ return value: [[x, y, w, True if left hand else False]].
214
+ width=height since the network require squared input.
215
+ x, y is the coordinate of top left
216
+ '''
217
+ return detect_result
218
+
219
+
220
+ # Written by Lvmin
221
+ def faceDetect(candidate, subset, oriImg):
222
+ # left right eye ear 14 15 16 17
223
+ detect_result = []
224
+ image_height, image_width = oriImg.shape[0:2]
225
+ for person in subset.astype(int):
226
+ has_head = person[0] > -1
227
+ if not has_head:
228
+ continue
229
+
230
+ has_left_eye = person[14] > -1
231
+ has_right_eye = person[15] > -1
232
+ has_left_ear = person[16] > -1
233
+ has_right_ear = person[17] > -1
234
+
235
+ if not (has_left_eye or has_right_eye or has_left_ear or has_right_ear):
236
+ continue
237
+
238
+ head, left_eye, right_eye, left_ear, right_ear = person[[0, 14, 15, 16, 17]]
239
+
240
+ width = 0.0
241
+ x0, y0 = candidate[head][:2]
242
+
243
+ if has_left_eye:
244
+ x1, y1 = candidate[left_eye][:2]
245
+ d = max(abs(x0 - x1), abs(y0 - y1))
246
+ width = max(width, d * 3.0)
247
+
248
+ if has_right_eye:
249
+ x1, y1 = candidate[right_eye][:2]
250
+ d = max(abs(x0 - x1), abs(y0 - y1))
251
+ width = max(width, d * 3.0)
252
+
253
+ if has_left_ear:
254
+ x1, y1 = candidate[left_ear][:2]
255
+ d = max(abs(x0 - x1), abs(y0 - y1))
256
+ width = max(width, d * 1.5)
257
+
258
+ if has_right_ear:
259
+ x1, y1 = candidate[right_ear][:2]
260
+ d = max(abs(x0 - x1), abs(y0 - y1))
261
+ width = max(width, d * 1.5)
262
+
263
+ x, y = x0, y0
264
+
265
+ x -= width
266
+ y -= width
267
+
268
+ if x < 0:
269
+ x = 0
270
+
271
+ if y < 0:
272
+ y = 0
273
+
274
+ width1 = width * 2
275
+ width2 = width * 2
276
+
277
+ if x + width > image_width:
278
+ width1 = image_width - x
279
+
280
+ if y + width > image_height:
281
+ width2 = image_height - y
282
+
283
+ width = min(width1, width2)
284
+
285
+ if width >= 20:
286
+ detect_result.append([int(x), int(y), int(width)])
287
+
288
+ return detect_result
289
+
290
+
291
+ # get max index of 2d array
292
+ def npmax(array):
293
+ arrayindex = array.argmax(1)
294
+ arrayvalue = array.max(1)
295
+ i = arrayvalue.argmax()
296
+ j = arrayindex[i]
297
+ return i, j
src/flux/annotator/dwpose/wholebody.py ADDED
@@ -0,0 +1,48 @@
+ import cv2
+ import numpy as np
+
+ import onnxruntime as ort
+ from huggingface_hub import hf_hub_download
+ from .onnxdet import inference_detector
+ from .onnxpose import inference_pose
+
+
+ class Wholebody:
+     def __init__(self, device="cuda:0"):
+         providers = ['CPUExecutionProvider'] if device == 'cpu' else ['CUDAExecutionProvider']
+         onnx_det = hf_hub_download("yzd-v/DWPose", "yolox_l.onnx")
+         onnx_pose = hf_hub_download("yzd-v/DWPose", "dw-ll_ucoco_384.onnx")
+
+         self.session_det = ort.InferenceSession(path_or_bytes=onnx_det, providers=providers)
+         self.session_pose = ort.InferenceSession(path_or_bytes=onnx_pose, providers=providers)
+
+     def __call__(self, oriImg):
+         det_result = inference_detector(self.session_det, oriImg)
+         keypoints, scores = inference_pose(self.session_pose, det_result, oriImg)
+
+         keypoints_info = np.concatenate(
+             (keypoints, scores[..., None]), axis=-1)
+         # compute neck joint
+         neck = np.mean(keypoints_info[:, [5, 6]], axis=1)
+         # neck score when visualizing pred
+         neck[:, 2:4] = np.logical_and(
+             keypoints_info[:, 5, 2:4] > 0.3,
+             keypoints_info[:, 6, 2:4] > 0.3).astype(int)
+         new_keypoints_info = np.insert(
+             keypoints_info, 17, neck, axis=1)
+         mmpose_idx = [
+             17, 6, 8, 10, 7, 9, 12, 14, 16, 13, 15, 2, 1, 4, 3
+         ]
+         openpose_idx = [
+             1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17
+         ]
+         new_keypoints_info[:, openpose_idx] = \
+             new_keypoints_info[:, mmpose_idx]
+         keypoints_info = new_keypoints_info
+
+         keypoints, scores = keypoints_info[
+             ..., :2], keypoints_info[..., 2]
+
+         return keypoints, scores
+
+
src/flux/annotator/hed/__init__.py ADDED
@@ -0,0 +1,95 @@
+ # This is an improved version and model of HED edge detection with Apache License, Version 2.0.
+ # Please use this implementation in your products
+ # This implementation may produce slightly different results from Saining Xie's official implementations,
+ # but it generates smoother edges and is more suitable for ControlNet as well as other image-to-image translations.
+ # Different from official models and other implementations, this is an RGB-input model (rather than BGR)
+ # and in this way it works better for gradio's RGB protocol
+
+ import os
+ import cv2
+ import torch
+ import numpy as np
+
+ from huggingface_hub import hf_hub_download
+ from einops import rearrange
+ from ...annotator.util import annotator_ckpts_path
+
+
+ class DoubleConvBlock(torch.nn.Module):
+     def __init__(self, input_channel, output_channel, layer_number):
+         super().__init__()
+         self.convs = torch.nn.Sequential()
+         self.convs.append(torch.nn.Conv2d(in_channels=input_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1))
+         for i in range(1, layer_number):
+             self.convs.append(torch.nn.Conv2d(in_channels=output_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1))
+         self.projection = torch.nn.Conv2d(in_channels=output_channel, out_channels=1, kernel_size=(1, 1), stride=(1, 1), padding=0)
+
+     def __call__(self, x, down_sampling=False):
+         h = x
+         if down_sampling:
+             h = torch.nn.functional.max_pool2d(h, kernel_size=(2, 2), stride=(2, 2))
+         for conv in self.convs:
+             h = conv(h)
+             h = torch.nn.functional.relu(h)
+         return h, self.projection(h)
+
+
+ class ControlNetHED_Apache2(torch.nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.norm = torch.nn.Parameter(torch.zeros(size=(1, 3, 1, 1)))
+         self.block1 = DoubleConvBlock(input_channel=3, output_channel=64, layer_number=2)
+         self.block2 = DoubleConvBlock(input_channel=64, output_channel=128, layer_number=2)
+         self.block3 = DoubleConvBlock(input_channel=128, output_channel=256, layer_number=3)
+         self.block4 = DoubleConvBlock(input_channel=256, output_channel=512, layer_number=3)
+         self.block5 = DoubleConvBlock(input_channel=512, output_channel=512, layer_number=3)
+
+     def __call__(self, x):
+         h = x - self.norm
+         h, projection1 = self.block1(h)
+         h, projection2 = self.block2(h, down_sampling=True)
+         h, projection3 = self.block3(h, down_sampling=True)
+         h, projection4 = self.block4(h, down_sampling=True)
+         h, projection5 = self.block5(h, down_sampling=True)
+         return projection1, projection2, projection3, projection4, projection5
+
+
+ class HEDdetector:
+     def __init__(self):
+         modelpath = os.path.join(annotator_ckpts_path, "ControlNetHED.pth")
+         if not os.path.exists(modelpath):
+             modelpath = hf_hub_download("lllyasviel/Annotators", "ControlNetHED.pth")
+         self.netNetwork = ControlNetHED_Apache2().float().cuda().eval()
+         self.netNetwork.load_state_dict(torch.load(modelpath))
+
+     def __call__(self, input_image):
+         assert input_image.ndim == 3
+         H, W, C = input_image.shape
+         with torch.no_grad():
+             image_hed = torch.from_numpy(input_image.copy()).float().cuda()
+             image_hed = rearrange(image_hed, 'h w c -> 1 c h w')
+             edges = self.netNetwork(image_hed)
+             edges = [e.detach().cpu().numpy().astype(np.float32)[0, 0] for e in edges]
+             edges = [cv2.resize(e, (W, H), interpolation=cv2.INTER_LINEAR) for e in edges]
+             edges = np.stack(edges, axis=2)
+             edge = 1 / (1 + np.exp(-np.mean(edges, axis=2).astype(np.float64)))
+             edge = (edge * 255.0).clip(0, 255).astype(np.uint8)
+             return edge
+
+
+ def nms(x, t, s):
+     x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)
+
+     f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
+     f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
+     f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
+     f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)
+
+     y = np.zeros_like(x)
+
+     for f in [f1, f2, f3, f4]:
+         np.putmask(y, cv2.dilate(x, kernel=f) == x, x)
+
+     z = np.zeros_like(y, dtype=np.uint8)
+     z[y > t] = 255
+     return z
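A hedged usage sketch for the HED detector above (not part of this commit's diff), assuming a CUDA device and an RGB uint8 input as the header comment requires; nms can optionally thin the soft edge map into a scribble-style map.

import cv2
from src.flux.annotator.hed import HEDdetector, nms  # import path assumes src/ is on PYTHONPATH

hed = HEDdetector()                                               # downloads ControlNetHED.pth if it is not cached
image = cv2.cvtColor(cv2.imread("input.jpg"), cv2.COLOR_BGR2RGB)  # hypothetical file; the model expects RGB
soft_edge = hed(image)                                            # H x W uint8 soft edge map
scribble = nms(soft_edge, 127, 3.0)                               # threshold 127, Gaussian sigma 3.0
cv2.imwrite("hed.png", soft_edge)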
src/flux/annotator/midas/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2019 Intel ISL (Intel Intelligent Systems Lab)
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
src/flux/annotator/midas/__init__.py ADDED
@@ -0,0 +1,42 @@
+ # Midas Depth Estimation
+ # From https://github.com/isl-org/MiDaS
+ # MIT LICENSE
+
+ import cv2
+ import numpy as np
+ import torch
+
+ from einops import rearrange
+ from .api import MiDaSInference
+
+
+ class MidasDetector:
+     def __init__(self):
+         self.model = MiDaSInference(model_type="dpt_hybrid").cuda()
+
+     def __call__(self, input_image, a=np.pi * 2.0, bg_th=0.1):
+         assert input_image.ndim == 3
+         image_depth = input_image
+         with torch.no_grad():
+             image_depth = torch.from_numpy(image_depth).float().cuda()
+             image_depth = image_depth / 127.5 - 1.0
+             image_depth = rearrange(image_depth, 'h w c -> 1 c h w')
+             depth = self.model(image_depth)[0]
+
+             depth_pt = depth.clone()
+             depth_pt -= torch.min(depth_pt)
+             depth_pt /= torch.max(depth_pt)
+             depth_pt = depth_pt.cpu().numpy()
+             depth_image = (depth_pt * 255.0).clip(0, 255).astype(np.uint8)
+
+             depth_np = depth.cpu().numpy()
+             x = cv2.Sobel(depth_np, cv2.CV_32F, 1, 0, ksize=3)
+             y = cv2.Sobel(depth_np, cv2.CV_32F, 0, 1, ksize=3)
+             z = np.ones_like(x) * a
+             x[depth_pt < bg_th] = 0
+             y[depth_pt < bg_th] = 0
+             normal = np.stack([x, y, z], axis=2)
+             normal /= np.sum(normal ** 2.0, axis=2, keepdims=True) ** 0.5
+             normal_image = (normal * 127.5 + 127.5).clip(0, 255).astype(np.uint8)
+
+             return depth_image, normal_image
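A hedged usage sketch for the MiDaS wrapper above (not part of this commit's diff), assuming a CUDA device; the call returns both a quantized depth map and a screen-space normal map derived from it.

import cv2
from src.flux.annotator.midas import MidasDetector  # import path assumes src/ is on PYTHONPATH

midas = MidasDetector()                                           # loads the dpt_hybrid checkpoint
image = cv2.cvtColor(cv2.imread("room.jpg"), cv2.COLOR_BGR2RGB)   # hypothetical input file, uint8 H x W x 3
depth_map, normal_map = midas(image)                              # uint8 H x W depth and H x W x 3 normal maps
cv2.imwrite("depth.png", depth_map)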
src/flux/annotator/midas/api.py ADDED
@@ -0,0 +1,168 @@
1
+ # based on https://github.com/isl-org/MiDaS
2
+
3
+ import cv2
4
+ import os
5
+ import torch
6
+ import torch.nn as nn
7
+ from torchvision.transforms import Compose
8
+
9
+ from huggingface_hub import hf_hub_download
10
+
11
+ from .midas.dpt_depth import DPTDepthModel
12
+ from .midas.midas_net import MidasNet
13
+ from .midas.midas_net_custom import MidasNet_small
14
+ from .midas.transforms import Resize, NormalizeImage, PrepareForNet
15
+ from ...annotator.util import annotator_ckpts_path
16
+
17
+
18
+ ISL_PATHS = {
19
+ "dpt_large": os.path.join(annotator_ckpts_path, "dpt_large-midas-2f21e586.pt"),
20
+ "dpt_hybrid": os.path.join(annotator_ckpts_path, "dpt_hybrid-midas-501f0c75.pt"),
21
+ "midas_v21": "",
22
+ "midas_v21_small": "",
23
+ }
24
+
25
+
26
+ def disabled_train(self, mode=True):
27
+ """Overwrite model.train with this function to make sure train/eval mode
28
+ does not change anymore."""
29
+ return self
30
+
31
+
32
+ def load_midas_transform(model_type):
33
+ # https://github.com/isl-org/MiDaS/blob/master/run.py
34
+ # load transform only
35
+ if model_type == "dpt_large": # DPT-Large
36
+ net_w, net_h = 384, 384
37
+ resize_mode = "minimal"
38
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
39
+
40
+ elif model_type == "dpt_hybrid": # DPT-Hybrid
41
+ net_w, net_h = 384, 384
42
+ resize_mode = "minimal"
43
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
44
+
45
+ elif model_type == "midas_v21":
46
+ net_w, net_h = 384, 384
47
+ resize_mode = "upper_bound"
48
+ normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
49
+
50
+ elif model_type == "midas_v21_small":
51
+ net_w, net_h = 256, 256
52
+ resize_mode = "upper_bound"
53
+ normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
54
+
55
+ else:
56
+ assert False, f"model_type '{model_type}' not implemented, use: --model_type large"
57
+
58
+ transform = Compose(
59
+ [
60
+ Resize(
61
+ net_w,
62
+ net_h,
63
+ resize_target=None,
64
+ keep_aspect_ratio=True,
65
+ ensure_multiple_of=32,
66
+ resize_method=resize_mode,
67
+ image_interpolation_method=cv2.INTER_CUBIC,
68
+ ),
69
+ normalization,
70
+ PrepareForNet(),
71
+ ]
72
+ )
73
+
74
+ return transform
75
+
76
+
77
+ def load_model(model_type):
78
+ # https://github.com/isl-org/MiDaS/blob/master/run.py
79
+ # load network
80
+ model_path = ISL_PATHS[model_type]
81
+ if model_type == "dpt_large": # DPT-Large
82
+ model = DPTDepthModel(
83
+ path=model_path,
84
+ backbone="vitl16_384",
85
+ non_negative=True,
86
+ )
87
+ net_w, net_h = 384, 384
88
+ resize_mode = "minimal"
89
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
90
+
91
+ elif model_type == "dpt_hybrid": # DPT-Hybrid
92
+ if not os.path.exists(model_path):
93
+ model_path = hf_hub_download("lllyasviel/Annotators", "dpt_hybrid-midas-501f0c75.pt")
94
+
95
+ model = DPTDepthModel(
96
+ path=model_path,
97
+ backbone="vitb_rn50_384",
98
+ non_negative=True,
99
+ )
100
+ net_w, net_h = 384, 384
101
+ resize_mode = "minimal"
102
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
103
+
104
+ elif model_type == "midas_v21":
105
+ model = MidasNet(model_path, non_negative=True)
106
+ net_w, net_h = 384, 384
107
+ resize_mode = "upper_bound"
108
+ normalization = NormalizeImage(
109
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
110
+ )
111
+
112
+ elif model_type == "midas_v21_small":
113
+ model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True,
114
+ non_negative=True, blocks={'expand': True})
115
+ net_w, net_h = 256, 256
116
+ resize_mode = "upper_bound"
117
+ normalization = NormalizeImage(
118
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
119
+ )
120
+
121
+ else:
122
+ print(f"model_type '{model_type}' not implemented, use: --model_type large")
123
+ assert False
124
+
125
+ transform = Compose(
126
+ [
127
+ Resize(
128
+ net_w,
129
+ net_h,
130
+ resize_target=None,
131
+ keep_aspect_ratio=True,
132
+ ensure_multiple_of=32,
133
+ resize_method=resize_mode,
134
+ image_interpolation_method=cv2.INTER_CUBIC,
135
+ ),
136
+ normalization,
137
+ PrepareForNet(),
138
+ ]
139
+ )
140
+
141
+ return model.eval(), transform
142
+
143
+
144
+ class MiDaSInference(nn.Module):
145
+ MODEL_TYPES_TORCH_HUB = [
146
+ "DPT_Large",
147
+ "DPT_Hybrid",
148
+ "MiDaS_small"
149
+ ]
150
+ MODEL_TYPES_ISL = [
151
+ "dpt_large",
152
+ "dpt_hybrid",
153
+ "midas_v21",
154
+ "midas_v21_small",
155
+ ]
156
+
157
+ def __init__(self, model_type):
158
+ super().__init__()
159
+ assert (model_type in self.MODEL_TYPES_ISL)
160
+ model, _ = load_model(model_type)
161
+ self.model = model
162
+ self.model.train = disabled_train
163
+
164
+ def forward(self, x):
165
+ with torch.no_grad():
166
+ prediction = self.model(x)
167
+ return prediction
168
+
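For callers that want raw floating-point depth rather than the uint8 maps produced by MidasDetector, a hedged sketch of combining MiDaSInference with the matching transform (not part of this commit's diff). It assumes the upstream MiDaS convention that the transform operates on a {"image": ...} sample dict with an RGB image scaled to [0, 1], and that the dpt_hybrid checkpoint can be fetched.

import cv2
import torch
from src.flux.annotator.midas.api import MiDaSInference, load_midas_transform  # assumes src/ is on PYTHONPATH

model = MiDaSInference(model_type="dpt_hybrid").cuda()
transform = load_midas_transform("dpt_hybrid")

img = cv2.cvtColor(cv2.imread("room.jpg"), cv2.COLOR_BGR2RGB) / 255.0  # hypothetical input file
sample = transform({"image": img})                                     # Resize + NormalizeImage + PrepareForNet
x = torch.from_numpy(sample["image"]).unsqueeze(0).float().cuda()
with torch.no_grad():
    depth = model(x)                                                   # (1, H', W') relative inverse depth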
src/flux/annotator/midas/midas/__init__.py ADDED
File without changes
src/flux/annotator/midas/midas/base_model.py ADDED
@@ -0,0 +1,16 @@
+ import torch
+
+
+ class BaseModel(torch.nn.Module):
+     def load(self, path):
+         """Load model from file.
+
+         Args:
+             path (str): file path
+         """
+         parameters = torch.load(path, map_location=torch.device('cpu'))
+
+         if "optimizer" in parameters:
+             parameters = parameters["model"]
+
+         self.load_state_dict(parameters)
src/flux/annotator/midas/midas/blocks.py ADDED
@@ -0,0 +1,342 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from .vit import (
5
+ _make_pretrained_vitb_rn50_384,
6
+ _make_pretrained_vitl16_384,
7
+ _make_pretrained_vitb16_384,
8
+ forward_vit,
9
+ )
10
+
11
+ def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, use_vit_only=False, use_readout="ignore",):
12
+ if backbone == "vitl16_384":
13
+ pretrained = _make_pretrained_vitl16_384(
14
+ use_pretrained, hooks=hooks, use_readout=use_readout
15
+ )
16
+ scratch = _make_scratch(
17
+ [256, 512, 1024, 1024], features, groups=groups, expand=expand
18
+ ) # ViT-L/16 - 85.0% Top1 (backbone)
19
+ elif backbone == "vitb_rn50_384":
20
+ pretrained = _make_pretrained_vitb_rn50_384(
21
+ use_pretrained,
22
+ hooks=hooks,
23
+ use_vit_only=use_vit_only,
24
+ use_readout=use_readout,
25
+ )
26
+ scratch = _make_scratch(
27
+ [256, 512, 768, 768], features, groups=groups, expand=expand
28
+ ) # ViT-H/16 - 85.0% Top1 (backbone)
29
+ elif backbone == "vitb16_384":
30
+ pretrained = _make_pretrained_vitb16_384(
31
+ use_pretrained, hooks=hooks, use_readout=use_readout
32
+ )
33
+ scratch = _make_scratch(
34
+ [96, 192, 384, 768], features, groups=groups, expand=expand
35
+ ) # ViT-B/16 - 84.6% Top1 (backbone)
36
+ elif backbone == "resnext101_wsl":
37
+ pretrained = _make_pretrained_resnext101_wsl(use_pretrained)
38
+ scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand) # efficientnet_lite3
39
+ elif backbone == "efficientnet_lite3":
40
+ pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable)
41
+ scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand) # efficientnet_lite3
42
+ else:
43
+ print(f"Backbone '{backbone}' not implemented")
44
+ assert False
45
+
46
+ return pretrained, scratch
47
+
48
+
49
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
50
+ scratch = nn.Module()
51
+
52
+ out_shape1 = out_shape
53
+ out_shape2 = out_shape
54
+ out_shape3 = out_shape
55
+ out_shape4 = out_shape
56
+ if expand==True:
57
+ out_shape1 = out_shape
58
+ out_shape2 = out_shape*2
59
+ out_shape3 = out_shape*4
60
+ out_shape4 = out_shape*8
61
+
62
+ scratch.layer1_rn = nn.Conv2d(
63
+ in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
64
+ )
65
+ scratch.layer2_rn = nn.Conv2d(
66
+ in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
67
+ )
68
+ scratch.layer3_rn = nn.Conv2d(
69
+ in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
70
+ )
71
+ scratch.layer4_rn = nn.Conv2d(
72
+ in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
73
+ )
74
+
75
+ return scratch
76
+
77
+
78
+ def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False):
79
+ efficientnet = torch.hub.load(
80
+ "rwightman/gen-efficientnet-pytorch",
81
+ "tf_efficientnet_lite3",
82
+ pretrained=use_pretrained,
83
+ exportable=exportable
84
+ )
85
+ return _make_efficientnet_backbone(efficientnet)
86
+
87
+
88
+ def _make_efficientnet_backbone(effnet):
89
+ pretrained = nn.Module()
90
+
91
+ pretrained.layer1 = nn.Sequential(
92
+ effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2]
93
+ )
94
+ pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3])
95
+ pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5])
96
+ pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9])
97
+
98
+ return pretrained
99
+
100
+
101
+ def _make_resnet_backbone(resnet):
102
+ pretrained = nn.Module()
103
+ pretrained.layer1 = nn.Sequential(
104
+ resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1
105
+ )
106
+
107
+ pretrained.layer2 = resnet.layer2
108
+ pretrained.layer3 = resnet.layer3
109
+ pretrained.layer4 = resnet.layer4
110
+
111
+ return pretrained
112
+
113
+
114
+ def _make_pretrained_resnext101_wsl(use_pretrained):
115
+ resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl")
116
+ return _make_resnet_backbone(resnet)
117
+
118
+
119
+
120
+ class Interpolate(nn.Module):
121
+ """Interpolation module.
122
+ """
123
+
124
+ def __init__(self, scale_factor, mode, align_corners=False):
125
+ """Init.
126
+
127
+ Args:
128
+ scale_factor (float): scaling
129
+ mode (str): interpolation mode
130
+ """
131
+ super(Interpolate, self).__init__()
132
+
133
+ self.interp = nn.functional.interpolate
134
+ self.scale_factor = scale_factor
135
+ self.mode = mode
136
+ self.align_corners = align_corners
137
+
138
+ def forward(self, x):
139
+ """Forward pass.
140
+
141
+ Args:
142
+ x (tensor): input
143
+
144
+ Returns:
145
+ tensor: interpolated data
146
+ """
147
+
148
+ x = self.interp(
149
+ x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners
150
+ )
151
+
152
+ return x
153
+
154
+
155
+ class ResidualConvUnit(nn.Module):
156
+ """Residual convolution module.
157
+ """
158
+
159
+ def __init__(self, features):
160
+ """Init.
161
+
162
+ Args:
163
+ features (int): number of features
164
+ """
165
+ super().__init__()
166
+
167
+ self.conv1 = nn.Conv2d(
168
+ features, features, kernel_size=3, stride=1, padding=1, bias=True
169
+ )
170
+
171
+ self.conv2 = nn.Conv2d(
172
+ features, features, kernel_size=3, stride=1, padding=1, bias=True
173
+ )
174
+
175
+ self.relu = nn.ReLU(inplace=True)
176
+
177
+ def forward(self, x):
178
+ """Forward pass.
179
+
180
+ Args:
181
+ x (tensor): input
182
+
183
+ Returns:
184
+ tensor: output
185
+ """
186
+ out = self.relu(x)
187
+ out = self.conv1(out)
188
+ out = self.relu(out)
189
+ out = self.conv2(out)
190
+
191
+ return out + x
192
+
193
+
194
+ class FeatureFusionBlock(nn.Module):
195
+ """Feature fusion block.
196
+ """
197
+
198
+ def __init__(self, features):
199
+ """Init.
200
+
201
+ Args:
202
+ features (int): number of features
203
+ """
204
+ super(FeatureFusionBlock, self).__init__()
205
+
206
+ self.resConfUnit1 = ResidualConvUnit(features)
207
+ self.resConfUnit2 = ResidualConvUnit(features)
208
+
209
+ def forward(self, *xs):
210
+ """Forward pass.
211
+
212
+ Returns:
213
+ tensor: output
214
+ """
215
+ output = xs[0]
216
+
217
+ if len(xs) == 2:
218
+ output += self.resConfUnit1(xs[1])
219
+
220
+ output = self.resConfUnit2(output)
221
+
222
+ output = nn.functional.interpolate(
223
+ output, scale_factor=2, mode="bilinear", align_corners=True
224
+ )
225
+
226
+ return output
227
+
228
+
229
+
230
+
231
+ class ResidualConvUnit_custom(nn.Module):
232
+ """Residual convolution module.
233
+ """
234
+
235
+ def __init__(self, features, activation, bn):
236
+ """Init.
237
+
238
+ Args:
239
+ features (int): number of features
240
+ """
241
+ super().__init__()
242
+
243
+ self.bn = bn
244
+
245
+ self.groups=1
246
+
247
+ self.conv1 = nn.Conv2d(
248
+ features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
249
+ )
250
+
251
+ self.conv2 = nn.Conv2d(
252
+ features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
253
+ )
254
+
255
+ if self.bn==True:
256
+ self.bn1 = nn.BatchNorm2d(features)
257
+ self.bn2 = nn.BatchNorm2d(features)
258
+
259
+ self.activation = activation
260
+
261
+ self.skip_add = nn.quantized.FloatFunctional()
262
+
263
+ def forward(self, x):
264
+ """Forward pass.
265
+
266
+ Args:
267
+ x (tensor): input
268
+
269
+ Returns:
270
+ tensor: output
271
+ """
272
+
273
+ out = self.activation(x)
274
+ out = self.conv1(out)
275
+ if self.bn==True:
276
+ out = self.bn1(out)
277
+
278
+ out = self.activation(out)
279
+ out = self.conv2(out)
280
+ if self.bn==True:
281
+ out = self.bn2(out)
282
+
283
+ if self.groups > 1:
284
+ out = self.conv_merge(out)
285
+
286
+ return self.skip_add.add(out, x)
287
+
288
+ # return out + x
289
+
290
+
291
+ class FeatureFusionBlock_custom(nn.Module):
292
+ """Feature fusion block.
293
+ """
294
+
295
+ def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True):
296
+ """Init.
297
+
298
+ Args:
299
+ features (int): number of features
300
+ """
301
+ super(FeatureFusionBlock_custom, self).__init__()
302
+
303
+ self.deconv = deconv
304
+ self.align_corners = align_corners
305
+
306
+ self.groups=1
307
+
308
+ self.expand = expand
309
+ out_features = features
310
+ if self.expand==True:
311
+ out_features = features//2
312
+
313
+ self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
314
+
315
+ self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
316
+ self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
317
+
318
+ self.skip_add = nn.quantized.FloatFunctional()
319
+
320
+ def forward(self, *xs):
321
+ """Forward pass.
322
+
323
+ Returns:
324
+ tensor: output
325
+ """
326
+ output = xs[0]
327
+
328
+ if len(xs) == 2:
329
+ res = self.resConfUnit1(xs[1])
330
+ output = self.skip_add.add(output, res)
331
+ # output += res
332
+
333
+ output = self.resConfUnit2(output)
334
+
335
+ output = nn.functional.interpolate(
336
+ output, scale_factor=2, mode="bilinear", align_corners=self.align_corners
337
+ )
338
+
339
+ output = self.out_conv(output)
340
+
341
+ return output
342
+
src/flux/annotator/midas/midas/dpt_depth.py ADDED
@@ -0,0 +1,109 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from .base_model import BaseModel
6
+ from .blocks import (
7
+ FeatureFusionBlock,
8
+ FeatureFusionBlock_custom,
9
+ Interpolate,
10
+ _make_encoder,
11
+ forward_vit,
12
+ )
13
+
14
+
15
+ def _make_fusion_block(features, use_bn):
16
+ return FeatureFusionBlock_custom(
17
+ features,
18
+ nn.ReLU(False),
19
+ deconv=False,
20
+ bn=use_bn,
21
+ expand=False,
22
+ align_corners=True,
23
+ )
24
+
25
+
26
+ class DPT(BaseModel):
27
+ def __init__(
28
+ self,
29
+ head,
30
+ features=256,
31
+ backbone="vitb_rn50_384",
32
+ readout="project",
33
+ channels_last=False,
34
+ use_bn=False,
35
+ ):
36
+
37
+ super(DPT, self).__init__()
38
+
39
+ self.channels_last = channels_last
40
+
41
+ hooks = {
42
+ "vitb_rn50_384": [0, 1, 8, 11],
43
+ "vitb16_384": [2, 5, 8, 11],
44
+ "vitl16_384": [5, 11, 17, 23],
45
+ }
46
+
47
+ # Instantiate backbone and reassemble blocks
48
+ self.pretrained, self.scratch = _make_encoder(
49
+ backbone,
50
+ features,
51
+ False, # Set to true if you want to train from scratch, uses ImageNet weights
52
+ groups=1,
53
+ expand=False,
54
+ exportable=False,
55
+ hooks=hooks[backbone],
56
+ use_readout=readout,
57
+ )
58
+
59
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
60
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
61
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
62
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
63
+
64
+ self.scratch.output_conv = head
65
+
66
+
67
+ def forward(self, x):
68
+ if self.channels_last == True:
69
+ x.contiguous(memory_format=torch.channels_last)
70
+
71
+ layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
72
+
73
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
74
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
75
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
76
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
77
+
78
+ path_4 = self.scratch.refinenet4(layer_4_rn)
79
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
80
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
81
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
82
+
83
+ out = self.scratch.output_conv(path_1)
84
+
85
+ return out
86
+
87
+
88
+ class DPTDepthModel(DPT):
89
+ def __init__(self, path=None, non_negative=True, **kwargs):
90
+ features = kwargs["features"] if "features" in kwargs else 256
91
+
92
+ head = nn.Sequential(
93
+ nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
94
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
95
+ nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
96
+ nn.ReLU(True),
97
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
98
+ nn.ReLU(True) if non_negative else nn.Identity(),
99
+ nn.Identity(),
100
+ )
101
+
102
+ super().__init__(head, **kwargs)
103
+
104
+ if path is not None:
105
+ self.load(path)
106
+
107
+ def forward(self, x):
108
+ return super().forward(x).squeeze(dim=1)
109
+
src/flux/annotator/midas/midas/midas_net.py ADDED
@@ -0,0 +1,76 @@
1
+ """MidasNet: Network for monocular depth estimation trained by mixing several datasets.
2
+ This file contains code that is adapted from
3
+ https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
4
+ """
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from .base_model import BaseModel
9
+ from .blocks import FeatureFusionBlock, Interpolate, _make_encoder
10
+
11
+
12
+ class MidasNet(BaseModel):
13
+ """Network for monocular depth estimation.
14
+ """
15
+
16
+ def __init__(self, path=None, features=256, non_negative=True):
17
+ """Init.
18
+
19
+ Args:
20
+ path (str, optional): Path to saved model. Defaults to None.
21
+ features (int, optional): Number of features. Defaults to 256.
22
+ backbone (str, optional): Backbone network for encoder. Defaults to resnet50
23
+ """
24
+ print("Loading weights: ", path)
25
+
26
+ super(MidasNet, self).__init__()
27
+
28
+ use_pretrained = False if path is None else True
29
+
30
+ self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained)
31
+
32
+ self.scratch.refinenet4 = FeatureFusionBlock(features)
33
+ self.scratch.refinenet3 = FeatureFusionBlock(features)
34
+ self.scratch.refinenet2 = FeatureFusionBlock(features)
35
+ self.scratch.refinenet1 = FeatureFusionBlock(features)
36
+
37
+ self.scratch.output_conv = nn.Sequential(
38
+ nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
39
+ Interpolate(scale_factor=2, mode="bilinear"),
40
+ nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1),
41
+ nn.ReLU(True),
42
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
43
+ nn.ReLU(True) if non_negative else nn.Identity(),
44
+ )
45
+
46
+ if path:
47
+ self.load(path)
48
+
49
+ def forward(self, x):
50
+ """Forward pass.
51
+
52
+ Args:
53
+ x (tensor): input data (image)
54
+
55
+ Returns:
56
+ tensor: depth
57
+ """
58
+
59
+ layer_1 = self.pretrained.layer1(x)
60
+ layer_2 = self.pretrained.layer2(layer_1)
61
+ layer_3 = self.pretrained.layer3(layer_2)
62
+ layer_4 = self.pretrained.layer4(layer_3)
63
+
64
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
65
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
66
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
67
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
68
+
69
+ path_4 = self.scratch.refinenet4(layer_4_rn)
70
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
71
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
72
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
73
+
74
+ out = self.scratch.output_conv(path_1)
75
+
76
+ return torch.squeeze(out, dim=1)
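A comparable sketch for the ResNeXt-based MidasNet; note that, depending on how _make_encoder in blocks.py resolves the resnext101_wsl backbone, constructing it may pull the encoder through torch.hub on first use:

import torch
from src.flux.annotator.midas.midas.midas_net import MidasNet

net = MidasNet(path=None, features=256)   # pass a MiDaS v2.1 checkpoint path for meaningful predictions
net.eval()

with torch.no_grad():
    pred = net(torch.randn(1, 3, 384, 384))
# expected: pred.shape == (1, 384, 384); the four x2 fusion stages plus the x2 in output_conv undo the 1/32 encoder stride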
src/flux/annotator/midas/midas/midas_net_custom.py ADDED
@@ -0,0 +1,128 @@
1
+ """MidashNet: Network for monocular depth estimation trained by mixing several datasets.
2
+ This file contains code that is adapted from
3
+ https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
4
+ """
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from .base_model import BaseModel
9
+ from .blocks import FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder
10
+
11
+
12
+ class MidasNet_small(BaseModel):
13
+ """Network for monocular depth estimation.
14
+ """
15
+
16
+ def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True,
17
+ blocks={'expand': True}):
18
+ """Init.
19
+
20
+ Args:
21
+ path (str, optional): Path to saved model. Defaults to None.
22
+ features (int, optional): Number of features. Defaults to 64.
23
+ backbone (str, optional): Backbone network for encoder. Defaults to "efficientnet_lite3".
24
+ """
25
+ print("Loading weights: ", path)
26
+
27
+ super(MidasNet_small, self).__init__()
28
+
29
+ use_pretrained = False if path else True
30
+
31
+ self.channels_last = channels_last
32
+ self.blocks = blocks
33
+ self.backbone = backbone
34
+
35
+ self.groups = 1
36
+
37
+ features1=features
38
+ features2=features
39
+ features3=features
40
+ features4=features
41
+ self.expand = False
42
+ if "expand" in self.blocks and self.blocks['expand'] == True:
43
+ self.expand = True
44
+ features1=features
45
+ features2=features*2
46
+ features3=features*4
47
+ features4=features*8
48
+
49
+ self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable)
50
+
51
+ self.scratch.activation = nn.ReLU(False)
52
+
53
+ self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
54
+ self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
55
+ self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
56
+ self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners)
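+ # no expand here: refinenet1 keeps features1 (= features) channels, matching the first conv of output_conv below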
57
+
58
+
59
+ self.scratch.output_conv = nn.Sequential(
60
+ nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1, groups=self.groups),
61
+ Interpolate(scale_factor=2, mode="bilinear"),
62
+ nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1),
63
+ self.scratch.activation,
64
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
65
+ nn.ReLU(True) if non_negative else nn.Identity(),
66
+ nn.Identity(),
67
+ )
68
+
69
+ if path:
70
+ self.load(path)
71
+
72
+
73
+ def forward(self, x):
74
+ """Forward pass.
75
+
76
+ Args:
77
+ x (tensor): input data (image)
78
+
79
+ Returns:
80
+ tensor: depth
81
+ """
82
+ if self.channels_last:
83
+ print("self.channels_last = ", self.channels_last)
84
+ x = x.contiguous(memory_format=torch.channels_last)  # keep the result; contiguous() is not an in-place op
85
+
86
+
87
+ layer_1 = self.pretrained.layer1(x)
88
+ layer_2 = self.pretrained.layer2(layer_1)
89
+ layer_3 = self.pretrained.layer3(layer_2)
90
+ layer_4 = self.pretrained.layer4(layer_3)
91
+
92
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
93
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
94
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
95
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
96
+
97
+
98
+ path_4 = self.scratch.refinenet4(layer_4_rn)
99
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
100
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
101
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
102
+
103
+ out = self.scratch.output_conv(path_1)
104
+
105
+ return torch.squeeze(out, dim=1)
106
+
107
+
108
+
109
+ def fuse_model(m):
110
+ prev_previous_type = nn.Identity()
111
+ prev_previous_name = ''
112
+ previous_type = nn.Identity()
113
+ previous_name = ''
114
+ for name, module in m.named_modules():
115
+ if prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d and type(module) == nn.ReLU:
116
+ # print("FUSED ", prev_previous_name, previous_name, name)
117
+ torch.quantization.fuse_modules(m, [prev_previous_name, previous_name, name], inplace=True)
118
+ elif prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d:
119
+ # print("FUSED ", prev_previous_name, previous_name)
120
+ torch.quantization.fuse_modules(m, [prev_previous_name, previous_name], inplace=True)
121
+ # elif previous_type == nn.Conv2d and type(module) == nn.ReLU:
122
+ # print("FUSED ", previous_name, name)
123
+ # torch.quantization.fuse_modules(m, [previous_name, name], inplace=True)
124
+
125
+ prev_previous_type = previous_type
126
+ prev_previous_name = previous_name
127
+ previous_type = type(module)
128
+ previous_name = name
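fuse_model walks consecutive (Conv2d, BatchNorm2d[, ReLU]) modules and folds them with torch.quantization.fuse_modules; a hedged usage sketch (building MidasNet_small with path=None may fetch the efficientnet_lite3 encoder through torch.hub):

from src.flux.annotator.midas.midas.midas_net_custom import MidasNet_small, fuse_model

small = MidasNet_small(path=None)   # substitute the midas_v21_small checkpoint path for real weights
small.eval()                        # fusing Conv+BN assumes eval mode
fuse_model(small)                   # matching Conv2d+BatchNorm2d(+ReLU) groups are fused in place

The fused model can then go through the usual torch.quantization prepare/convert steps if quantized inference is the goal.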
src/flux/annotator/midas/midas/transforms.py ADDED
@@ -0,0 +1,234 @@
1
+ import numpy as np
2
+ import cv2
3
+ import math
4
+
5
+
6
+ def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
7
+ """Rezise the sample to ensure the given size. Keeps aspect ratio.
8
+
9
+ Args:
10
+ sample (dict): sample
11
+ size (tuple): image size
12
+
13
+ Returns:
14
+ tuple: new size
15
+ """
16
+ shape = list(sample["disparity"].shape)
17
+
18
+ if shape[0] >= size[0] and shape[1] >= size[1]:
19
+ return sample
20
+
21
+ scale = [0, 0]
22
+ scale[0] = size[0] / shape[0]
23
+ scale[1] = size[1] / shape[1]
24
+
25
+ scale = max(scale)
26
+
27
+ shape[0] = math.ceil(scale * shape[0])
28
+ shape[1] = math.ceil(scale * shape[1])
29
+
30
+ # resize
31
+ sample["image"] = cv2.resize(
32
+ sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
33
+ )
34
+
35
+ sample["disparity"] = cv2.resize(
36
+ sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
37
+ )
38
+ sample["mask"] = cv2.resize(
39
+ sample["mask"].astype(np.float32),
40
+ tuple(shape[::-1]),
41
+ interpolation=cv2.INTER_NEAREST,
42
+ )
43
+ sample["mask"] = sample["mask"].astype(bool)
44
+
45
+ return tuple(shape)
46
+
47
+
48
+ class Resize(object):
49
+ """Resize sample to given size (width, height).
50
+ """
51
+
52
+ def __init__(
53
+ self,
54
+ width,
55
+ height,
56
+ resize_target=True,
57
+ keep_aspect_ratio=False,
58
+ ensure_multiple_of=1,
59
+ resize_method="lower_bound",
60
+ image_interpolation_method=cv2.INTER_AREA,
61
+ ):
62
+ """Init.
63
+
64
+ Args:
65
+ width (int): desired output width
66
+ height (int): desired output height
67
+ resize_target (bool, optional):
68
+ True: Resize the full sample (image, mask, target).
69
+ False: Resize image only.
70
+ Defaults to True.
71
+ keep_aspect_ratio (bool, optional):
72
+ True: Keep the aspect ratio of the input sample.
73
+ Output sample might not have the given width and height, and
74
+ resize behaviour depends on the parameter 'resize_method'.
75
+ Defaults to False.
76
+ ensure_multiple_of (int, optional):
77
+ Output width and height is constrained to be multiple of this parameter.
78
+ Defaults to 1.
79
+ resize_method (str, optional):
80
+ "lower_bound": Output will be at least as large as the given size.
81
+ "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
82
+ "minimal": Scale as least as possible. (Output size might be smaller than given size.)
83
+ Defaults to "lower_bound".
84
+ """
85
+ self.__width = width
86
+ self.__height = height
87
+
88
+ self.__resize_target = resize_target
89
+ self.__keep_aspect_ratio = keep_aspect_ratio
90
+ self.__multiple_of = ensure_multiple_of
91
+ self.__resize_method = resize_method
92
+ self.__image_interpolation_method = image_interpolation_method
93
+
94
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
95
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
96
+
97
+ if max_val is not None and y > max_val:
98
+ y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
99
+
100
+ if y < min_val:
101
+ y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
102
+
103
+ return y
104
+
105
+ def get_size(self, width, height):
106
+ # determine new height and width
107
+ scale_height = self.__height / height
108
+ scale_width = self.__width / width
109
+
110
+ if self.__keep_aspect_ratio:
111
+ if self.__resize_method == "lower_bound":
112
+ # scale such that output size is lower bound
113
+ if scale_width > scale_height:
114
+ # fit width
115
+ scale_height = scale_width
116
+ else:
117
+ # fit height
118
+ scale_width = scale_height
119
+ elif self.__resize_method == "upper_bound":
120
+ # scale such that output size is upper bound
121
+ if scale_width < scale_height:
122
+ # fit width
123
+ scale_height = scale_width
124
+ else:
125
+ # fit height
126
+ scale_width = scale_height
127
+ elif self.__resize_method == "minimal":
128
+ # scale as little as possible
129
+ if abs(1 - scale_width) < abs(1 - scale_height):
130
+ # fit width
131
+ scale_height = scale_width
132
+ else:
133
+ # fit height
134
+ scale_width = scale_height
135
+ else:
136
+ raise ValueError(
137
+ f"resize_method {self.__resize_method} not implemented"
138
+ )
139
+
140
+ if self.__resize_method == "lower_bound":
141
+ new_height = self.constrain_to_multiple_of(
142
+ scale_height * height, min_val=self.__height
143
+ )
144
+ new_width = self.constrain_to_multiple_of(
145
+ scale_width * width, min_val=self.__width
146
+ )
147
+ elif self.__resize_method == "upper_bound":
148
+ new_height = self.constrain_to_multiple_of(
149
+ scale_height * height, max_val=self.__height
150
+ )
151
+ new_width = self.constrain_to_multiple_of(
152
+ scale_width * width, max_val=self.__width
153
+ )
154
+ elif self.__resize_method == "minimal":
155
+ new_height = self.constrain_to_multiple_of(scale_height * height)
156
+ new_width = self.constrain_to_multiple_of(scale_width * width)
157
+ else:
158
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
159
+
160
+ return (new_width, new_height)
161
+
162
+ def __call__(self, sample):
163
+ width, height = self.get_size(
164
+ sample["image"].shape[1], sample["image"].shape[0]
165
+ )
166
+
167
+ # resize sample
168
+ sample["image"] = cv2.resize(
169
+ sample["image"],
170
+ (width, height),
171
+ interpolation=self.__image_interpolation_method,
172
+ )
173
+
174
+ if self.__resize_target:
175
+ if "disparity" in sample:
176
+ sample["disparity"] = cv2.resize(
177
+ sample["disparity"],
178
+ (width, height),
179
+ interpolation=cv2.INTER_NEAREST,
180
+ )
181
+
182
+ if "depth" in sample:
183
+ sample["depth"] = cv2.resize(
184
+ sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
185
+ )
186
+
187
+ sample["mask"] = cv2.resize(
188
+ sample["mask"].astype(np.float32),
189
+ (width, height),
190
+ interpolation=cv2.INTER_NEAREST,
191
+ )
192
+ sample["mask"] = sample["mask"].astype(bool)
193
+
194
+ return sample
195
+
196
+
197
+ class NormalizeImage(object):
198
+ """Normlize image by given mean and std.
199
+ """
200
+
201
+ def __init__(self, mean, std):
202
+ self.__mean = mean
203
+ self.__std = std
204
+
205
+ def __call__(self, sample):
206
+ sample["image"] = (sample["image"] - self.__mean) / self.__std
207
+
208
+ return sample
209
+
210
+
211
+ class PrepareForNet(object):
212
+ """Prepare sample for usage as network input.
213
+ """
214
+
215
+ def __init__(self):
216
+ pass
217
+
218
+ def __call__(self, sample):
219
+ image = np.transpose(sample["image"], (2, 0, 1))
220
+ sample["image"] = np.ascontiguousarray(image).astype(np.float32)
221
+
222
+ if "mask" in sample:
223
+ sample["mask"] = sample["mask"].astype(np.float32)
224
+ sample["mask"] = np.ascontiguousarray(sample["mask"])
225
+
226
+ if "disparity" in sample:
227
+ disparity = sample["disparity"].astype(np.float32)
228
+ sample["disparity"] = np.ascontiguousarray(disparity)
229
+
230
+ if "depth" in sample:
231
+ depth = sample["depth"].astype(np.float32)
232
+ sample["depth"] = np.ascontiguousarray(depth)
233
+
234
+ return sample
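These three transforms are meant to be chained over a sample dict; a minimal sketch of the usual MiDaS-style pipeline (the 384x384 target, multiple-of-32 constraint and ImageNet mean/std are conventional choices, not values fixed by this file):

import numpy as np
from src.flux.annotator.midas.midas.transforms import Resize, NormalizeImage, PrepareForNet

pipeline = [
    Resize(384, 384, resize_target=False, keep_aspect_ratio=True,
           ensure_multiple_of=32, resize_method="upper_bound"),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
]

sample = {"image": np.random.rand(480, 640, 3).astype(np.float32)}  # HWC float image in [0, 1]
for t in pipeline:
    sample = t(sample)
# sample["image"] is now a contiguous CHW float32 array whose sides are multiples of 32, ready for torch.from_numpy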
src/flux/annotator/midas/midas/vit.py ADDED
@@ -0,0 +1,491 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import timm
4
+ import types
5
+ import math
6
+ import torch.nn.functional as F
7
+
8
+
9
+ class Slice(nn.Module):
10
+ def __init__(self, start_index=1):
11
+ super(Slice, self).__init__()
12
+ self.start_index = start_index
13
+
14
+ def forward(self, x):
15
+ return x[:, self.start_index :]
16
+
17
+
18
+ class AddReadout(nn.Module):
19
+ def __init__(self, start_index=1):
20
+ super(AddReadout, self).__init__()
21
+ self.start_index = start_index
22
+
23
+ def forward(self, x):
24
+ if self.start_index == 2:
25
+ readout = (x[:, 0] + x[:, 1]) / 2
26
+ else:
27
+ readout = x[:, 0]
28
+ return x[:, self.start_index :] + readout.unsqueeze(1)
29
+
30
+
31
+ class ProjectReadout(nn.Module):
32
+ def __init__(self, in_features, start_index=1):
33
+ super(ProjectReadout, self).__init__()
34
+ self.start_index = start_index
35
+
36
+ self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())
37
+
38
+ def forward(self, x):
39
+ readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :])
40
+ features = torch.cat((x[:, self.start_index :], readout), -1)
41
+
42
+ return self.project(features)
43
+
44
+
45
+ class Transpose(nn.Module):
46
+ def __init__(self, dim0, dim1):
47
+ super(Transpose, self).__init__()
48
+ self.dim0 = dim0
49
+ self.dim1 = dim1
50
+
51
+ def forward(self, x):
52
+ x = x.transpose(self.dim0, self.dim1)
53
+ return x
54
+
55
+
56
+ def forward_vit(pretrained, x):
57
+ b, c, h, w = x.shape
58
+
59
+ glob = pretrained.model.forward_flex(x)
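+ # the returned token sequence is not used directly; the forward hooks registered on the ViT blocks have already filled pretrained.activations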
60
+
61
+ layer_1 = pretrained.activations["1"]
62
+ layer_2 = pretrained.activations["2"]
63
+ layer_3 = pretrained.activations["3"]
64
+ layer_4 = pretrained.activations["4"]
65
+
66
+ layer_1 = pretrained.act_postprocess1[0:2](layer_1)
67
+ layer_2 = pretrained.act_postprocess2[0:2](layer_2)
68
+ layer_3 = pretrained.act_postprocess3[0:2](layer_3)
69
+ layer_4 = pretrained.act_postprocess4[0:2](layer_4)
70
+
71
+ unflatten = nn.Sequential(
72
+ nn.Unflatten(
73
+ 2,
74
+ torch.Size(
75
+ [
76
+ h // pretrained.model.patch_size[1],
77
+ w // pretrained.model.patch_size[0],
78
+ ]
79
+ ),
80
+ )
81
+ )
82
+
83
+ if layer_1.ndim == 3:
84
+ layer_1 = unflatten(layer_1)
85
+ if layer_2.ndim == 3:
86
+ layer_2 = unflatten(layer_2)
87
+ if layer_3.ndim == 3:
88
+ layer_3 = unflatten(layer_3)
89
+ if layer_4.ndim == 3:
90
+ layer_4 = unflatten(layer_4)
91
+
92
+ layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
93
+ layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
94
+ layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
95
+ layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
96
+
97
+ return layer_1, layer_2, layer_3, layer_4
98
+
99
+
100
+ def _resize_pos_embed(self, posemb, gs_h, gs_w):
101
+ posemb_tok, posemb_grid = (
102
+ posemb[:, : self.start_index],
103
+ posemb[0, self.start_index :],
104
+ )
105
+
106
+ gs_old = int(math.sqrt(len(posemb_grid)))
107
+
108
+ posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
109
+ posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
110
+ posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
111
+
112
+ posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
113
+
114
+ return posemb
115
+
116
+
117
+ def forward_flex(self, x):
118
+ b, c, h, w = x.shape
119
+
120
+ pos_embed = self._resize_pos_embed(
121
+ self.pos_embed, h // self.patch_size[1], w // self.patch_size[0]
122
+ )
123
+
124
+ B = x.shape[0]
125
+
126
+ if hasattr(self.patch_embed, "backbone"):
127
+ x = self.patch_embed.backbone(x)
128
+ if isinstance(x, (list, tuple)):
129
+ x = x[-1] # last feature if backbone outputs list/tuple of features
130
+
131
+ x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
132
+
133
+ if getattr(self, "dist_token", None) is not None:
134
+ cls_tokens = self.cls_token.expand(
135
+ B, -1, -1
136
+ ) # stole cls_tokens impl from Phil Wang, thanks
137
+ dist_token = self.dist_token.expand(B, -1, -1)
138
+ x = torch.cat((cls_tokens, dist_token, x), dim=1)
139
+ else:
140
+ cls_tokens = self.cls_token.expand(
141
+ B, -1, -1
142
+ ) # stole cls_tokens impl from Phil Wang, thanks
143
+ x = torch.cat((cls_tokens, x), dim=1)
144
+
145
+ x = x + pos_embed
146
+ x = self.pos_drop(x)
147
+
148
+ for blk in self.blocks:
149
+ x = blk(x)
150
+
151
+ x = self.norm(x)
152
+
153
+ return x
154
+
155
+
156
+ activations = {}
157
+
158
+
159
+ def get_activation(name):
160
+ def hook(model, input, output):
161
+ activations[name] = output
162
+
163
+ return hook
164
+
165
+
166
+ def get_readout_oper(vit_features, features, use_readout, start_index=1):
167
+ if use_readout == "ignore":
168
+ readout_oper = [Slice(start_index)] * len(features)
169
+ elif use_readout == "add":
170
+ readout_oper = [AddReadout(start_index)] * len(features)
171
+ elif use_readout == "project":
172
+ readout_oper = [
173
+ ProjectReadout(vit_features, start_index) for out_feat in features
174
+ ]
175
+ else:
176
+ assert (
177
+ False
178
+ ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
179
+
180
+ return readout_oper
181
+
182
+
183
+ def _make_vit_b16_backbone(
184
+ model,
185
+ features=[96, 192, 384, 768],
186
+ size=[384, 384],
187
+ hooks=[2, 5, 8, 11],
188
+ vit_features=768,
189
+ use_readout="ignore",
190
+ start_index=1,
191
+ ):
192
+ pretrained = nn.Module()
193
+
194
+ pretrained.model = model
195
+ pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
196
+ pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
197
+ pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
198
+ pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
199
+
200
+ pretrained.activations = activations
201
+
202
+ readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
203
+
204
+ # 32, 48, 136, 384
205
+ pretrained.act_postprocess1 = nn.Sequential(
206
+ readout_oper[0],
207
+ Transpose(1, 2),
208
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
209
+ nn.Conv2d(
210
+ in_channels=vit_features,
211
+ out_channels=features[0],
212
+ kernel_size=1,
213
+ stride=1,
214
+ padding=0,
215
+ ),
216
+ nn.ConvTranspose2d(
217
+ in_channels=features[0],
218
+ out_channels=features[0],
219
+ kernel_size=4,
220
+ stride=4,
221
+ padding=0,
222
+ bias=True,
223
+ dilation=1,
224
+ groups=1,
225
+ ),
226
+ )
227
+
228
+ pretrained.act_postprocess2 = nn.Sequential(
229
+ readout_oper[1],
230
+ Transpose(1, 2),
231
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
232
+ nn.Conv2d(
233
+ in_channels=vit_features,
234
+ out_channels=features[1],
235
+ kernel_size=1,
236
+ stride=1,
237
+ padding=0,
238
+ ),
239
+ nn.ConvTranspose2d(
240
+ in_channels=features[1],
241
+ out_channels=features[1],
242
+ kernel_size=2,
243
+ stride=2,
244
+ padding=0,
245
+ bias=True,
246
+ dilation=1,
247
+ groups=1,
248
+ ),
249
+ )
250
+
251
+ pretrained.act_postprocess3 = nn.Sequential(
252
+ readout_oper[2],
253
+ Transpose(1, 2),
254
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
255
+ nn.Conv2d(
256
+ in_channels=vit_features,
257
+ out_channels=features[2],
258
+ kernel_size=1,
259
+ stride=1,
260
+ padding=0,
261
+ ),
262
+ )
263
+
264
+ pretrained.act_postprocess4 = nn.Sequential(
265
+ readout_oper[3],
266
+ Transpose(1, 2),
267
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
268
+ nn.Conv2d(
269
+ in_channels=vit_features,
270
+ out_channels=features[3],
271
+ kernel_size=1,
272
+ stride=1,
273
+ padding=0,
274
+ ),
275
+ nn.Conv2d(
276
+ in_channels=features[3],
277
+ out_channels=features[3],
278
+ kernel_size=3,
279
+ stride=2,
280
+ padding=1,
281
+ ),
282
+ )
283
+
284
+ pretrained.model.start_index = start_index
285
+ pretrained.model.patch_size = [16, 16]
286
+
287
+ # We inject this function into the VisionTransformer instances so that
288
+ # we can use it with interpolated position embeddings without modifying the library source.
289
+ pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
290
+ pretrained.model._resize_pos_embed = types.MethodType(
291
+ _resize_pos_embed, pretrained.model
292
+ )
293
+
294
+ return pretrained
295
+
296
+
297
+ def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None):
298
+ model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
299
+
300
+ hooks = [5, 11, 17, 23] if hooks is None else hooks
301
+ return _make_vit_b16_backbone(
302
+ model,
303
+ features=[256, 512, 1024, 1024],
304
+ hooks=hooks,
305
+ vit_features=1024,
306
+ use_readout=use_readout,
307
+ )
308
+
309
+
310
+ def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None):
311
+ model = timm.create_model("vit_base_patch16_384", pretrained=pretrained)
312
+
313
+ hooks = [2, 5, 8, 11] if hooks is None else hooks
314
+ return _make_vit_b16_backbone(
315
+ model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
316
+ )
317
+
318
+
319
+ def _make_pretrained_deitb16_384(pretrained, use_readout="ignore", hooks=None):
320
+ model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained)
321
+
322
+ hooks = [2, 5, 8, 11] if hooks is None else hooks
323
+ return _make_vit_b16_backbone(
324
+ model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
325
+ )
326
+
327
+
328
+ def _make_pretrained_deitb16_distil_384(pretrained, use_readout="ignore", hooks=None):
329
+ model = timm.create_model(
330
+ "vit_deit_base_distilled_patch16_384", pretrained=pretrained
331
+ )
332
+
333
+ hooks = [2, 5, 8, 11] if hooks is None else hooks
334
+ return _make_vit_b16_backbone(
335
+ model,
336
+ features=[96, 192, 384, 768],
337
+ hooks=hooks,
338
+ use_readout=use_readout,
339
+ start_index=2,
340
+ )
341
+
342
+
343
+ def _make_vit_b_rn50_backbone(
344
+ model,
345
+ features=[256, 512, 768, 768],
346
+ size=[384, 384],
347
+ hooks=[0, 1, 8, 11],
348
+ vit_features=768,
349
+ use_vit_only=False,
350
+ use_readout="ignore",
351
+ start_index=1,
352
+ ):
353
+ pretrained = nn.Module()
354
+
355
+ pretrained.model = model
356
+
357
+ if use_vit_only:
358
+ pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
359
+ pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
360
+ else:
361
+ pretrained.model.patch_embed.backbone.stages[0].register_forward_hook(
362
+ get_activation("1")
363
+ )
364
+ pretrained.model.patch_embed.backbone.stages[1].register_forward_hook(
365
+ get_activation("2")
366
+ )
367
+
368
+ pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
369
+ pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
370
+
371
+ pretrained.activations = activations
372
+
373
+ readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
374
+
375
+ if use_vit_only:
376
+ pretrained.act_postprocess1 = nn.Sequential(
377
+ readout_oper[0],
378
+ Transpose(1, 2),
379
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
380
+ nn.Conv2d(
381
+ in_channels=vit_features,
382
+ out_channels=features[0],
383
+ kernel_size=1,
384
+ stride=1,
385
+ padding=0,
386
+ ),
387
+ nn.ConvTranspose2d(
388
+ in_channels=features[0],
389
+ out_channels=features[0],
390
+ kernel_size=4,
391
+ stride=4,
392
+ padding=0,
393
+ bias=True,
394
+ dilation=1,
395
+ groups=1,
396
+ ),
397
+ )
398
+
399
+ pretrained.act_postprocess2 = nn.Sequential(
400
+ readout_oper[1],
401
+ Transpose(1, 2),
402
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
403
+ nn.Conv2d(
404
+ in_channels=vit_features,
405
+ out_channels=features[1],
406
+ kernel_size=1,
407
+ stride=1,
408
+ padding=0,
409
+ ),
410
+ nn.ConvTranspose2d(
411
+ in_channels=features[1],
412
+ out_channels=features[1],
413
+ kernel_size=2,
414
+ stride=2,
415
+ padding=0,
416
+ bias=True,
417
+ dilation=1,
418
+ groups=1,
419
+ ),
420
+ )
421
+ else:
422
+ pretrained.act_postprocess1 = nn.Sequential(
423
+ nn.Identity(), nn.Identity(), nn.Identity()
424
+ )
425
+ pretrained.act_postprocess2 = nn.Sequential(
426
+ nn.Identity(), nn.Identity(), nn.Identity()
427
+ )
428
+
429
+ pretrained.act_postprocess3 = nn.Sequential(
430
+ readout_oper[2],
431
+ Transpose(1, 2),
432
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
433
+ nn.Conv2d(
434
+ in_channels=vit_features,
435
+ out_channels=features[2],
436
+ kernel_size=1,
437
+ stride=1,
438
+ padding=0,
439
+ ),
440
+ )
441
+
442
+ pretrained.act_postprocess4 = nn.Sequential(
443
+ readout_oper[3],
444
+ Transpose(1, 2),
445
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
446
+ nn.Conv2d(
447
+ in_channels=vit_features,
448
+ out_channels=features[3],
449
+ kernel_size=1,
450
+ stride=1,
451
+ padding=0,
452
+ ),
453
+ nn.Conv2d(
454
+ in_channels=features[3],
455
+ out_channels=features[3],
456
+ kernel_size=3,
457
+ stride=2,
458
+ padding=1,
459
+ ),
460
+ )
461
+
462
+ pretrained.model.start_index = start_index
463
+ pretrained.model.patch_size = [16, 16]
464
+
465
+ # We inject this function into the VisionTransformer instances so that
466
+ # we can use it with interpolated position embeddings without modifying the library source.
467
+ pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
468
+
469
+ # We inject this function into the VisionTransformer instances so that
470
+ # we can use it with interpolated position embeddings without modifying the library source.
471
+ pretrained.model._resize_pos_embed = types.MethodType(
472
+ _resize_pos_embed, pretrained.model
473
+ )
474
+
475
+ return pretrained
476
+
477
+
478
+ def _make_pretrained_vitb_rn50_384(
479
+ pretrained, use_readout="ignore", hooks=None, use_vit_only=False
480
+ ):
481
+ model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)
482
+
483
+ hooks = [0, 1, 8, 11] if hooks is None else hooks
484
+ return _make_vit_b_rn50_backbone(
485
+ model,
486
+ features=[256, 512, 768, 768],
487
+ size=[384, 384],
488
+ hooks=hooks,
489
+ use_vit_only=use_vit_only,
490
+ use_readout=use_readout,
491
+ )
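A small shape-check sketch for these factories, assuming a timm version that still exposes the vit_base_resnet50_384 alias used above; with pretrained=False no weights are downloaded:

import torch
from src.flux.annotator.midas.midas.vit import _make_pretrained_vitb_rn50_384, forward_vit

backbone = _make_pretrained_vitb_rn50_384(pretrained=False, use_readout="project")
with torch.no_grad():
    l1, l2, l3, l4 = forward_vit(backbone, torch.randn(1, 3, 384, 384))
# expected strides relative to the input: l1 at 1/4 and l2 at 1/8 (ResNet stages), l3 at 1/16 and l4 at 1/32 (hooked ViT blocks)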
src/flux/annotator/midas/utils.py ADDED
@@ -0,0 +1,189 @@
1
+ """Utils for monoDepth."""
2
+ import sys
3
+ import re
4
+ import numpy as np
5
+ import cv2
6
+ import torch
7
+
8
+
9
+ def read_pfm(path):
10
+ """Read pfm file.
11
+
12
+ Args:
13
+ path (str): path to file
14
+
15
+ Returns:
16
+ tuple: (data, scale)
17
+ """
18
+ with open(path, "rb") as file:
19
+
20
+ color = None
21
+ width = None
22
+ height = None
23
+ scale = None
24
+ endian = None
25
+
26
+ header = file.readline().rstrip()
27
+ if header.decode("ascii") == "PF":
28
+ color = True
29
+ elif header.decode("ascii") == "Pf":
30
+ color = False
31
+ else:
32
+ raise Exception("Not a PFM file: " + path)
33
+
34
+ dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii"))
35
+ if dim_match:
36
+ width, height = list(map(int, dim_match.groups()))
37
+ else:
38
+ raise Exception("Malformed PFM header.")
39
+
40
+ scale = float(file.readline().decode("ascii").rstrip())
41
+ if scale < 0:
42
+ # little-endian
43
+ endian = "<"
44
+ scale = -scale
45
+ else:
46
+ # big-endian
47
+ endian = ">"
48
+
49
+ data = np.fromfile(file, endian + "f")
50
+ shape = (height, width, 3) if color else (height, width)
51
+
52
+ data = np.reshape(data, shape)
53
+ data = np.flipud(data)
54
+
55
+ return data, scale
56
+
57
+
58
+ def write_pfm(path, image, scale=1):
59
+ """Write pfm file.
60
+
61
+ Args:
62
+ path (str): path to file
63
+ image (array): data
64
+ scale (int, optional): Scale. Defaults to 1.
65
+ """
66
+
67
+ with open(path, "wb") as file:
68
+ color = None
69
+
70
+ if image.dtype.name != "float32":
71
+ raise Exception("Image dtype must be float32.")
72
+
73
+ image = np.flipud(image)
74
+
75
+ if len(image.shape) == 3 and image.shape[2] == 3: # color image
76
+ color = True
77
+ elif (
78
+ len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1
79
+ ): # greyscale
80
+ color = False
81
+ else:
82
+ raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")
83
+
84
+ file.write("PF\n" if color else "Pf\n".encode())
85
+ file.write("%d %d\n".encode() % (image.shape[1], image.shape[0]))
86
+
87
+ endian = image.dtype.byteorder
88
+
89
+ if endian == "<" or endian == "=" and sys.byteorder == "little":
90
+ scale = -scale
91
+
92
+ file.write("%f\n".encode() % scale)
93
+
94
+ image.tofile(file)
95
+
96
+
97
+ def read_image(path):
98
+ """Read image and output RGB image (0-1).
99
+
100
+ Args:
101
+ path (str): path to file
102
+
103
+ Returns:
104
+ array: RGB image (0-1)
105
+ """
106
+ img = cv2.imread(path)
107
+
108
+ if img.ndim == 2:
109
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
110
+
111
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0
112
+
113
+ return img
114
+
115
+
116
+ def resize_image(img):
117
+ """Resize image and make it fit for network.
118
+
119
+ Args:
120
+ img (array): image
121
+
122
+ Returns:
123
+ tensor: data ready for network
124
+ """
125
+ height_orig = img.shape[0]
126
+ width_orig = img.shape[1]
127
+
128
+ if width_orig > height_orig:
129
+ scale = width_orig / 384
130
+ else:
131
+ scale = height_orig / 384
132
+
133
+ height = (np.ceil(height_orig / scale / 32) * 32).astype(int)
134
+ width = (np.ceil(width_orig / scale / 32) * 32).astype(int)
135
+
136
+ img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)
137
+
138
+ img_resized = (
139
+ torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float()
140
+ )
141
+ img_resized = img_resized.unsqueeze(0)
142
+
143
+ return img_resized
144
+
145
+
146
+ def resize_depth(depth, width, height):
147
+ """Resize depth map and bring to CPU (numpy).
148
+
149
+ Args:
150
+ depth (tensor): depth
151
+ width (int): image width
152
+ height (int): image height
153
+
154
+ Returns:
155
+ array: processed depth
156
+ """
157
+ depth = torch.squeeze(depth[0, :, :, :]).to("cpu")
158
+
159
+ depth_resized = cv2.resize(
160
+ depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC
161
+ )
162
+
163
+ return depth_resized
164
+
165
+ def write_depth(path, depth, bits=1):
166
+ """Write depth map to pfm and png file.
167
+
168
+ Args:
169
+ path (str): filepath without extension
170
+ depth (array): depth
171
+ """
172
+ write_pfm(path + ".pfm", depth.astype(np.float32))
173
+
174
+ depth_min = depth.min()
175
+ depth_max = depth.max()
176
+
177
+ max_val = (2**(8*bits))-1
178
+
179
+ if depth_max - depth_min > np.finfo("float").eps:
180
+ out = max_val * (depth - depth_min) / (depth_max - depth_min)
181
+ else:
182
+ out = np.zeros(depth.shape, dtype=depth.dtype)  # ndarray has no .type attribute; use the array's dtype
183
+
184
+ if bits == 1:
185
+ cv2.imwrite(path + ".png", out.astype("uint8"))
186
+ elif bits == 2:
187
+ cv2.imwrite(path + ".png", out.astype("uint16"))
188
+
189
+ return
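Putting the helpers together, a hedged end-to-end sketch (file names are placeholders, and any of the depth networks above can stand in for MidasNet):

import torch
from src.flux.annotator.midas import utils
from src.flux.annotator.midas.midas.midas_net import MidasNet

img = utils.read_image("input.jpg")        # RGB float image in [0, 1], HWC
batch = utils.resize_image(img)            # 1x3xHxW tensor, longer side 384, both sides multiples of 32

net = MidasNet(path=None).eval()           # substitute a real checkpoint; may pull the ResNeXt encoder via torch.hub
with torch.no_grad():
    prediction = net(batch)                # (1, H', W')

depth = utils.resize_depth(prediction.unsqueeze(1), img.shape[1], img.shape[0])   # back to the original resolution
utils.write_depth("output", depth, bits=2)  # writes output.pfm and a 16-bit output.png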
src/flux/annotator/mlsd/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "{}"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2021-present NAVER Corp.
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
src/flux/annotator/mlsd/__init__.py ADDED
@@ -0,0 +1,40 @@
1
+ # MLSD Line Detection
2
+ # From https://github.com/navervision/mlsd
3
+ # Apache-2.0 license
4
+
5
+ import cv2
6
+ import numpy as np
7
+ import torch
8
+ import os
9
+
10
+ from einops import rearrange
11
+ from huggingface_hub import hf_hub_download
12
+ from .models.mbv2_mlsd_tiny import MobileV2_MLSD_Tiny
13
+ from .models.mbv2_mlsd_large import MobileV2_MLSD_Large
14
+ from .utils import pred_lines
15
+
16
+ from ...annotator.util import annotator_ckpts_path
17
+
18
+
19
+ class MLSDdetector:
20
+ def __init__(self):
21
+ model_path = os.path.join(annotator_ckpts_path, "mlsd_large_512_fp32.pth")
22
+ if not os.path.exists(model_path):
23
+ model_path = hf_hub_download("lllyasviel/Annotators", "mlsd_large_512_fp32.pth")
24
+ model = MobileV2_MLSD_Large()
25
+ model.load_state_dict(torch.load(model_path), strict=True)
26
+ self.model = model.cuda().eval()
27
+
28
+ def __call__(self, input_image, thr_v, thr_d):
29
+ assert input_image.ndim == 3
30
+ img = input_image
31
+ img_output = np.zeros_like(img)
32
+ try:
33
+ with torch.no_grad():
34
+ lines = pred_lines(img, self.model, [img.shape[0], img.shape[1]], thr_v, thr_d)
35
+ for line in lines:
36
+ x_start, y_start, x_end, y_end = [int(val) for val in line]
37
+ cv2.line(img_output, (x_start, y_start), (x_end, y_end), [255, 255, 255], 1)
38
+ except Exception as e:
39
+ pass
40
+ return img_output[:, :, 0]
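A usage sketch for the detector; __init__ moves the model to CUDA, so a GPU is assumed, and the thresholds below follow common ControlNet-style defaults rather than anything fixed in this file:

import cv2
from src.flux.annotator.mlsd import MLSDdetector

mlsd = MLSDdetector()                         # fetches mlsd_large_512_fp32.pth from the Hub if no local copy exists
image = cv2.imread("room.jpg")                # placeholder path; HWC uint8 image
line_map = mlsd(image, thr_v=0.1, thr_d=0.1)  # single-channel uint8 map, detected segments drawn at 255

Note that the try/except in __call__ silently returns an all-zero map if line prediction fails.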
src/flux/annotator/mlsd/models/mbv2_mlsd_large.py ADDED
@@ -0,0 +1,292 @@
1
+ import os
2
+ import sys
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.utils.model_zoo as model_zoo
6
+ from torch.nn import functional as F
7
+
8
+
9
+ class BlockTypeA(nn.Module):
10
+ def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale = True):
11
+ super(BlockTypeA, self).__init__()
12
+ self.conv1 = nn.Sequential(
13
+ nn.Conv2d(in_c2, out_c2, kernel_size=1),
14
+ nn.BatchNorm2d(out_c2),
15
+ nn.ReLU(inplace=True)
16
+ )
17
+ self.conv2 = nn.Sequential(
18
+ nn.Conv2d(in_c1, out_c1, kernel_size=1),
19
+ nn.BatchNorm2d(out_c1),
20
+ nn.ReLU(inplace=True)
21
+ )
22
+ self.upscale = upscale
23
+
24
+ def forward(self, a, b):
25
+ b = self.conv1(b)
26
+ a = self.conv2(a)
27
+ if self.upscale:
28
+ b = F.interpolate(b, scale_factor=2.0, mode='bilinear', align_corners=True)
29
+ return torch.cat((a, b), dim=1)
30
+
31
+
32
+ class BlockTypeB(nn.Module):
33
+ def __init__(self, in_c, out_c):
34
+ super(BlockTypeB, self).__init__()
35
+ self.conv1 = nn.Sequential(
36
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
37
+ nn.BatchNorm2d(in_c),
38
+ nn.ReLU()
39
+ )
40
+ self.conv2 = nn.Sequential(
41
+ nn.Conv2d(in_c, out_c, kernel_size=3, padding=1),
42
+ nn.BatchNorm2d(out_c),
43
+ nn.ReLU()
44
+ )
45
+
46
+ def forward(self, x):
47
+ x = self.conv1(x) + x
48
+ x = self.conv2(x)
49
+ return x
50
+
51
+ class BlockTypeC(nn.Module):
52
+ def __init__(self, in_c, out_c):
53
+ super(BlockTypeC, self).__init__()
54
+ self.conv1 = nn.Sequential(
55
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=5, dilation=5),
56
+ nn.BatchNorm2d(in_c),
57
+ nn.ReLU()
58
+ )
59
+ self.conv2 = nn.Sequential(
60
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
61
+ nn.BatchNorm2d(in_c),
62
+ nn.ReLU()
63
+ )
64
+ self.conv3 = nn.Conv2d(in_c, out_c, kernel_size=1)
65
+
66
+ def forward(self, x):
67
+ x = self.conv1(x)
68
+ x = self.conv2(x)
69
+ x = self.conv3(x)
70
+ return x
71
+
72
+ def _make_divisible(v, divisor, min_value=None):
73
+ """
74
+ This function is taken from the original tf repo.
75
+ It ensures that all layers have a channel number that is divisible by 8
76
+ It can be seen here:
77
+ https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
78
+ :param v:
79
+ :param divisor:
80
+ :param min_value:
81
+ :return:
82
+ """
83
+ if min_value is None:
84
+ min_value = divisor
85
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
86
+ # Make sure that round down does not go down by more than 10%.
87
+ if new_v < 0.9 * v:
88
+ new_v += divisor
89
+ return new_v
90
+
91
+
92
+ class ConvBNReLU(nn.Sequential):
93
+ def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
94
+ self.channel_pad = out_planes - in_planes
95
+ self.stride = stride
96
+ #padding = (kernel_size - 1) // 2
97
+
98
+ # TFLite uses slightly different padding than PyTorch
99
+ if stride == 2:
100
+ padding = 0
101
+ else:
102
+ padding = (kernel_size - 1) // 2
103
+
104
+ super(ConvBNReLU, self).__init__(
105
+ nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
106
+ nn.BatchNorm2d(out_planes),
107
+ nn.ReLU6(inplace=True)
108
+ )
109
+ self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
110
+
111
+
112
+ def forward(self, x):
113
+ # TFLite uses different padding
114
+ if self.stride == 2:
115
+ x = F.pad(x, (0, 1, 0, 1), "constant", 0)
116
+ #print(x.shape)
117
+
118
+ for module in self:
119
+ if not isinstance(module, nn.MaxPool2d):
120
+ x = module(x)
121
+ return x
122
+
123
+
124
+ class InvertedResidual(nn.Module):
125
+ def __init__(self, inp, oup, stride, expand_ratio):
126
+ super(InvertedResidual, self).__init__()
127
+ self.stride = stride
128
+ assert stride in [1, 2]
129
+
130
+ hidden_dim = int(round(inp * expand_ratio))
131
+ self.use_res_connect = self.stride == 1 and inp == oup
132
+
133
+ layers = []
134
+ if expand_ratio != 1:
135
+ # pw
136
+ layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
137
+ layers.extend([
138
+ # dw
139
+ ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
140
+ # pw-linear
141
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
142
+ nn.BatchNorm2d(oup),
143
+ ])
144
+ self.conv = nn.Sequential(*layers)
145
+
146
+ def forward(self, x):
147
+ if self.use_res_connect:
148
+ return x + self.conv(x)
149
+ else:
150
+ return self.conv(x)
151
+
152
+
153
+ class MobileNetV2(nn.Module):
154
+ def __init__(self, pretrained=True):
155
+ """
156
+ MobileNet V2 main class
157
+ Args:
158
+ pretrained (bool): If True, load the torchvision ImageNet MobileNetV2 weights via _load_pretrained_model().
160
+ The remaining knobs of the reference implementation (num_classes, width_mult,
161
+ inverted_residual_setting, round_nearest, block) are fixed inside this trimmed copy:
162
+ width_mult=1.0, round_nearest=8, block=InvertedResidual, the last two stages are
163
+ commented out, and the classifier head is dropped so only the feature stages remain.
164
+ fpn_selected marks which intermediate feature maps are returned.
164
+ """
165
+ super(MobileNetV2, self).__init__()
166
+
167
+ block = InvertedResidual
168
+ input_channel = 32
169
+ last_channel = 1280
170
+ width_mult = 1.0
171
+ round_nearest = 8
172
+
173
+ inverted_residual_setting = [
174
+ # t, c, n, s
175
+ [1, 16, 1, 1],
176
+ [6, 24, 2, 2],
177
+ [6, 32, 3, 2],
178
+ [6, 64, 4, 2],
179
+ [6, 96, 3, 1],
180
+ #[6, 160, 3, 2],
181
+ #[6, 320, 1, 1],
182
+ ]
183
+
184
+ # only check the first element, assuming user knows t,c,n,s are required
185
+ if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
186
+ raise ValueError("inverted_residual_setting should be non-empty "
187
+ "or a 4-element list, got {}".format(inverted_residual_setting))
188
+
189
+ # building first layer
190
+ input_channel = _make_divisible(input_channel * width_mult, round_nearest)
191
+ self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
192
+ features = [ConvBNReLU(4, input_channel, stride=2)]
193
+ # building inverted residual blocks
194
+ for t, c, n, s in inverted_residual_setting:
195
+ output_channel = _make_divisible(c * width_mult, round_nearest)
196
+ for i in range(n):
197
+ stride = s if i == 0 else 1
198
+ features.append(block(input_channel, output_channel, stride, expand_ratio=t))
199
+ input_channel = output_channel
200
+
201
+ self.features = nn.Sequential(*features)
202
+ self.fpn_selected = [1, 3, 6, 10, 13]
203
+ # weight initialization
204
+ for m in self.modules():
205
+ if isinstance(m, nn.Conv2d):
206
+ nn.init.kaiming_normal_(m.weight, mode='fan_out')
207
+ if m.bias is not None:
208
+ nn.init.zeros_(m.bias)
209
+ elif isinstance(m, nn.BatchNorm2d):
210
+ nn.init.ones_(m.weight)
211
+ nn.init.zeros_(m.bias)
212
+ elif isinstance(m, nn.Linear):
213
+ nn.init.normal_(m.weight, 0, 0.01)
214
+ nn.init.zeros_(m.bias)
215
+ if pretrained:
216
+ self._load_pretrained_model()
217
+
218
+ def _forward_impl(self, x):
219
+ # This exists since TorchScript doesn't support inheritance, so the superclass method
220
+ # (this one) needs to have a name other than `forward` that can be accessed in a subclass
221
+ fpn_features = []
222
+ for i, f in enumerate(self.features):
223
+ if i > self.fpn_selected[-1]:
224
+ break
225
+ x = f(x)
226
+ if i in self.fpn_selected:
227
+ fpn_features.append(x)
228
+
229
+ c1, c2, c3, c4, c5 = fpn_features
230
+ return c1, c2, c3, c4, c5
231
+
232
+
233
+ def forward(self, x):
234
+ return self._forward_impl(x)
235
+
236
+ def _load_pretrained_model(self):
237
+ pretrain_dict = model_zoo.load_url('https://download.pytorch.org/models/mobilenet_v2-b0353104.pth')
238
+ model_dict = {}
239
+ state_dict = self.state_dict()
240
+ for k, v in pretrain_dict.items():
241
+ if k in state_dict:
242
+ model_dict[k] = v
243
+ state_dict.update(model_dict)
244
+ self.load_state_dict(state_dict)
245
+
246
+
247
+ class MobileV2_MLSD_Large(nn.Module):
248
+ def __init__(self):
249
+ super(MobileV2_MLSD_Large, self).__init__()
250
+
251
+ self.backbone = MobileNetV2(pretrained=False)
252
+ ## A, B
253
+ self.block15 = BlockTypeA(in_c1= 64, in_c2= 96,
254
+ out_c1= 64, out_c2=64,
255
+ upscale=False)
256
+ self.block16 = BlockTypeB(128, 64)
257
+
258
+ ## A, B
259
+ self.block17 = BlockTypeA(in_c1 = 32, in_c2 = 64,
260
+ out_c1= 64, out_c2= 64)
261
+ self.block18 = BlockTypeB(128, 64)
262
+
263
+ ## A, B
264
+ self.block19 = BlockTypeA(in_c1=24, in_c2=64,
265
+ out_c1=64, out_c2=64)
266
+ self.block20 = BlockTypeB(128, 64)
267
+
268
+ ## A, B, C
269
+ self.block21 = BlockTypeA(in_c1=16, in_c2=64,
270
+ out_c1=64, out_c2=64)
271
+ self.block22 = BlockTypeB(128, 64)
272
+
273
+ self.block23 = BlockTypeC(64, 16)
274
+
275
+ def forward(self, x):
276
+ c1, c2, c3, c4, c5 = self.backbone(x)
277
+
278
+ x = self.block15(c4, c5)
279
+ x = self.block16(x)
280
+
281
+ x = self.block17(c3, x)
282
+ x = self.block18(x)
283
+
284
+ x = self.block19(c2, x)
285
+ x = self.block20(x)
286
+
287
+ x = self.block21(c1, x)
288
+ x = self.block22(x)
289
+ x = self.block23(x)
290
+ x = x[:, 7:, :, :]
291
+
292
+ return x
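A minimal smoke-test sketch for the large M-LSD head defined above (assuming the package is importable as `flux`; the 4-channel dummy input and the output shape are inferred from the code, not stated in the commit):

    import torch
    from flux.annotator.mlsd.models.mbv2_mlsd_large import MobileV2_MLSD_Large

    model = MobileV2_MLSD_Large().eval()        # backbone built with pretrained=False, no download
    with torch.no_grad():
        dummy = torch.zeros(1, 4, 512, 512)     # RGB plus a ones channel, as assembled in utils.pred_lines
        out = model(dummy)
    print(out.shape)                            # torch.Size([1, 9, 256, 256]): center/displacement maps at half resolution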
src/flux/annotator/mlsd/models/mbv2_mlsd_tiny.py ADDED
@@ -0,0 +1,275 @@
1
+ import os
2
+ import sys
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.utils.model_zoo as model_zoo
6
+ from torch.nn import functional as F
7
+
8
+
9
+ class BlockTypeA(nn.Module):
10
+ def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale = True):
11
+ super(BlockTypeA, self).__init__()
12
+ self.conv1 = nn.Sequential(
13
+ nn.Conv2d(in_c2, out_c2, kernel_size=1),
14
+ nn.BatchNorm2d(out_c2),
15
+ nn.ReLU(inplace=True)
16
+ )
17
+ self.conv2 = nn.Sequential(
18
+ nn.Conv2d(in_c1, out_c1, kernel_size=1),
19
+ nn.BatchNorm2d(out_c1),
20
+ nn.ReLU(inplace=True)
21
+ )
22
+ self.upscale = upscale
23
+
24
+ def forward(self, a, b):
25
+ b = self.conv1(b)
26
+ a = self.conv2(a)
27
+ b = F.interpolate(b, scale_factor=2.0, mode='bilinear', align_corners=True)
28
+ return torch.cat((a, b), dim=1)
29
+
30
+
31
+ class BlockTypeB(nn.Module):
32
+ def __init__(self, in_c, out_c):
33
+ super(BlockTypeB, self).__init__()
34
+ self.conv1 = nn.Sequential(
35
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
36
+ nn.BatchNorm2d(in_c),
37
+ nn.ReLU()
38
+ )
39
+ self.conv2 = nn.Sequential(
40
+ nn.Conv2d(in_c, out_c, kernel_size=3, padding=1),
41
+ nn.BatchNorm2d(out_c),
42
+ nn.ReLU()
43
+ )
44
+
45
+ def forward(self, x):
46
+ x = self.conv1(x) + x
47
+ x = self.conv2(x)
48
+ return x
49
+
50
+ class BlockTypeC(nn.Module):
51
+ def __init__(self, in_c, out_c):
52
+ super(BlockTypeC, self).__init__()
53
+ self.conv1 = nn.Sequential(
54
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=5, dilation=5),
55
+ nn.BatchNorm2d(in_c),
56
+ nn.ReLU()
57
+ )
58
+ self.conv2 = nn.Sequential(
59
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
60
+ nn.BatchNorm2d(in_c),
61
+ nn.ReLU()
62
+ )
63
+ self.conv3 = nn.Conv2d(in_c, out_c, kernel_size=1)
64
+
65
+ def forward(self, x):
66
+ x = self.conv1(x)
67
+ x = self.conv2(x)
68
+ x = self.conv3(x)
69
+ return x
70
+
71
+ def _make_divisible(v, divisor, min_value=None):
72
+ """
73
+ This function is taken from the original tf repo.
74
+ It ensures that all layers have a channel number that is divisible by 8
75
+ It can be seen here:
76
+ https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
77
+ :param v:
78
+ :param divisor:
79
+ :param min_value:
80
+ :return:
81
+ """
82
+ if min_value is None:
83
+ min_value = divisor
84
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
85
+ # Make sure that round down does not go down by more than 10%.
86
+ if new_v < 0.9 * v:
87
+ new_v += divisor
88
+ return new_v
89
+
90
+
91
+ class ConvBNReLU(nn.Sequential):
92
+ def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
93
+ self.channel_pad = out_planes - in_planes
94
+ self.stride = stride
95
+ #padding = (kernel_size - 1) // 2
96
+
97
+ # TFLite uses slightly different padding than PyTorch
98
+ if stride == 2:
99
+ padding = 0
100
+ else:
101
+ padding = (kernel_size - 1) // 2
102
+
103
+ super(ConvBNReLU, self).__init__(
104
+ nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
105
+ nn.BatchNorm2d(out_planes),
106
+ nn.ReLU6(inplace=True)
107
+ )
108
+ self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
109
+
110
+
111
+ def forward(self, x):
112
+ # TFLite uses different padding
113
+ if self.stride == 2:
114
+ x = F.pad(x, (0, 1, 0, 1), "constant", 0)
115
+ #print(x.shape)
116
+
117
+ for module in self:
118
+ if not isinstance(module, nn.MaxPool2d):
119
+ x = module(x)
120
+ return x
121
+
122
+
123
+ class InvertedResidual(nn.Module):
124
+ def __init__(self, inp, oup, stride, expand_ratio):
125
+ super(InvertedResidual, self).__init__()
126
+ self.stride = stride
127
+ assert stride in [1, 2]
128
+
129
+ hidden_dim = int(round(inp * expand_ratio))
130
+ self.use_res_connect = self.stride == 1 and inp == oup
131
+
132
+ layers = []
133
+ if expand_ratio != 1:
134
+ # pw
135
+ layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
136
+ layers.extend([
137
+ # dw
138
+ ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
139
+ # pw-linear
140
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
141
+ nn.BatchNorm2d(oup),
142
+ ])
143
+ self.conv = nn.Sequential(*layers)
144
+
145
+ def forward(self, x):
146
+ if self.use_res_connect:
147
+ return x + self.conv(x)
148
+ else:
149
+ return self.conv(x)
150
+
151
+
152
+ class MobileNetV2(nn.Module):
153
+ def __init__(self, pretrained=True):
154
+ """
155
+ MobileNet V2 main class
156
+ Args:
157
+ num_classes (int): Number of classes
158
+ width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
159
+ inverted_residual_setting: Network structure
160
+ round_nearest (int): Round the number of channels in each layer to be a multiple of this number
161
+ Set to 1 to turn off rounding
162
+ block: Module specifying inverted residual building block for mobilenet
163
+ """
164
+ super(MobileNetV2, self).__init__()
165
+
166
+ block = InvertedResidual
167
+ input_channel = 32
168
+ last_channel = 1280
169
+ width_mult = 1.0
170
+ round_nearest = 8
171
+
172
+ inverted_residual_setting = [
173
+ # t, c, n, s
174
+ [1, 16, 1, 1],
175
+ [6, 24, 2, 2],
176
+ [6, 32, 3, 2],
177
+ [6, 64, 4, 2],
178
+ #[6, 96, 3, 1],
179
+ #[6, 160, 3, 2],
180
+ #[6, 320, 1, 1],
181
+ ]
182
+
183
+ # only check the first element, assuming user knows t,c,n,s are required
184
+ if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
185
+ raise ValueError("inverted_residual_setting should be non-empty "
186
+ "or a 4-element list, got {}".format(inverted_residual_setting))
187
+
188
+ # building first layer
189
+ input_channel = _make_divisible(input_channel * width_mult, round_nearest)
190
+ self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
191
+ features = [ConvBNReLU(4, input_channel, stride=2)]
192
+ # building inverted residual blocks
193
+ for t, c, n, s in inverted_residual_setting:
194
+ output_channel = _make_divisible(c * width_mult, round_nearest)
195
+ for i in range(n):
196
+ stride = s if i == 0 else 1
197
+ features.append(block(input_channel, output_channel, stride, expand_ratio=t))
198
+ input_channel = output_channel
199
+ self.features = nn.Sequential(*features)
200
+
201
+ self.fpn_selected = [3, 6, 10]
202
+ # weight initialization
203
+ for m in self.modules():
204
+ if isinstance(m, nn.Conv2d):
205
+ nn.init.kaiming_normal_(m.weight, mode='fan_out')
206
+ if m.bias is not None:
207
+ nn.init.zeros_(m.bias)
208
+ elif isinstance(m, nn.BatchNorm2d):
209
+ nn.init.ones_(m.weight)
210
+ nn.init.zeros_(m.bias)
211
+ elif isinstance(m, nn.Linear):
212
+ nn.init.normal_(m.weight, 0, 0.01)
213
+ nn.init.zeros_(m.bias)
214
+
215
+ #if pretrained:
216
+ # self._load_pretrained_model()
217
+
218
+ def _forward_impl(self, x):
219
+ # This exists since TorchScript doesn't support inheritance, so the superclass method
220
+ # (this one) needs to have a name other than `forward` that can be accessed in a subclass
221
+ fpn_features = []
222
+ for i, f in enumerate(self.features):
223
+ if i > self.fpn_selected[-1]:
224
+ break
225
+ x = f(x)
226
+ if i in self.fpn_selected:
227
+ fpn_features.append(x)
228
+
229
+ c2, c3, c4 = fpn_features
230
+ return c2, c3, c4
231
+
232
+
233
+ def forward(self, x):
234
+ return self._forward_impl(x)
235
+
236
+ def _load_pretrained_model(self):
237
+ pretrain_dict = model_zoo.load_url('https://download.pytorch.org/models/mobilenet_v2-b0353104.pth')
238
+ model_dict = {}
239
+ state_dict = self.state_dict()
240
+ for k, v in pretrain_dict.items():
241
+ if k in state_dict:
242
+ model_dict[k] = v
243
+ state_dict.update(model_dict)
244
+ self.load_state_dict(state_dict)
245
+
246
+
247
+ class MobileV2_MLSD_Tiny(nn.Module):
248
+ def __init__(self):
249
+ super(MobileV2_MLSD_Tiny, self).__init__()
250
+
251
+ self.backbone = MobileNetV2(pretrained=True)
252
+
253
+ self.block12 = BlockTypeA(in_c1= 32, in_c2= 64,
254
+ out_c1= 64, out_c2=64)
255
+ self.block13 = BlockTypeB(128, 64)
256
+
257
+ self.block14 = BlockTypeA(in_c1 = 24, in_c2 = 64,
258
+ out_c1= 32, out_c2= 32)
259
+ self.block15 = BlockTypeB(64, 64)
260
+
261
+ self.block16 = BlockTypeC(64, 16)
262
+
263
+ def forward(self, x):
264
+ c2, c3, c4 = self.backbone(x)
265
+
266
+ x = self.block12(c3, c4)
267
+ x = self.block13(x)
268
+ x = self.block14(c2, x)
269
+ x = self.block15(x)
270
+ x = self.block16(x)
271
+ x = x[:, 7:, :, :]
272
+ #print(x.shape)
273
+ x = F.interpolate(x, scale_factor=2.0, mode='bilinear', align_corners=True)
274
+
275
+ return x
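The tiny variant can be exercised the same way; under the same assumptions (importable `flux` package, shapes derived from the code) it yields the same 9-channel, 256x256 map for a 512x512 input, since the final bilinear interpolation doubles its quarter-resolution feature map:

    import torch
    from flux.annotator.mlsd.models.mbv2_mlsd_tiny import MobileV2_MLSD_Tiny

    model = MobileV2_MLSD_Tiny().eval()         # the pretrained load above is commented out, so no download
    with torch.no_grad():
        out = model(torch.zeros(1, 4, 512, 512))
    print(out.shape)                            # torch.Size([1, 9, 256, 256])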
src/flux/annotator/mlsd/utils.py ADDED
@@ -0,0 +1,580 @@
1
+ '''
2
+ modified by lihaoweicv
3
+ pytorch version
4
+ '''
5
+
6
+ '''
7
+ M-LSD
8
+ Copyright 2021-present NAVER Corp.
9
+ Apache License v2.0
10
+ '''
11
+
12
+ import os
13
+ import numpy as np
14
+ import cv2
15
+ import torch
16
+ from torch.nn import functional as F
17
+
18
+
19
+ def deccode_output_score_and_ptss(tpMap, topk_n = 200, ksize = 5):
20
+ '''
21
+ tpMap:
22
+ center: tpMap[1, 0, :, :]
23
+ displacement: tpMap[1, 1:5, :, :]
24
+ '''
25
+ b, c, h, w = tpMap.shape
26
+ assert b==1, 'only support bsize==1'
27
+ displacement = tpMap[:, 1:5, :, :][0]
28
+ center = tpMap[:, 0, :, :]
29
+ heat = torch.sigmoid(center)
30
+ hmax = F.max_pool2d( heat, (ksize, ksize), stride=1, padding=(ksize-1)//2)
31
+ keep = (hmax == heat).float()
32
+ heat = heat * keep
33
+ heat = heat.reshape(-1, )
34
+
35
+ scores, indices = torch.topk(heat, topk_n, dim=-1, largest=True)
36
+ yy = torch.floor_divide(indices, w).unsqueeze(-1)
37
+ xx = torch.fmod(indices, w).unsqueeze(-1)
38
+ ptss = torch.cat((yy, xx),dim=-1)
39
+
40
+ ptss = ptss.detach().cpu().numpy()
41
+ scores = scores.detach().cpu().numpy()
42
+ displacement = displacement.detach().cpu().numpy()
43
+ displacement = displacement.transpose((1,2,0))
44
+ return ptss, scores, displacement
45
+
46
+
47
+ def pred_lines(image, model,
48
+ input_shape=[512, 512],
49
+ score_thr=0.10,
50
+ dist_thr=20.0):
51
+ h, w, _ = image.shape
52
+ h_ratio, w_ratio = [h / input_shape[0], w / input_shape[1]]
53
+
54
+ resized_image = np.concatenate([cv2.resize(image, (input_shape[1], input_shape[0]), interpolation=cv2.INTER_AREA),
55
+ np.ones([input_shape[0], input_shape[1], 1])], axis=-1)
56
+
57
+ resized_image = resized_image.transpose((2,0,1))
58
+ batch_image = np.expand_dims(resized_image, axis=0).astype('float32')
59
+ batch_image = (batch_image / 127.5) - 1.0
60
+
61
+ batch_image = torch.from_numpy(batch_image).float().to("cuda:4")
62
+ outputs = model(batch_image)
63
+ pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3)
64
+ start = vmap[:, :, :2]
65
+ end = vmap[:, :, 2:]
66
+ dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1))
67
+
68
+ segments_list = []
69
+ for center, score in zip(pts, pts_score):
70
+ y, x = center
71
+ distance = dist_map[y, x]
72
+ if score > score_thr and distance > dist_thr:
73
+ disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :]
74
+ x_start = x + disp_x_start
75
+ y_start = y + disp_y_start
76
+ x_end = x + disp_x_end
77
+ y_end = y + disp_y_end
78
+ segments_list.append([x_start, y_start, x_end, y_end])
79
+
80
+ lines = 2 * np.array(segments_list)  # scale from the 256 output map back to the 512 input resolution
81
+ lines[:, 0] = lines[:, 0] * w_ratio
82
+ lines[:, 1] = lines[:, 1] * h_ratio
83
+ lines[:, 2] = lines[:, 2] * w_ratio
84
+ lines[:, 3] = lines[:, 3] * h_ratio
85
+
86
+ return lines
87
+
88
+
89
+ def pred_squares(image,
90
+ model,
91
+ input_shape=[512, 512],
92
+ params={'score': 0.06,
93
+ 'outside_ratio': 0.28,
94
+ 'inside_ratio': 0.45,
95
+ 'w_overlap': 0.0,
96
+ 'w_degree': 1.95,
97
+ 'w_length': 0.0,
98
+ 'w_area': 1.86,
99
+ 'w_center': 0.14}):
100
+ '''
101
+ shape = [height, width]
102
+ '''
103
+ h, w, _ = image.shape
104
+ original_shape = [h, w]
105
+
106
+ resized_image = np.concatenate([cv2.resize(image, (input_shape[0], input_shape[1]), interpolation=cv2.INTER_AREA),
107
+ np.ones([input_shape[0], input_shape[1], 1])], axis=-1)
108
+ resized_image = resized_image.transpose((2, 0, 1))
109
+ batch_image = np.expand_dims(resized_image, axis=0).astype('float32')
110
+ batch_image = (batch_image / 127.5) - 1.0
111
+
112
+ batch_image = torch.from_numpy(batch_image).float().cuda()
113
+ outputs = model(batch_image)
114
+
115
+ pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3)
116
+ start = vmap[:, :, :2] # (x, y)
117
+ end = vmap[:, :, 2:] # (x, y)
118
+ dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1))
119
+
120
+ junc_list = []
121
+ segments_list = []
122
+ for junc, score in zip(pts, pts_score):
123
+ y, x = junc
124
+ distance = dist_map[y, x]
125
+ if score > params['score'] and distance > 20.0:
126
+ junc_list.append([x, y])
127
+ disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :]
128
+ d_arrow = 1.0
129
+ x_start = x + d_arrow * disp_x_start
130
+ y_start = y + d_arrow * disp_y_start
131
+ x_end = x + d_arrow * disp_x_end
132
+ y_end = y + d_arrow * disp_y_end
133
+ segments_list.append([x_start, y_start, x_end, y_end])
134
+
135
+ segments = np.array(segments_list)
136
+
137
+ ####### post processing for squares
138
+ # 1. get unique lines
139
+ point = np.array([[0, 0]])
140
+ point = point[0]
141
+ start = segments[:, :2]
142
+ end = segments[:, 2:]
143
+ diff = start - end
144
+ a = diff[:, 1]
145
+ b = -diff[:, 0]
146
+ c = a * start[:, 0] + b * start[:, 1]
147
+
148
+ d = np.abs(a * point[0] + b * point[1] - c) / np.sqrt(a ** 2 + b ** 2 + 1e-10)
149
+ theta = np.arctan2(diff[:, 0], diff[:, 1]) * 180 / np.pi
150
+ theta[theta < 0.0] += 180
151
+ hough = np.concatenate([d[:, None], theta[:, None]], axis=-1)
152
+
153
+ d_quant = 1
154
+ theta_quant = 2
155
+ hough[:, 0] //= d_quant
156
+ hough[:, 1] //= theta_quant
157
+ _, indices, counts = np.unique(hough, axis=0, return_index=True, return_counts=True)
158
+
159
+ acc_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1], dtype='float32')
160
+ idx_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1], dtype='int32') - 1
161
+ yx_indices = hough[indices, :].astype('int32')
162
+ acc_map[yx_indices[:, 0], yx_indices[:, 1]] = counts
163
+ idx_map[yx_indices[:, 0], yx_indices[:, 1]] = indices
164
+
165
+ acc_map_np = acc_map
166
+ # acc_map = acc_map[None, :, :, None]
167
+ #
168
+ # ### fast suppression using tensorflow op
169
+ # acc_map = tf.constant(acc_map, dtype=tf.float32)
170
+ # max_acc_map = tf.keras.layers.MaxPool2D(pool_size=(5, 5), strides=1, padding='same')(acc_map)
171
+ # acc_map = acc_map * tf.cast(tf.math.equal(acc_map, max_acc_map), tf.float32)
172
+ # flatten_acc_map = tf.reshape(acc_map, [1, -1])
173
+ # topk_values, topk_indices = tf.math.top_k(flatten_acc_map, k=len(pts))
174
+ # _, h, w, _ = acc_map.shape
175
+ # y = tf.expand_dims(topk_indices // w, axis=-1)
176
+ # x = tf.expand_dims(topk_indices % w, axis=-1)
177
+ # yx = tf.concat([y, x], axis=-1)
178
+
179
+ ### fast suppression using pytorch op
180
+ acc_map = torch.from_numpy(acc_map_np).unsqueeze(0).unsqueeze(0)
181
+ _,_, h, w = acc_map.shape
182
+ max_acc_map = F.max_pool2d(acc_map,kernel_size=5, stride=1, padding=2)
183
+ acc_map = acc_map * ( (acc_map == max_acc_map).float() )
184
+ flatten_acc_map = acc_map.reshape([-1, ])
185
+
186
+ scores, indices = torch.topk(flatten_acc_map, len(pts), dim=-1, largest=True)
187
+ yy = torch.div(indices, w, rounding_mode='floor').unsqueeze(-1)
188
+ xx = torch.fmod(indices, w).unsqueeze(-1)
189
+ yx = torch.cat((yy, xx), dim=-1)
190
+
191
+ yx = yx.detach().cpu().numpy()
192
+
193
+ topk_values = scores.detach().cpu().numpy()
194
+ indices = idx_map[yx[:, 0], yx[:, 1]]
195
+ basis = 5 // 2
196
+
197
+ merged_segments = []
198
+ for yx_pt, max_indice, value in zip(yx, indices, topk_values):
199
+ y, x = yx_pt
200
+ if max_indice == -1 or value == 0:
201
+ continue
202
+ segment_list = []
203
+ for y_offset in range(-basis, basis + 1):
204
+ for x_offset in range(-basis, basis + 1):
205
+ indice = idx_map[y + y_offset, x + x_offset]
206
+ cnt = int(acc_map_np[y + y_offset, x + x_offset])
207
+ if indice != -1:
208
+ segment_list.append(segments[indice])
209
+ if cnt > 1:
210
+ check_cnt = 1
211
+ current_hough = hough[indice]
212
+ for new_indice, new_hough in enumerate(hough):
213
+ if (current_hough == new_hough).all() and indice != new_indice:
214
+ segment_list.append(segments[new_indice])
215
+ check_cnt += 1
216
+ if check_cnt == cnt:
217
+ break
218
+ group_segments = np.array(segment_list).reshape([-1, 2])
219
+ sorted_group_segments = np.sort(group_segments, axis=0)
220
+ x_min, y_min = sorted_group_segments[0, :]
221
+ x_max, y_max = sorted_group_segments[-1, :]
222
+
223
+ deg = theta[max_indice]
224
+ if deg >= 90:
225
+ merged_segments.append([x_min, y_max, x_max, y_min])
226
+ else:
227
+ merged_segments.append([x_min, y_min, x_max, y_max])
228
+
229
+ # 2. get intersections
230
+ new_segments = np.array(merged_segments) # (x1, y1, x2, y2)
231
+ start = new_segments[:, :2] # (x1, y1)
232
+ end = new_segments[:, 2:] # (x2, y2)
233
+ new_centers = (start + end) / 2.0
234
+ diff = start - end
235
+ dist_segments = np.sqrt(np.sum(diff ** 2, axis=-1))
236
+
237
+ # ax + by = c
238
+ a = diff[:, 1]
239
+ b = -diff[:, 0]
240
+ c = a * start[:, 0] + b * start[:, 1]
241
+ pre_det = a[:, None] * b[None, :]
242
+ det = pre_det - np.transpose(pre_det)
243
+
244
+ pre_inter_y = a[:, None] * c[None, :]
245
+ inter_y = (pre_inter_y - np.transpose(pre_inter_y)) / (det + 1e-10)
246
+ pre_inter_x = c[:, None] * b[None, :]
247
+ inter_x = (pre_inter_x - np.transpose(pre_inter_x)) / (det + 1e-10)
248
+ inter_pts = np.concatenate([inter_x[:, :, None], inter_y[:, :, None]], axis=-1).astype('int32')
249
+
250
+ # 3. get corner information
251
+ # 3.1 get distance
252
+ '''
253
+ dist_segments:
254
+ | dist(0), dist(1), dist(2), ...|
255
+ dist_inter_to_segment1:
256
+ | dist(inter,0), dist(inter,0), dist(inter,0), ... |
257
+ | dist(inter,1), dist(inter,1), dist(inter,1), ... |
258
+ ...
259
+ dist_inter_to_semgnet2:
260
+ | dist(inter,0), dist(inter,1), dist(inter,2), ... |
261
+ | dist(inter,0), dist(inter,1), dist(inter,2), ... |
262
+ ...
263
+ '''
264
+
265
+ dist_inter_to_segment1_start = np.sqrt(
266
+ np.sum(((inter_pts - start[:, None, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
267
+ dist_inter_to_segment1_end = np.sqrt(
268
+ np.sum(((inter_pts - end[:, None, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
269
+ dist_inter_to_segment2_start = np.sqrt(
270
+ np.sum(((inter_pts - start[None, :, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
271
+ dist_inter_to_segment2_end = np.sqrt(
272
+ np.sum(((inter_pts - end[None, :, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
273
+
274
+ # sort ascending
275
+ dist_inter_to_segment1 = np.sort(
276
+ np.concatenate([dist_inter_to_segment1_start, dist_inter_to_segment1_end], axis=-1),
277
+ axis=-1) # [n_batch, n_batch, 2]
278
+ dist_inter_to_segment2 = np.sort(
279
+ np.concatenate([dist_inter_to_segment2_start, dist_inter_to_segment2_end], axis=-1),
280
+ axis=-1) # [n_batch, n_batch, 2]
281
+
282
+ # 3.2 get degree
283
+ inter_to_start = new_centers[:, None, :] - inter_pts
284
+ deg_inter_to_start = np.arctan2(inter_to_start[:, :, 1], inter_to_start[:, :, 0]) * 180 / np.pi
285
+ deg_inter_to_start[deg_inter_to_start < 0.0] += 360
286
+ inter_to_end = new_centers[None, :, :] - inter_pts
287
+ deg_inter_to_end = np.arctan2(inter_to_end[:, :, 1], inter_to_end[:, :, 0]) * 180 / np.pi
288
+ deg_inter_to_end[deg_inter_to_end < 0.0] += 360
289
+
290
+ '''
291
+ B -- G
292
+ | |
293
+ C -- R
294
+ B : blue / G: green / C: cyan / R: red
295
+
296
+ 0 -- 1
297
+ | |
298
+ 3 -- 2
299
+ '''
300
+ # rename variables
301
+ deg1_map, deg2_map = deg_inter_to_start, deg_inter_to_end
302
+ # sort deg ascending
303
+ deg_sort = np.sort(np.concatenate([deg1_map[:, :, None], deg2_map[:, :, None]], axis=-1), axis=-1)
304
+
305
+ deg_diff_map = np.abs(deg1_map - deg2_map)
306
+ # we only consider the smallest degree of intersect
307
+ deg_diff_map[deg_diff_map > 180] = 360 - deg_diff_map[deg_diff_map > 180]
308
+
309
+ # define available degree range
310
+ deg_range = [60, 120]
311
+
312
+ corner_dict = {corner_info: [] for corner_info in range(4)}
313
+ inter_points = []
314
+ for i in range(inter_pts.shape[0]):
315
+ for j in range(i + 1, inter_pts.shape[1]):
316
+ # i, j > line index, always i < j
317
+ x, y = inter_pts[i, j, :]
318
+ deg1, deg2 = deg_sort[i, j, :]
319
+ deg_diff = deg_diff_map[i, j]
320
+
321
+ check_degree = deg_diff > deg_range[0] and deg_diff < deg_range[1]
322
+
323
+ outside_ratio = params['outside_ratio'] # over ratio >>> drop it!
324
+ inside_ratio = params['inside_ratio'] # over ratio >>> drop it!
325
+ check_distance = ((dist_inter_to_segment1[i, j, 1] >= dist_segments[i] and \
326
+ dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * outside_ratio) or \
327
+ (dist_inter_to_segment1[i, j, 1] <= dist_segments[i] and \
328
+ dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * inside_ratio)) and \
329
+ ((dist_inter_to_segment2[i, j, 1] >= dist_segments[j] and \
330
+ dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * outside_ratio) or \
331
+ (dist_inter_to_segment2[i, j, 1] <= dist_segments[j] and \
332
+ dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * inside_ratio))
333
+
334
+ if check_degree and check_distance:
335
+ corner_info = None
336
+
337
+ if (deg1 >= 0 and deg1 <= 45 and deg2 >= 45 and deg2 <= 120) or \
338
+ (deg2 >= 315 and deg1 >= 45 and deg1 <= 120):
339
+ corner_info, color_info = 0, 'blue'
340
+ elif (deg1 >= 45 and deg1 <= 125 and deg2 >= 125 and deg2 <= 225):
341
+ corner_info, color_info = 1, 'green'
342
+ elif (deg1 >= 125 and deg1 <= 225 and deg2 >= 225 and deg2 <= 315):
343
+ corner_info, color_info = 2, 'black'
344
+ elif (deg1 >= 0 and deg1 <= 45 and deg2 >= 225 and deg2 <= 315) or \
345
+ (deg2 >= 315 and deg1 >= 225 and deg1 <= 315):
346
+ corner_info, color_info = 3, 'cyan'
347
+ else:
348
+ corner_info, color_info = 4, 'red' # we don't use it
349
+ continue
350
+
351
+ corner_dict[corner_info].append([x, y, i, j])
352
+ inter_points.append([x, y])
353
+
354
+ square_list = []
355
+ connect_list = []
356
+ segments_list = []
357
+ for corner0 in corner_dict[0]:
358
+ for corner1 in corner_dict[1]:
359
+ connect01 = False
360
+ for corner0_line in corner0[2:]:
361
+ if corner0_line in corner1[2:]:
362
+ connect01 = True
363
+ break
364
+ if connect01:
365
+ for corner2 in corner_dict[2]:
366
+ connect12 = False
367
+ for corner1_line in corner1[2:]:
368
+ if corner1_line in corner2[2:]:
369
+ connect12 = True
370
+ break
371
+ if connect12:
372
+ for corner3 in corner_dict[3]:
373
+ connect23 = False
374
+ for corner2_line in corner2[2:]:
375
+ if corner2_line in corner3[2:]:
376
+ connect23 = True
377
+ break
378
+ if connect23:
379
+ for corner3_line in corner3[2:]:
380
+ if corner3_line in corner0[2:]:
381
+ # SQUARE!!!
382
+ '''
383
+ 0 -- 1
384
+ | |
385
+ 3 -- 2
386
+ square_list:
387
+ order: 0 > 1 > 2 > 3
388
+ | x0, y0, x1, y1, x2, y2, x3, y3 |
389
+ | x0, y0, x1, y1, x2, y2, x3, y3 |
390
+ ...
391
+ connect_list:
392
+ order: 01 > 12 > 23 > 30
393
+ | line_idx01, line_idx12, line_idx23, line_idx30 |
394
+ | line_idx01, line_idx12, line_idx23, line_idx30 |
395
+ ...
396
+ segments_list:
397
+ order: 0 > 1 > 2 > 3
398
+ | line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i, line_idx2_j, line_idx3_i, line_idx3_j |
399
+ | line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i, line_idx2_j, line_idx3_i, line_idx3_j |
400
+ ...
401
+ '''
402
+ square_list.append(corner0[:2] + corner1[:2] + corner2[:2] + corner3[:2])
403
+ connect_list.append([corner0_line, corner1_line, corner2_line, corner3_line])
404
+ segments_list.append(corner0[2:] + corner1[2:] + corner2[2:] + corner3[2:])
405
+
406
+ def check_outside_inside(segments_info, connect_idx):
407
+ # return 'outside or inside', min distance, cover_param, peri_param
408
+ if connect_idx == segments_info[0]:
409
+ check_dist_mat = dist_inter_to_segment1
410
+ else:
411
+ check_dist_mat = dist_inter_to_segment2
412
+
413
+ i, j = segments_info
414
+ min_dist, max_dist = check_dist_mat[i, j, :]
415
+ connect_dist = dist_segments[connect_idx]
416
+ if max_dist > connect_dist:
417
+ return 'outside', min_dist, 0, 1
418
+ else:
419
+ return 'inside', min_dist, -1, -1
420
+
421
+ top_square = None
422
+
423
+ try:
424
+ map_size = input_shape[0] / 2
425
+ squares = np.array(square_list).reshape([-1, 4, 2])
426
+ score_array = []
427
+ connect_array = np.array(connect_list)
428
+ segments_array = np.array(segments_list).reshape([-1, 4, 2])
429
+
430
+ # get degree of corners:
431
+ squares_rollup = np.roll(squares, 1, axis=1)
432
+ squares_rolldown = np.roll(squares, -1, axis=1)
433
+ vec1 = squares_rollup - squares
434
+ normalized_vec1 = vec1 / (np.linalg.norm(vec1, axis=-1, keepdims=True) + 1e-10)
435
+ vec2 = squares_rolldown - squares
436
+ normalized_vec2 = vec2 / (np.linalg.norm(vec2, axis=-1, keepdims=True) + 1e-10)
437
+ inner_products = np.sum(normalized_vec1 * normalized_vec2, axis=-1) # [n_squares, 4]
438
+ squares_degree = np.arccos(inner_products) * 180 / np.pi # [n_squares, 4]
439
+
440
+ # get square score
441
+ overlap_scores = []
442
+ degree_scores = []
443
+ length_scores = []
444
+
445
+ for connects, segments, square, degree in zip(connect_array, segments_array, squares, squares_degree):
446
+ '''
447
+ 0 -- 1
448
+ | |
449
+ 3 -- 2
450
+
451
+ # segments: [4, 2]
452
+ # connects: [4]
453
+ '''
454
+
455
+ ###################################### OVERLAP SCORES
456
+ cover = 0
457
+ perimeter = 0
458
+ # check 0 > 1 > 2 > 3
459
+ square_length = []
460
+
461
+ for start_idx in range(4):
462
+ end_idx = (start_idx + 1) % 4
463
+
464
+ connect_idx = connects[start_idx] # segment idx of segment01
465
+ start_segments = segments[start_idx]
466
+ end_segments = segments[end_idx]
467
+
468
+ start_point = square[start_idx]
469
+ end_point = square[end_idx]
470
+
471
+ # check whether outside or inside
472
+ start_position, start_min, start_cover_param, start_peri_param = check_outside_inside(start_segments,
473
+ connect_idx)
474
+ end_position, end_min, end_cover_param, end_peri_param = check_outside_inside(end_segments, connect_idx)
475
+
476
+ cover += dist_segments[connect_idx] + start_cover_param * start_min + end_cover_param * end_min
477
+ perimeter += dist_segments[connect_idx] + start_peri_param * start_min + end_peri_param * end_min
478
+
479
+ square_length.append(
480
+ dist_segments[connect_idx] + start_peri_param * start_min + end_peri_param * end_min)
481
+
482
+ overlap_scores.append(cover / perimeter)
483
+ ######################################
484
+ ###################################### DEGREE SCORES
485
+ '''
486
+ deg0 vs deg2
487
+ deg1 vs deg3
488
+ '''
489
+ deg0, deg1, deg2, deg3 = degree
490
+ deg_ratio1 = deg0 / deg2
491
+ if deg_ratio1 > 1.0:
492
+ deg_ratio1 = 1 / deg_ratio1
493
+ deg_ratio2 = deg1 / deg3
494
+ if deg_ratio2 > 1.0:
495
+ deg_ratio2 = 1 / deg_ratio2
496
+ degree_scores.append((deg_ratio1 + deg_ratio2) / 2)
497
+ ######################################
498
+ ###################################### LENGTH SCORES
499
+ '''
500
+ len0 vs len2
501
+ len1 vs len3
502
+ '''
503
+ len0, len1, len2, len3 = square_length
504
+ len_ratio1 = len0 / len2 if len2 > len0 else len2 / len0
505
+ len_ratio2 = len1 / len3 if len3 > len1 else len3 / len1
506
+ length_scores.append((len_ratio1 + len_ratio2) / 2)
507
+
508
+ ######################################
509
+
510
+ overlap_scores = np.array(overlap_scores)
511
+ overlap_scores /= np.max(overlap_scores)
512
+
513
+ degree_scores = np.array(degree_scores)
514
+ # degree_scores /= np.max(degree_scores)
515
+
516
+ length_scores = np.array(length_scores)
517
+
518
+ ###################################### AREA SCORES
519
+ area_scores = np.reshape(squares, [-1, 4, 2])
520
+ area_x = area_scores[:, :, 0]
521
+ area_y = area_scores[:, :, 1]
522
+ correction = area_x[:, -1] * area_y[:, 0] - area_y[:, -1] * area_x[:, 0]
523
+ area_scores = np.sum(area_x[:, :-1] * area_y[:, 1:], axis=-1) - np.sum(area_y[:, :-1] * area_x[:, 1:], axis=-1)
524
+ area_scores = 0.5 * np.abs(area_scores + correction)
525
+ area_scores /= (map_size * map_size) # np.max(area_scores)
526
+ ######################################
527
+
528
+ ###################################### CENTER SCORES
529
+ centers = np.array([[256 // 2, 256 // 2]], dtype='float32') # [1, 2]
530
+ # squares: [n, 4, 2]
531
+ square_centers = np.mean(squares, axis=1) # [n, 2]
532
+ center2center = np.sqrt(np.sum((centers - square_centers) ** 2))
533
+ center_scores = center2center / (map_size / np.sqrt(2.0))
534
+
535
+ '''
536
+ score_w = [overlap, degree, area, center, length]
537
+ '''
538
+ score_w = [0.0, 1.0, 10.0, 0.5, 1.0]
539
+ score_array = params['w_overlap'] * overlap_scores \
540
+ + params['w_degree'] * degree_scores \
541
+ + params['w_area'] * area_scores \
542
+ - params['w_center'] * center_scores \
543
+ + params['w_length'] * length_scores
544
+
545
+ best_square = []
546
+
547
+ sorted_idx = np.argsort(score_array)[::-1]
548
+ score_array = score_array[sorted_idx]
549
+ squares = squares[sorted_idx]
550
+
551
+ except Exception as e:
552
+ pass
553
+
554
+ '''return list
555
+ merged_lines, squares, scores
556
+ '''
557
+
558
+ try:
559
+ new_segments[:, 0] = new_segments[:, 0] * 2 / input_shape[1] * original_shape[1]
560
+ new_segments[:, 1] = new_segments[:, 1] * 2 / input_shape[0] * original_shape[0]
561
+ new_segments[:, 2] = new_segments[:, 2] * 2 / input_shape[1] * original_shape[1]
562
+ new_segments[:, 3] = new_segments[:, 3] * 2 / input_shape[0] * original_shape[0]
563
+ except:
564
+ new_segments = []
565
+
566
+ try:
567
+ squares[:, :, 0] = squares[:, :, 0] * 2 / input_shape[1] * original_shape[1]
568
+ squares[:, :, 1] = squares[:, :, 1] * 2 / input_shape[0] * original_shape[0]
569
+ except:
570
+ squares = []
571
+ score_array = []
572
+
573
+ try:
574
+ inter_points = np.array(inter_points)
575
+ inter_points[:, 0] = inter_points[:, 0] * 2 / input_shape[1] * original_shape[1]
576
+ inter_points[:, 1] = inter_points[:, 1] * 2 / input_shape[0] * original_shape[0]
577
+ except:
578
+ inter_points = []
579
+
580
+ return new_segments, squares, score_array, inter_points
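A hedged sketch of calling pred_lines with the large head (the checkpoint and image paths are placeholders, and a CUDA device is required because pred_lines moves its input to the GPU):

    import cv2
    import torch
    from flux.annotator.mlsd.models.mbv2_mlsd_large import MobileV2_MLSD_Large
    from flux.annotator.mlsd.utils import pred_lines

    model = MobileV2_MLSD_Large()
    model.load_state_dict(torch.load("mlsd_large_512_fp32.pth"))   # placeholder checkpoint path
    model = model.cuda().eval()

    img = cv2.imread("example.jpg")                                # placeholder image, H x W x 3 uint8
    lines = pred_lines(img, model, [512, 512], score_thr=0.1, dist_thr=20.0)
    print(lines.shape)                                             # (n_lines, 4): x_start, y_start, x_end, y_end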
src/flux/annotator/tile/__init__.py ADDED
@@ -0,0 +1,26 @@
1
+ import random
2
+ import cv2
3
+ from .guided_filter import FastGuidedFilter
4
+
5
+
6
+ class TileDetector:
7
+ # https://huggingface.co/xinsir/controlnet-tile-sdxl-1.0
8
+ def __init__(self):
9
+ pass
10
+
11
+ def __call__(self, image):
12
+ blur_strength = random.sample([i / 10. for i in range(10, 201, 2)], k=1)[0]
13
+ radius = random.sample([i for i in range(1, 40, 2)], k=1)[0]
14
+ eps = random.sample([i / 1000. for i in range(1, 101, 2)], k=1)[0]
15
+ scale_factor = random.sample([i / 10. for i in range(10, 181, 5)], k=1)[0]
16
+
17
+ ksize = int(blur_strength)
18
+ if ksize % 2 == 0:
19
+ ksize += 1
20
+
21
+ if random.random() > 0.5:
22
+ image = cv2.GaussianBlur(image, (ksize, ksize), blur_strength / 2)
23
+ if random.random() > 0.5:
24
+ filter = FastGuidedFilter(image, radius, eps, scale_factor)
25
+ image = filter.filter(image)
26
+ return image
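TileDetector is stateless and randomized; an illustrative call (the image path is a placeholder) that degrades an image into a tile control image:

    import cv2
    from flux.annotator.tile import TileDetector

    detector = TileDetector()
    img = cv2.imread("example.jpg")          # placeholder path, uint8 H x W x 3
    control = detector(img)                  # randomly blurred and/or guided-filtered copy
    cv2.imwrite("tile_control.jpg", control)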
src/flux/annotator/tile/guided_filter.py ADDED
@@ -0,0 +1,280 @@
1
+ # -*- coding: utf-8 -*-
2
+ ## @package guided_filter.core.filters
3
+ #
4
+ # Implementation of guided filter.
5
+ # * GuidedFilter: Original guided filter.
6
+ # * FastGuidedFilter: Fast version of the guided filter.
7
+ # @author tody
8
+ # @date 2015/08/26
9
+
10
+ import numpy as np
11
+ import cv2
12
+
13
+ ## Convert image into float32 type.
14
+ def to32F(img):
15
+ if img.dtype == np.float32:
16
+ return img
17
+ return (1.0 / 255.0) * np.float32(img)
18
+
19
+ ## Convert image into uint8 type.
20
+ def to8U(img):
21
+ if img.dtype == np.uint8:
22
+ return img
23
+ return np.clip(np.uint8(255.0 * img), 0, 255)
24
+
25
+ ## Return if the input image is gray or not.
26
+ def _isGray(I):
27
+ return len(I.shape) == 2
28
+
29
+
30
+ ## Return down sampled image.
31
+ # @param scale (w/s, h/s) image will be created.
32
+ # @param shape I.shape[:2]=(h, w). numpy friendly size parameter.
33
+ def _downSample(I, scale=4, shape=None):
34
+ if shape is not None:
35
+ h, w = shape
36
+ return cv2.resize(I, (w, h), interpolation=cv2.INTER_NEAREST)
37
+
38
+ h, w = I.shape[:2]
39
+ return cv2.resize(I, (int(w / scale), int(h / scale)), interpolation=cv2.INTER_NEAREST)
40
+
41
+
42
+ ## Return up sampled image.
43
+ # @param scale (w*s, h*s) image will be created.
44
+ # @param shape I.shape[:2]=(h, w). numpy friendly size parameter.
45
+ def _upSample(I, scale=2, shape=None):
46
+ if shape is not None:
47
+ h, w = shape
48
+ return cv2.resize(I, (w, h), interpolation=cv2.INTER_LINEAR)
49
+
50
+ h, w = I.shape[:2]
51
+ return cv2.resize(I, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR)
52
+
53
+ ## Fast guided filter.
54
+ class FastGuidedFilter:
55
+ ## Constructor.
56
+ # @param I Input guidance image. Color or gray.
57
+ # @param radius Radius of Guided Filter.
58
+ # @param epsilon Regularization term of Guided Filter.
59
+ # @param scale Down sampled scale.
60
+ def __init__(self, I, radius=5, epsilon=0.4, scale=4):
61
+ I_32F = to32F(I)
62
+ self._I = I_32F
63
+ h, w = I.shape[:2]
64
+
65
+ I_sub = _downSample(I_32F, scale)
66
+
67
+ self._I_sub = I_sub
68
+ radius = int(radius / scale)
69
+
70
+ if _isGray(I):
71
+ self._guided_filter = GuidedFilterGray(I_sub, radius, epsilon)
72
+ else:
73
+ self._guided_filter = GuidedFilterColor(I_sub, radius, epsilon)
74
+
75
+ ## Apply filter for the input image.
76
+ # @param p Input image for the filtering.
77
+ def filter(self, p):
78
+ p_32F = to32F(p)
79
+ shape_original = p.shape[:2]
80
+
81
+ p_sub = _downSample(p_32F, shape=self._I_sub.shape[:2])
82
+
83
+ if _isGray(p_sub):
84
+ return self._filterGray(p_sub, shape_original)
85
+
86
+ cs = p.shape[2]
87
+ q = np.array(p_32F)
88
+
89
+ for ci in range(cs):
90
+ q[:, :, ci] = self._filterGray(p_sub[:, :, ci], shape_original)
91
+ return to8U(q)
92
+
93
+ def _filterGray(self, p_sub, shape_original):
94
+ ab_sub = self._guided_filter._computeCoefficients(p_sub)
95
+ ab = [_upSample(abi, shape=shape_original) for abi in ab_sub]
96
+ return self._guided_filter._computeOutput(ab, self._I)
97
+
98
+
99
+ ## Guided filter.
100
+ class GuidedFilter:
101
+ ## Constructor.
102
+ # @param I Input guidance image. Color or gray.
103
+ # @param radius Radius of Guided Filter.
104
+ # @param epsilon Regularization term of Guided Filter.
105
+ def __init__(self, I, radius=5, epsilon=0.4):
106
+ I_32F = to32F(I)
107
+
108
+ if _isGray(I):
109
+ self._guided_filter = GuidedFilterGray(I_32F, radius, epsilon)
110
+ else:
111
+ self._guided_filter = GuidedFilterColor(I_32F, radius, epsilon)
112
+
113
+ ## Apply filter for the input image.
114
+ # @param p Input image for the filtering.
115
+ def filter(self, p):
116
+ return to8U(self._guided_filter.filter(p))
117
+
118
+
119
+ ## Common parts of guided filter.
120
+ #
121
+ # This class is shared by GuidedFilterGray and GuidedFilterColor.
122
+ # Using their _computeCoefficients and _computeOutput methods,
123
+ # GuidedFilterCommon.filter computes the filtered image for both color and gray inputs.
124
+ class GuidedFilterCommon:
125
+ def __init__(self, guided_filter):
126
+ self._guided_filter = guided_filter
127
+
128
+ ## Apply filter for the input image.
129
+ # @param p Input image for the filtering.
130
+ def filter(self, p):
131
+ p_32F = to32F(p)
132
+ if _isGray(p_32F):
133
+ return self._filterGray(p_32F)
134
+
135
+ cs = p.shape[2]
136
+ q = np.array(p_32F)
137
+
138
+ for ci in range(cs):
139
+ q[:, :, ci] = self._filterGray(p_32F[:, :, ci])
140
+ return q
141
+
142
+ def _filterGray(self, p):
143
+ ab = self._guided_filter._computeCoefficients(p)
144
+ return self._guided_filter._computeOutput(ab, self._guided_filter._I)
145
+
146
+
147
+ ## Guided filter for gray guidance image.
148
+ class GuidedFilterGray:
149
+ # @param I Input gray guidance image.
150
+ # @param radius Radius of Guided Filter.
151
+ # @param epsilon Regularization term of Guided Filter.
152
+ def __init__(self, I, radius=5, epsilon=0.4):
153
+ self._radius = 2 * radius + 1
154
+ self._epsilon = epsilon
155
+ self._I = to32F(I)
156
+ self._initFilter()
157
+ self._filter_common = GuidedFilterCommon(self)
158
+
159
+ ## Apply filter for the input image.
160
+ # @param p Input image for the filtering.
161
+ def filter(self, p):
162
+ return self._filter_common.filter(p)
163
+
164
+ def _initFilter(self):
165
+ I = self._I
166
+ r = self._radius
167
+ self._I_mean = cv2.blur(I, (r, r))
168
+ I_mean_sq = cv2.blur(I ** 2, (r, r))
169
+ self._I_var = I_mean_sq - self._I_mean ** 2
170
+
171
+ def _computeCoefficients(self, p):
172
+ r = self._radius
173
+ p_mean = cv2.blur(p, (r, r))
174
+ p_cov = p_mean - self._I_mean * p_mean
175
+ a = p_cov / (self._I_var + self._epsilon)
176
+ b = p_mean - a * self._I_mean
177
+ a_mean = cv2.blur(a, (r, r))
178
+ b_mean = cv2.blur(b, (r, r))
179
+ return a_mean, b_mean
180
+
181
+ def _computeOutput(self, ab, I):
182
+ a_mean, b_mean = ab
183
+ return a_mean * I + b_mean
184
+
185
+
186
+ ## Guided filter for color guidance image.
187
+ class GuidedFilterColor:
188
+ # @param I Input color guidance image.
189
+ # @param radius Radius of Guided Filter.
190
+ # @param epsilon Regularization term of Guided Filter.
191
+ def __init__(self, I, radius=5, epsilon=0.2):
192
+ self._radius = 2 * radius + 1
193
+ self._epsilon = epsilon
194
+ self._I = to32F(I)
195
+ self._initFilter()
196
+ self._filter_common = GuidedFilterCommon(self)
197
+
198
+ ## Apply filter for the input image.
199
+ # @param p Input image for the filtering.
200
+ def filter(self, p):
201
+ return self._filter_common.filter(p)
202
+
203
+ def _initFilter(self):
204
+ I = self._I
205
+ r = self._radius
206
+ eps = self._epsilon
207
+
208
+ Ir, Ig, Ib = I[:, :, 0], I[:, :, 1], I[:, :, 2]
209
+
210
+ self._Ir_mean = cv2.blur(Ir, (r, r))
211
+ self._Ig_mean = cv2.blur(Ig, (r, r))
212
+ self._Ib_mean = cv2.blur(Ib, (r, r))
213
+
214
+ Irr_var = cv2.blur(Ir ** 2, (r, r)) - self._Ir_mean ** 2 + eps
215
+ Irg_var = cv2.blur(Ir * Ig, (r, r)) - self._Ir_mean * self._Ig_mean
216
+ Irb_var = cv2.blur(Ir * Ib, (r, r)) - self._Ir_mean * self._Ib_mean
217
+ Igg_var = cv2.blur(Ig * Ig, (r, r)) - self._Ig_mean * self._Ig_mean + eps
218
+ Igb_var = cv2.blur(Ig * Ib, (r, r)) - self._Ig_mean * self._Ib_mean
219
+ Ibb_var = cv2.blur(Ib * Ib, (r, r)) - self._Ib_mean * self._Ib_mean + eps
220
+
221
+ Irr_inv = Igg_var * Ibb_var - Igb_var * Igb_var
222
+ Irg_inv = Igb_var * Irb_var - Irg_var * Ibb_var
223
+ Irb_inv = Irg_var * Igb_var - Igg_var * Irb_var
224
+ Igg_inv = Irr_var * Ibb_var - Irb_var * Irb_var
225
+ Igb_inv = Irb_var * Irg_var - Irr_var * Igb_var
226
+ Ibb_inv = Irr_var * Igg_var - Irg_var * Irg_var
227
+
228
+ I_cov = Irr_inv * Irr_var + Irg_inv * Irg_var + Irb_inv * Irb_var
229
+ Irr_inv /= I_cov
230
+ Irg_inv /= I_cov
231
+ Irb_inv /= I_cov
232
+ Igg_inv /= I_cov
233
+ Igb_inv /= I_cov
234
+ Ibb_inv /= I_cov
235
+
236
+ self._Irr_inv = Irr_inv
237
+ self._Irg_inv = Irg_inv
238
+ self._Irb_inv = Irb_inv
239
+ self._Igg_inv = Igg_inv
240
+ self._Igb_inv = Igb_inv
241
+ self._Ibb_inv = Ibb_inv
242
+
243
+ def _computeCoefficients(self, p):
244
+ r = self._radius
245
+ I = self._I
246
+ Ir, Ig, Ib = I[:, :, 0], I[:, :, 1], I[:, :, 2]
247
+
248
+ p_mean = cv2.blur(p, (r, r))
249
+
250
+ Ipr_mean = cv2.blur(Ir * p, (r, r))
251
+ Ipg_mean = cv2.blur(Ig * p, (r, r))
252
+ Ipb_mean = cv2.blur(Ib * p, (r, r))
253
+
254
+ Ipr_cov = Ipr_mean - self._Ir_mean * p_mean
255
+ Ipg_cov = Ipg_mean - self._Ig_mean * p_mean
256
+ Ipb_cov = Ipb_mean - self._Ib_mean * p_mean
257
+
258
+ ar = self._Irr_inv * Ipr_cov + self._Irg_inv * Ipg_cov + self._Irb_inv * Ipb_cov
259
+ ag = self._Irg_inv * Ipr_cov + self._Igg_inv * Ipg_cov + self._Igb_inv * Ipb_cov
260
+ ab = self._Irb_inv * Ipr_cov + self._Igb_inv * Ipg_cov + self._Ibb_inv * Ipb_cov
261
+ b = p_mean - ar * self._Ir_mean - ag * self._Ig_mean - ab * self._Ib_mean
262
+
263
+ ar_mean = cv2.blur(ar, (r, r))
264
+ ag_mean = cv2.blur(ag, (r, r))
265
+ ab_mean = cv2.blur(ab, (r, r))
266
+ b_mean = cv2.blur(b, (r, r))
267
+
268
+ return ar_mean, ag_mean, ab_mean, b_mean
269
+
270
+ def _computeOutput(self, ab, I):
271
+ ar_mean, ag_mean, ab_mean, b_mean = ab
272
+
273
+ Ir, Ig, Ib = I[:, :, 0], I[:, :, 1], I[:, :, 2]
274
+
275
+ q = (ar_mean * Ir +
276
+ ag_mean * Ig +
277
+ ab_mean * Ib +
278
+ b_mean)
279
+
280
+ return q
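The fast guided filter can also be used on its own; a sketch with illustrative radius/epsilon values (not taken from the commit), here self-guided so the image is smoothed while its edges are preserved:

    import cv2
    from flux.annotator.tile.guided_filter import FastGuidedFilter

    I = cv2.imread("example.jpg")                              # guidance image, placeholder path
    gf = FastGuidedFilter(I, radius=8, epsilon=0.02, scale=4)
    q = gf.filter(I)                                           # uint8 edge-preserving smoothed result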
src/flux/annotator/util.py ADDED
@@ -0,0 +1,38 @@
1
+ import numpy as np
2
+ import cv2
3
+ import os
4
+
5
+
6
+ annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts')
7
+
8
+
9
+ def HWC3(x):
10
+ assert x.dtype == np.uint8
11
+ if x.ndim == 2:
12
+ x = x[:, :, None]
13
+ assert x.ndim == 3
14
+ H, W, C = x.shape
15
+ assert C == 1 or C == 3 or C == 4
16
+ if C == 3:
17
+ return x
18
+ if C == 1:
19
+ return np.concatenate([x, x, x], axis=2)
20
+ if C == 4:
21
+ color = x[:, :, 0:3].astype(np.float32)
22
+ alpha = x[:, :, 3:4].astype(np.float32) / 255.0
23
+ y = color * alpha + 255.0 * (1.0 - alpha)
24
+ y = y.clip(0, 255).astype(np.uint8)
25
+ return y
26
+
27
+
28
+ def resize_image(input_image, resolution):
29
+ H, W, C = input_image.shape
30
+ H = float(H)
31
+ W = float(W)
32
+ k = float(resolution) / min(H, W)
33
+ H *= k
34
+ W *= k
35
+ H = int(np.round(H / 64.0)) * 64
36
+ W = int(np.round(W / 64.0)) * 64
37
+ img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
38
+ return img
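These two helpers are the common pre-processing step for the annotators; a small sketch (the shapes in the comments follow from the code above):

    import numpy as np
    from flux.annotator.util import HWC3, resize_image

    gray = np.zeros((480, 640), dtype=np.uint8)   # single-channel uint8 input
    rgb = HWC3(gray)                              # (480, 640, 3)
    resized = resize_image(rgb, resolution=512)   # short side scaled to ~512, both sides rounded to multiples of 64
    print(resized.shape)                          # (512, 704, 3)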
src/flux/api.py ADDED
@@ -0,0 +1,194 @@
1
+ import io
2
+ import os
3
+ import time
4
+ from pathlib import Path
5
+
6
+ import requests
7
+ from PIL import Image
8
+
9
+ API_ENDPOINT = "https://api.bfl.ml"
10
+
11
+
12
+ class ApiException(Exception):
13
+ def __init__(self, status_code: int, detail: str | list[dict] | None = None):
14
+ super().__init__()
15
+ self.detail = detail
16
+ self.status_code = status_code
17
+
18
+ def __str__(self) -> str:
19
+ return self.__repr__()
20
+
21
+ def __repr__(self) -> str:
22
+ if self.detail is None:
23
+ message = None
24
+ elif isinstance(self.detail, str):
25
+ message = self.detail
26
+ else:
27
+ message = "[" + ",".join(d["msg"] for d in self.detail) + "]"
28
+ return f"ApiException({self.status_code=}, {message=}, detail={self.detail})"
29
+
30
+
31
+ class ImageRequest:
32
+ def __init__(
33
+ self,
34
+ prompt: str,
35
+ width: int = 1024,
36
+ height: int = 1024,
37
+ name: str = "flux.1-pro",
38
+ num_steps: int = 50,
39
+ prompt_upsampling: bool = False,
40
+ seed: int | None = None,
41
+ validate: bool = True,
42
+ launch: bool = True,
43
+ api_key: str | None = None,
44
+ ):
45
+ """
46
+ Manages an image generation request to the API.
47
+
48
+ Args:
49
+ prompt: Prompt to sample
50
+ width: Width of the image in pixel
51
+ height: Height of the image in pixel
52
+ name: Name of the model
53
+ num_steps: Number of network evaluations
54
+ prompt_upsampling: Use prompt upsampling
55
+ seed: Fix the generation seed
56
+ validate: Run input validation
57
+ launch: Directly launches request
58
+ api_key: Your API key if not provided by the environment
59
+
60
+ Raises:
61
+ ValueError: For invalid input
62
+ ApiException: For errors raised from the API
63
+ """
64
+ if validate:
65
+ if name not in ["flux.1-pro"]:
66
+ raise ValueError(f"Invalid model {name}")
67
+ elif width % 32 != 0:
68
+ raise ValueError(f"width must be divisible by 32, got {width}")
69
+ elif not (256 <= width <= 1440):
70
+ raise ValueError(f"width must be between 256 and 1440, got {width}")
71
+ elif height % 32 != 0:
72
+ raise ValueError(f"height must be divisible by 32, got {height}")
73
+ elif not (256 <= height <= 1440):
74
+ raise ValueError(f"height must be between 256 and 1440, got {height}")
75
+ elif not (1 <= num_steps <= 50):
76
+ raise ValueError(f"steps must be between 1 and 50, got {num_steps}")
77
+
78
+ self.request_json = {
79
+ "prompt": prompt,
80
+ "width": width,
81
+ "height": height,
82
+ "variant": name,
83
+ "steps": num_steps,
84
+ "prompt_upsampling": prompt_upsampling,
85
+ }
86
+ if seed is not None:
87
+ self.request_json["seed"] = seed
88
+
89
+ self.request_id: str | None = None
90
+ self.result: dict | None = None
91
+ self._image_bytes: bytes | None = None
92
+ self._url: str | None = None
93
+ if api_key is None:
94
+ self.api_key = os.environ.get("BFL_API_KEY")
95
+ else:
96
+ self.api_key = api_key
97
+
98
+ if launch:
99
+ self.request()
100
+
101
+ def request(self):
102
+ """
103
+ Request to generate the image.
104
+ """
105
+ if self.request_id is not None:
106
+ return
107
+ response = requests.post(
108
+ f"{API_ENDPOINT}/v1/image",
109
+ headers={
110
+ "accept": "application/json",
111
+ "x-key": self.api_key,
112
+ "Content-Type": "application/json",
113
+ },
114
+ json=self.request_json,
115
+ )
116
+ result = response.json()
117
+ if response.status_code != 200:
118
+ raise ApiException(status_code=response.status_code, detail=result.get("detail"))
119
+ self.request_id = result["id"]
120
+
121
+ def retrieve(self) -> dict:
122
+ """
123
+ Wait for the generation to finish and retrieve response.
124
+ """
125
+ if self.request_id is None:
126
+ self.request()
127
+ while self.result is None:
128
+ response = requests.get(
129
+ f"{API_ENDPOINT}/v1/get_result",
130
+ headers={
131
+ "accept": "application/json",
132
+ "x-key": self.api_key,
133
+ },
134
+ params={
135
+ "id": self.request_id,
136
+ },
137
+ )
138
+ result = response.json()
139
+ if "status" not in result:
140
+ raise ApiException(status_code=response.status_code, detail=result.get("detail"))
141
+ elif result["status"] == "Ready":
142
+ self.result = result["result"]
143
+ elif result["status"] == "Pending":
144
+ time.sleep(0.5)
145
+ else:
146
+ raise ApiException(status_code=200, detail=f"API returned status '{result['status']}'")
147
+ return self.result
148
+
149
+ @property
150
+ def bytes(self) -> bytes:
151
+ """
152
+ Generated image as bytes.
153
+ """
154
+ if self._image_bytes is None:
155
+ response = requests.get(self.url)
156
+ if response.status_code == 200:
157
+ self._image_bytes = response.content
158
+ else:
159
+ raise ApiException(status_code=response.status_code)
160
+ return self._image_bytes
161
+
162
+ @property
163
+ def url(self) -> str:
164
+ """
165
+ Public url to retrieve the image from
166
+ """
167
+ if self._url is None:
168
+ result = self.retrieve()
169
+ self._url = result["sample"]
170
+ return self._url
171
+
172
+ @property
173
+ def image(self) -> Image.Image:
174
+ """
175
+ Load the image as a PIL Image
176
+ """
177
+ return Image.open(io.BytesIO(self.bytes))
178
+
179
+ def save(self, path: str):
180
+ """
181
+ Save the generated image to a local path
182
+ """
183
+ suffix = Path(self.url).suffix
184
+ if not path.endswith(suffix):
185
+ path = path + suffix
186
+ Path(path).resolve().parent.mkdir(parents=True, exist_ok=True)
187
+ with open(path, "wb") as file:
188
+ file.write(self.bytes)
189
+
190
+
191
+ if __name__ == "__main__":
192
+ from fire import Fire
193
+
194
+ Fire(ImageRequest)
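A hedged sketch of driving ImageRequest from Python rather than through Fire (it needs a valid BFL_API_KEY in the environment and network access; the prompt and output name are placeholders):

    from flux.api import ImageRequest

    request = ImageRequest(
        prompt="a forest in the mist",
        width=1024,
        height=768,
        num_steps=50,
        launch=False,          # defer the HTTP call
    )
    request.request()          # submit the generation job
    request.save("forest")     # polls until the result is ready, then writes forest.<suffix>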
src/flux/cli.py ADDED
@@ -0,0 +1,254 @@
1
+ import os
2
+ import re
3
+ import time
4
+ from dataclasses import dataclass
5
+ from glob import iglob
6
+
7
+ import torch
8
+ from einops import rearrange
9
+ from fire import Fire
10
+ from PIL import ExifTags, Image
11
+
12
+ from flux.sampling import denoise, get_noise, get_schedule, prepare, unpack
13
+ from flux.util import (configs, embed_watermark, load_ae, load_clip,
14
+ load_flow_model, load_t5)
15
+ from transformers import pipeline
16
+
17
+ NSFW_THRESHOLD = 0.85
18
+
19
+ @dataclass
20
+ class SamplingOptions:
21
+ prompt: str
22
+ width: int
23
+ height: int
24
+ num_steps: int
25
+ guidance: float
26
+ seed: int | None
27
+
28
+
29
+ def parse_prompt(options: SamplingOptions) -> SamplingOptions | None:
30
+ user_question = "Next prompt (write /h for help, /q to quit and leave empty to repeat):\n"
31
+ usage = (
32
+ "Usage: Either write your prompt directly, leave this field empty "
33
+ "to repeat the prompt or write a command starting with a slash:\n"
34
+ "- '/w <width>' will set the width of the generated image\n"
35
+ "- '/h <height>' will set the height of the generated image\n"
36
+ "- '/s <seed>' sets the next seed\n"
37
+ "- '/g <guidance>' sets the guidance (flux-dev only)\n"
38
+ "- '/n <steps>' sets the number of steps\n"
39
+ "- '/q' to quit"
40
+ )
41
+
42
+ while (prompt := input(user_question)).startswith("/"):
43
+ if prompt.startswith("/w"):
44
+ if prompt.count(" ") != 1:
45
+ print(f"Got invalid command '{prompt}'\n{usage}")
46
+ continue
47
+ _, width = prompt.split()
48
+ options.width = 16 * (int(width) // 16)
49
+ print(
50
+ f"Setting resolution to {options.width} x {options.height} "
51
+ f"({options.height *options.width/1e6:.2f}MP)"
52
+ )
53
+ elif prompt.startswith("/h"):
54
+ if prompt.count(" ") != 1:
55
+ print(f"Got invalid command '{prompt}'\n{usage}")
56
+ continue
57
+ _, height = prompt.split()
58
+ options.height = 16 * (int(height) // 16)
59
+ print(
60
+ f"Setting resolution to {options.width} x {options.height} "
61
+ f"({options.height *options.width/1e6:.2f}MP)"
62
+ )
63
+ elif prompt.startswith("/g"):
64
+ if prompt.count(" ") != 1:
65
+ print(f"Got invalid command '{prompt}'\n{usage}")
66
+ continue
67
+ _, guidance = prompt.split()
68
+ options.guidance = float(guidance)
69
+ print(f"Setting guidance to {options.guidance}")
70
+ elif prompt.startswith("/s"):
71
+ if prompt.count(" ") != 1:
72
+ print(f"Got invalid command '{prompt}'\n{usage}")
73
+ continue
74
+ _, seed = prompt.split()
75
+ options.seed = int(seed)
76
+ print(f"Setting seed to {options.seed}")
77
+ elif prompt.startswith("/n"):
78
+ if prompt.count(" ") != 1:
79
+ print(f"Got invalid command '{prompt}'\n{usage}")
80
+ continue
81
+ _, steps = prompt.split()
82
+ options.num_steps = int(steps)
83
+ print(f"Setting seed to {options.num_steps}")
84
+ elif prompt.startswith("/q"):
85
+ print("Quitting")
86
+ return None
87
+ else:
88
+ if not prompt.startswith("/h"):
89
+ print(f"Got invalid command '{prompt}'\n{usage}")
90
+ print(usage)
91
+ if prompt != "":
92
+ options.prompt = prompt
93
+ return options
94
+
95
+
96
+ @torch.inference_mode()
97
+ def main(
98
+ name: str = "flux-schnell",
99
+ width: int = 1360,
100
+ height: int = 768,
101
+ seed: int | None = None,
102
+ prompt: str = (
103
+ "a photo of a forest with mist swirling around the tree trunks. The word "
104
+ '"FLUX" is painted over it in big, red brush strokes with visible texture'
105
+ ),
106
+ device: str = "cuda" if torch.cuda.is_available() else "cpu",
107
+ num_steps: int | None = None,
108
+ loop: bool = False,
109
+ guidance: float = 3.5,
110
+ offload: bool = False,
111
+ output_dir: str = "output",
112
+ add_sampling_metadata: bool = True,
113
+ ):
114
+ """
115
+ Sample the flux model. Either interactively (set `--loop`) or run for a
116
+ single image.
117
+
118
+ Args:
119
+ name: Name of the model to load
120
+ height: height of the sample in pixels (should be a multiple of 16)
121
+ width: width of the sample in pixels (should be a multiple of 16)
122
+ seed: Set a seed for sampling
123
+ output_dir: directory where the output images are saved as img_{idx}.jpg, with {idx}
124
+ replaced by the index of the sample
125
+ prompt: Prompt used for sampling
126
+ device: Pytorch device
127
+ num_steps: number of sampling steps (default 4 for schnell, 50 for guidance distilled)
128
+ loop: start an interactive session and sample multiple times
129
+ guidance: guidance value used for guidance distillation
130
+ add_sampling_metadata: Add the prompt to the image Exif metadata
131
+ """
132
+ nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection")
133
+
134
+ if name not in configs:
135
+ available = ", ".join(configs.keys())
136
+ raise ValueError(f"Got unknown model name: {name}, chose from {available}")
137
+
138
+ torch_device = torch.device(device)
139
+ if num_steps is None:
140
+ num_steps = 4 if name == "flux-schnell" else 50
141
+
142
+ # allow for packing and conversion to latent space
143
+ height = 16 * (height // 16)
144
+ width = 16 * (width // 16)
145
+
146
+ output_name = os.path.join(output_dir, "img_{idx}.jpg")
147
+ if not os.path.exists(output_dir):
148
+ os.makedirs(output_dir)
149
+ idx = 0
150
+ else:
151
+ fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)]
152
+ if len(fns) > 0:
153
+ idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1
154
+ else:
155
+ idx = 0
156
+
157
+ # init all components
158
+ t5 = load_t5(torch_device, max_length=256 if name == "flux-schnell" else 512)
159
+ clip = load_clip(torch_device)
160
+ model = load_flow_model(name, device="cpu" if offload else torch_device)
161
+ ae = load_ae(name, device="cpu" if offload else torch_device)
162
+
163
+ rng = torch.Generator(device="cpu")
164
+ opts = SamplingOptions(
165
+ prompt=prompt,
166
+ width=width,
167
+ height=height,
168
+ num_steps=num_steps,
169
+ guidance=guidance,
170
+ seed=seed,
171
+ )
172
+
173
+ if loop:
174
+ opts = parse_prompt(opts)
175
+
176
+ while opts is not None:
177
+ if opts.seed is None:
178
+ opts.seed = rng.seed()
179
+ print(f"Generating with seed {opts.seed}:\n{opts.prompt}")
180
+ t0 = time.perf_counter()
181
+
182
+ # prepare input
183
+ x = get_noise(
184
+ 1,
185
+ opts.height,
186
+ opts.width,
187
+ device=torch_device,
188
+ dtype=torch.bfloat16,
189
+ seed=opts.seed,
190
+ )
191
+ opts.seed = None
192
+ if offload:
193
+ ae = ae.cpu()
194
+ torch.cuda.empty_cache()
195
+ t5, clip = t5.to(torch_device), clip.to(torch_device)
196
+ inp = prepare(t5, clip, x, prompt=opts.prompt)
197
+ timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell"))
198
+
199
+ # offload TEs to CPU, load model to gpu
200
+ if offload:
201
+ t5, clip = t5.cpu(), clip.cpu()
202
+ torch.cuda.empty_cache()
203
+ model = model.to(torch_device)
204
+
205
+ # denoise initial noise
206
+ x = denoise(model, **inp, timesteps=timesteps, guidance=opts.guidance)
207
+
208
+ # offload model, load autoencoder to gpu
209
+ if offload:
210
+ model.cpu()
211
+ torch.cuda.empty_cache()
212
+ ae.decoder.to(x.device)
213
+
214
+ # decode latents to pixel space
215
+ x = unpack(x.float(), opts.height, opts.width)
216
+ with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
217
+ x = ae.decode(x)
218
+ t1 = time.perf_counter()
219
+
220
+ fn = output_name.format(idx=idx)
221
+ print(f"Done in {t1 - t0:.1f}s. Saving {fn}")
222
+ # bring into PIL format and save
223
+ x = x.clamp(-1, 1)
224
+ x = embed_watermark(x.float())
225
+ x = rearrange(x[0], "c h w -> h w c")
226
+
227
+ img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy())
228
+ nsfw_score = [x["score"] for x in nsfw_classifier(img) if x["label"] == "nsfw"][0]
229
+
230
+ if nsfw_score < NSFW_THRESHOLD:
231
+ exif_data = Image.Exif()
232
+ exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux"
233
+ exif_data[ExifTags.Base.Make] = "Black Forest Labs"
234
+ exif_data[ExifTags.Base.Model] = name
235
+ if add_sampling_metadata:
236
+ exif_data[ExifTags.Base.ImageDescription] = prompt
237
+ img.save(fn, exif=exif_data, quality=95, subsampling=0)
238
+ idx += 1
239
+ else:
240
+ print("Your generated image may contain NSFW content.")
241
+
242
+ if loop:
243
+ print("-" * 80)
244
+ opts = parse_prompt(opts)
245
+ else:
246
+ opts = None
247
+
248
+
249
+ def app():
250
+ Fire(main)
251
+
252
+
253
+ if __name__ == "__main__":
254
+ app()
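
The sampler above can also be driven programmatically instead of through the Fire CLI. The sketch below is a minimal, hedged example: the `src.flux.cli` import path, local availability of the flux-schnell weights, and the chosen argument values are assumptions, not guaranteed by this diff.

    # Minimal sketch: calling the CLI entry point from Python instead of the shell.
    from src.flux.cli import main  # import path is an assumption

    main(
        name="flux-schnell",
        prompt="a misty forest at dawn",
        width=1024,
        height=768,
        num_steps=4,        # schnell default; guidance-distilled models use 50
        loop=False,         # True starts the interactive /w /h /g /s /n /q session
        offload=True,       # keep unused components on CPU to save VRAM
        output_dir="output",
    )
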
src/flux/controlnet.py ADDED
@@ -0,0 +1,222 @@
1
+ from dataclasses import dataclass
2
+
3
+ import torch
4
+ from torch import Tensor, nn
5
+ from einops import rearrange
6
+
7
+ from .modules.layers import (DoubleStreamBlock, EmbedND, LastLayer,
8
+ MLPEmbedder, SingleStreamBlock,
9
+ timestep_embedding)
10
+
11
+
12
+ @dataclass
13
+ class FluxParams:
14
+ in_channels: int
15
+ vec_in_dim: int
16
+ context_in_dim: int
17
+ hidden_size: int
18
+ mlp_ratio: float
19
+ num_heads: int
20
+ depth: int
21
+ depth_single_blocks: int
22
+ axes_dim: list[int]
23
+ theta: int
24
+ qkv_bias: bool
25
+ guidance_embed: bool
26
+
27
+ def zero_module(module):
28
+ for p in module.parameters():
29
+ nn.init.zeros_(p)
30
+ return module
31
+
32
+
33
+ class ControlNetFlux(nn.Module):
34
+ """
35
+ ControlNet branch of the Flux flow-matching transformer: a shallow stack of double-stream blocks whose zero-initialised outputs are added back to the base model as per-block residuals.
36
+ """
37
+ _supports_gradient_checkpointing = True
38
+
39
+ def __init__(self, params: FluxParams, controlnet_depth=2):
40
+ super().__init__()
41
+
42
+ self.params = params
43
+ self.in_channels = params.in_channels
44
+ self.out_channels = self.in_channels
45
+ if params.hidden_size % params.num_heads != 0:
46
+ raise ValueError(
47
+ f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
48
+ )
49
+ pe_dim = params.hidden_size // params.num_heads
50
+ if sum(params.axes_dim) != pe_dim:
51
+ raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
52
+ self.hidden_size = params.hidden_size
53
+ self.num_heads = params.num_heads
54
+ self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
55
+ self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
56
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
57
+ self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
58
+ self.guidance_in = (
59
+ MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity()
60
+ )
61
+ self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
62
+
63
+ self.double_blocks = nn.ModuleList(
64
+ [
65
+ DoubleStreamBlock(
66
+ self.hidden_size,
67
+ self.num_heads,
68
+ mlp_ratio=params.mlp_ratio,
69
+ qkv_bias=params.qkv_bias,
70
+ )
71
+ for _ in range(controlnet_depth)
72
+ ]
73
+ )
74
+
75
+ # add ControlNet blocks
76
+ self.controlnet_blocks = nn.ModuleList([])
77
+ for _ in range(controlnet_depth):
78
+ controlnet_block = nn.Linear(self.hidden_size, self.hidden_size)
79
+ controlnet_block = zero_module(controlnet_block)
80
+ self.controlnet_blocks.append(controlnet_block)
81
+ self.pos_embed_input = nn.Linear(self.in_channels, self.hidden_size, bias=True)
82
+ self.gradient_checkpointing = False
83
+ self.input_hint_block = nn.Sequential(
84
+ nn.Conv2d(3, 16, 3, padding=1),
85
+ nn.SiLU(),
86
+ nn.Conv2d(16, 16, 3, padding=1),
87
+ nn.SiLU(),
88
+ nn.Conv2d(16, 16, 3, padding=1, stride=2),
89
+ nn.SiLU(),
90
+ nn.Conv2d(16, 16, 3, padding=1),
91
+ nn.SiLU(),
92
+ nn.Conv2d(16, 16, 3, padding=1, stride=2),
93
+ nn.SiLU(),
94
+ nn.Conv2d(16, 16, 3, padding=1),
95
+ nn.SiLU(),
96
+ nn.Conv2d(16, 16, 3, padding=1, stride=2),
97
+ nn.SiLU(),
98
+ zero_module(nn.Conv2d(16, 16, 3, padding=1))
99
+ )
100
+
101
+ def _set_gradient_checkpointing(self, module, value=False):
102
+ if hasattr(module, "gradient_checkpointing"):
103
+ module.gradient_checkpointing = value
104
+
105
+
106
+ @property
107
+ def attn_processors(self):
108
+ # set recursively
109
+ processors = {}
110
+
111
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors):
112
+ if hasattr(module, "set_processor"):
113
+ processors[f"{name}.processor"] = module.processor
114
+
115
+ for sub_name, child in module.named_children():
116
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
117
+
118
+ return processors
119
+
120
+ for name, module in self.named_children():
121
+ fn_recursive_add_processors(name, module, processors)
122
+
123
+ return processors
124
+
125
+ def set_attn_processor(self, processor):
126
+ r"""
127
+ Sets the attention processor to use to compute attention.
128
+
129
+ Parameters:
130
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
131
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
132
+ for **all** `Attention` layers.
133
+
134
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
135
+ processor. This is strongly recommended when setting trainable attention processors.
136
+
137
+ """
138
+ count = len(self.attn_processors.keys())
139
+
140
+ if isinstance(processor, dict) and len(processor) != count:
141
+ raise ValueError(
142
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
143
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
144
+ )
145
+
146
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
147
+ if hasattr(module, "set_processor"):
148
+ if not isinstance(processor, dict):
149
+ module.set_processor(processor)
150
+ else:
151
+ module.set_processor(processor.pop(f"{name}.processor"))
152
+
153
+ for sub_name, child in module.named_children():
154
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
155
+
156
+ for name, module in self.named_children():
157
+ fn_recursive_attn_processor(name, module, processor)
158
+
159
+ def forward(
160
+ self,
161
+ img: Tensor,
162
+ img_ids: Tensor,
163
+ controlnet_cond: Tensor,
164
+ txt: Tensor,
165
+ txt_ids: Tensor,
166
+ timesteps: Tensor,
167
+ y: Tensor,
168
+ guidance: Tensor | None = None,
169
+ ) -> Tensor:
170
+ if img.ndim != 3 or txt.ndim != 3:
171
+ raise ValueError("Input img and txt tensors must have 3 dimensions.")
172
+
173
+ # running on sequences img
174
+ img = self.img_in(img)
175
+ controlnet_cond = self.input_hint_block(controlnet_cond)
176
+ controlnet_cond = rearrange(controlnet_cond, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
177
+ controlnet_cond = self.pos_embed_input(controlnet_cond)
178
+ img = img + controlnet_cond
179
+ vec = self.time_in(timestep_embedding(timesteps, 256))
180
+ if self.params.guidance_embed:
181
+ if guidance is None:
182
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
183
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
184
+ vec = vec + self.vector_in(y)
185
+ txt = self.txt_in(txt)
186
+
187
+ ids = torch.cat((txt_ids, img_ids), dim=1)
188
+ pe = self.pe_embedder(ids)
189
+
190
+ block_res_samples = ()
191
+
192
+ for block in self.double_blocks:
193
+ if self.training and self.gradient_checkpointing:
194
+
195
+ def create_custom_forward(module, return_dict=None):
196
+ def custom_forward(*inputs):
197
+ if return_dict is not None:
198
+ return module(*inputs, return_dict=return_dict)
199
+ else:
200
+ return module(*inputs)
201
+
202
+ return custom_forward
203
+
204
+ ckpt_kwargs = {"use_reentrant": False}  # torch >= 1.11 assumed (SDPA in math.py already requires torch >= 2.0)
205
+ img, txt = torch.utils.checkpoint.checkpoint(
206
+ create_custom_forward(block),
207
+ img,
208
+ txt,
209
+ vec,
210
+ pe,
211
+ **ckpt_kwargs,
+ )
212
+ else:
213
+ img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
214
+
215
+ block_res_samples = block_res_samples + (img,)
216
+
217
+ controlnet_block_res_samples = ()
218
+ for block_res_sample, controlnet_block in zip(block_res_samples, self.controlnet_blocks):
219
+ block_res_sample = controlnet_block(block_res_sample)
220
+ controlnet_block_res_samples = controlnet_block_res_samples + (block_res_sample,)
221
+
222
+ return controlnet_block_res_samples
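
A shape-level sketch of the module above, using a deliberately tiny, made-up `FluxParams` so it runs on CPU (the released model dimensions are far larger, and the import path is an assumption). It illustrates that a 64x64 conditioning image is reduced by the hint block and the 2x2 packing to 16 image tokens, and that one zero-initialised residual per double block comes back.

    import torch
    from src.flux.controlnet import ControlNetFlux, FluxParams  # import path is an assumption

    # Tiny, made-up configuration purely to exercise shapes on CPU.
    params = FluxParams(
        in_channels=64, vec_in_dim=16, context_in_dim=32, hidden_size=64,
        mlp_ratio=4.0, num_heads=4, depth=2, depth_single_blocks=2,
        axes_dim=[4, 6, 6], theta=10_000, qkv_bias=True, guidance_embed=False,
    )
    cnet = ControlNetFlux(params, controlnet_depth=2).eval()

    B = 1
    hint = torch.randn(B, 3, 64, 64)   # conditioning image, e.g. a Canny edge map
    # input_hint_block downsamples by 8 and the 2x2 packing halves it again,
    # so a 64x64 hint becomes (64 / 16) ** 2 = 16 image tokens of width 64.
    with torch.no_grad():
        residuals = cnet(
            img=torch.randn(B, 16, 64), img_ids=torch.zeros(B, 16, 3),
            controlnet_cond=hint,
            txt=torch.randn(B, 8, 32), txt_ids=torch.zeros(B, 8, 3),
            timesteps=torch.tensor([0.5]), y=torch.randn(B, 16),
        )
    print(len(residuals), residuals[0].shape)  # 2 torch.Size([1, 16, 64])
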
src/flux/math.py ADDED
@@ -0,0 +1,30 @@
1
+ import torch
2
+ from einops import rearrange
3
+ from torch import Tensor
4
+
5
+
6
+ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
7
+ q, k = apply_rope(q, k, pe)
8
+
9
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
10
+ x = rearrange(x, "B H L D -> B L (H D)")
11
+
12
+ return x
13
+
14
+
15
+ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
16
+ assert dim % 2 == 0
17
+ scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
18
+ omega = 1.0 / (theta**scale)
19
+ out = torch.einsum("...n,d->...nd", pos, omega)
20
+ out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
21
+ out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
22
+ return out.float()
23
+
24
+
25
+ def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
26
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
27
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
28
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
29
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
30
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
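
The two helpers above implement rotary position embeddings in 2x2-matrix form. The following self-contained sketch (made-up sizes, import path assumed) shows the expected tensor shapes.

    import torch
    from src.flux.math import attention, apply_rope, rope  # import path is an assumption

    B, H, L, D = 1, 2, 8, 16                             # batch, heads, sequence, head dim
    pos = torch.arange(L, dtype=torch.float32)[None, :]  # (B, L) positions
    pe = rope(pos, dim=D, theta=10_000).unsqueeze(1)     # (B, 1, L, D/2, 2, 2), broadcast over heads

    q = torch.randn(B, H, L, D)
    k = torch.randn(B, H, L, D)
    v = torch.randn(B, H, L, D)

    q_rot, k_rot = apply_rope(q, k, pe)   # same shapes as q / k, positions encoded
    out = attention(q, k, v, pe=pe)       # rope + SDPA + head merge -> (B, L, H*D)
    print(q_rot.shape, out.shape)         # (1, 2, 8, 16) (1, 8, 32)
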
src/flux/model.py ADDED
@@ -0,0 +1,228 @@
1
+ from dataclasses import dataclass
2
+
3
+ import torch
4
+ from torch import Tensor, nn
5
+ from einops import rearrange
6
+
7
+ from .modules.layers import (DoubleStreamBlock, EmbedND, LastLayer,
8
+ MLPEmbedder, SingleStreamBlock,
9
+ timestep_embedding)
10
+
11
+
12
+ @dataclass
13
+ class FluxParams:
14
+ in_channels: int
15
+ vec_in_dim: int
16
+ context_in_dim: int
17
+ hidden_size: int
18
+ mlp_ratio: float
19
+ num_heads: int
20
+ depth: int
21
+ depth_single_blocks: int
22
+ axes_dim: list[int]
23
+ theta: int
24
+ qkv_bias: bool
25
+ guidance_embed: bool
26
+
27
+
28
+ class Flux(nn.Module):
29
+ """
30
+ Transformer model for flow matching on sequences.
31
+ """
32
+ _supports_gradient_checkpointing = True
33
+
34
+ def __init__(self, params: FluxParams):
35
+ super().__init__()
36
+
37
+ self.params = params
38
+ self.in_channels = params.in_channels
39
+ self.out_channels = self.in_channels
40
+ if params.hidden_size % params.num_heads != 0:
41
+ raise ValueError(
42
+ f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
43
+ )
44
+ pe_dim = params.hidden_size // params.num_heads
45
+ if sum(params.axes_dim) != pe_dim:
46
+ raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
47
+ self.hidden_size = params.hidden_size
48
+ self.num_heads = params.num_heads
49
+ self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
50
+ self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
51
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
52
+ self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
53
+ self.guidance_in = (
54
+ MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity()
55
+ )
56
+ self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
57
+
58
+ self.double_blocks = nn.ModuleList(
59
+ [
60
+ DoubleStreamBlock(
61
+ self.hidden_size,
62
+ self.num_heads,
63
+ mlp_ratio=params.mlp_ratio,
64
+ qkv_bias=params.qkv_bias,
65
+ )
66
+ for _ in range(params.depth)
67
+ ]
68
+ )
69
+
70
+ self.single_blocks = nn.ModuleList(
71
+ [
72
+ SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio)
73
+ for _ in range(params.depth_single_blocks)
74
+ ]
75
+ )
76
+
77
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
78
+ self.gradient_checkpointing = False
79
+
80
+ def _set_gradient_checkpointing(self, module, value=False):
81
+ if hasattr(module, "gradient_checkpointing"):
82
+ module.gradient_checkpointing = value
83
+
84
+ @property
85
+ def attn_processors(self):
86
+ # set recursively
87
+ processors = {}
88
+
89
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors):
90
+ if hasattr(module, "set_processor"):
91
+ processors[f"{name}.processor"] = module.processor
92
+
93
+ for sub_name, child in module.named_children():
94
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
95
+
96
+ return processors
97
+
98
+ for name, module in self.named_children():
99
+ fn_recursive_add_processors(name, module, processors)
100
+
101
+ return processors
102
+
103
+ def set_attn_processor(self, processor):
104
+ r"""
105
+ Sets the attention processor to use to compute attention.
106
+
107
+ Parameters:
108
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
109
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
110
+ for **all** `Attention` layers.
111
+
112
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
113
+ processor. This is strongly recommended when setting trainable attention processors.
114
+
115
+ """
116
+ count = len(self.attn_processors.keys())
117
+
118
+ if isinstance(processor, dict) and len(processor) != count:
119
+ raise ValueError(
120
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
121
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
122
+ )
123
+
124
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
125
+ if hasattr(module, "set_processor"):
126
+ if not isinstance(processor, dict):
127
+ module.set_processor(processor)
128
+ else:
129
+ module.set_processor(processor.pop(f"{name}.processor"))
130
+
131
+ for sub_name, child in module.named_children():
132
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
133
+
134
+ for name, module in self.named_children():
135
+ fn_recursive_attn_processor(name, module, processor)
136
+
137
+ def forward(
138
+ self,
139
+ img: Tensor,
140
+ img_ids: Tensor,
141
+ txt: Tensor,
142
+ txt_ids: Tensor,
143
+ timesteps: Tensor,
144
+ y: Tensor,
145
+ block_controlnet_hidden_states=None,
146
+ guidance: Tensor | None = None,
147
+ image_proj: Tensor | None = None,
148
+ ip_scale: Tensor | float = 1.0,
149
+ ) -> Tensor:
150
+ if img.ndim != 3 or txt.ndim != 3:
151
+ raise ValueError("Input img and txt tensors must have 3 dimensions.")
152
+
153
+ # running on sequences img
154
+ img = self.img_in(img)
155
+ vec = self.time_in(timestep_embedding(timesteps, 256))
156
+ if self.params.guidance_embed:
157
+ if guidance is None:
158
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
159
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
160
+ vec = vec + self.vector_in(y)
161
+ txt = self.txt_in(txt)
162
+
163
+ ids = torch.cat((txt_ids, img_ids), dim=1)
164
+ pe = self.pe_embedder(ids)
165
+ if block_controlnet_hidden_states is not None:
166
+ controlnet_depth = len(block_controlnet_hidden_states)
167
+ for index_block, block in enumerate(self.double_blocks):
168
+ if self.training and self.gradient_checkpointing:
169
+
170
+ def create_custom_forward(module, return_dict=None):
171
+ def custom_forward(*inputs):
172
+ if return_dict is not None:
173
+ return module(*inputs, return_dict=return_dict)
174
+ else:
175
+ return module(*inputs)
176
+
177
+ return custom_forward
178
+
179
+ ckpt_kwargs = {"use_reentrant": False}  # torch >= 1.11 assumed (SDPA in math.py already requires torch >= 2.0)
180
+ img, txt = torch.utils.checkpoint.checkpoint(
181
+ create_custom_forward(block),
182
+ img,
183
+ txt,
184
+ vec,
185
+ pe,
186
+ image_proj,
187
+ ip_scale,
188
+ **ckpt_kwargs,
+ )
189
+ else:
190
+ img, txt = block(
191
+ img=img,
192
+ txt=txt,
193
+ vec=vec,
194
+ pe=pe,
195
+ image_proj=image_proj,
196
+ ip_scale=ip_scale,
197
+ )
198
+ # controlnet residual
199
+ if block_controlnet_hidden_states is not None:
200
+ img = img + block_controlnet_hidden_states[index_block % 2]
201
+
202
+
203
+ img = torch.cat((txt, img), 1)
204
+ for block in self.single_blocks:
205
+ if self.training and self.gradient_checkpointing:
206
+
207
+ def create_custom_forward(module, return_dict=None):
208
+ def custom_forward(*inputs):
209
+ if return_dict is not None:
210
+ return module(*inputs, return_dict=return_dict)
211
+ else:
212
+ return module(*inputs)
213
+
214
+ return custom_forward
215
+
216
+ ckpt_kwargs = {"use_reentrant": False}  # torch >= 1.11 assumed (SDPA in math.py already requires torch >= 2.0)
217
+ img = torch.utils.checkpoint.checkpoint(
218
+ create_custom_forward(block),
219
+ img,
220
+ vec,
221
+ pe,
222
+ **ckpt_kwargs,
+ )
223
+ else:
224
+ img = block(img, vec=vec, pe=pe)
225
+ img = img[:, txt.shape[1] :, ...]
226
+
227
+ img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
228
+ return img
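
As with the ControlNet sketch earlier, a tiny made-up configuration is enough to sanity-check the forward pass and the `block_controlnet_hidden_states` hook; every size below is illustrative only, and the import path is an assumption.

    import torch
    from src.flux.model import Flux, FluxParams  # import path is an assumption

    # Same tiny, made-up configuration as in the ControlNet sketch above.
    params = FluxParams(
        in_channels=64, vec_in_dim=16, context_in_dim=32, hidden_size=64,
        mlp_ratio=4.0, num_heads=4, depth=2, depth_single_blocks=2,
        axes_dim=[4, 6, 6], theta=10_000, qkv_bias=True, guidance_embed=False,
    )
    model = Flux(params).eval()

    B, L_img, L_txt = 1, 16, 8
    residuals = [torch.zeros(B, L_img, 64) for _ in range(2)]  # e.g. ControlNetFlux output
    with torch.no_grad():
        out = model(
            img=torch.randn(B, L_img, 64), img_ids=torch.zeros(B, L_img, 3),
            txt=torch.randn(B, L_txt, 32), txt_ids=torch.zeros(B, L_txt, 3),
            timesteps=torch.tensor([0.5]), y=torch.randn(B, 16),
            block_controlnet_hidden_states=residuals,  # added to img after every double block
        )
    print(out.shape)  # torch.Size([1, 16, 64]) -- packed latent prediction
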
src/flux/modules/autoencoder.py ADDED
@@ -0,0 +1,312 @@
1
+ from dataclasses import dataclass
2
+
3
+ import torch
4
+ from einops import rearrange
5
+ from torch import Tensor, nn
6
+
7
+
8
+ @dataclass
9
+ class AutoEncoderParams:
10
+ resolution: int
11
+ in_channels: int
12
+ ch: int
13
+ out_ch: int
14
+ ch_mult: list[int]
15
+ num_res_blocks: int
16
+ z_channels: int
17
+ scale_factor: float
18
+ shift_factor: float
19
+
20
+
21
+ def swish(x: Tensor) -> Tensor:
22
+ return x * torch.sigmoid(x)
23
+
24
+
25
+ class AttnBlock(nn.Module):
26
+ def __init__(self, in_channels: int):
27
+ super().__init__()
28
+ self.in_channels = in_channels
29
+
30
+ self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
31
+
32
+ self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
33
+ self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
34
+ self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
35
+ self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
36
+
37
+ def attention(self, h_: Tensor) -> Tensor:
38
+ h_ = self.norm(h_)
39
+ q = self.q(h_)
40
+ k = self.k(h_)
41
+ v = self.v(h_)
42
+
43
+ b, c, h, w = q.shape
44
+ q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
45
+ k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
46
+ v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
47
+ h_ = nn.functional.scaled_dot_product_attention(q, k, v)
48
+
49
+ return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
50
+
51
+ def forward(self, x: Tensor) -> Tensor:
52
+ return x + self.proj_out(self.attention(x))
53
+
54
+
55
+ class ResnetBlock(nn.Module):
56
+ def __init__(self, in_channels: int, out_channels: int):
57
+ super().__init__()
58
+ self.in_channels = in_channels
59
+ out_channels = in_channels if out_channels is None else out_channels
60
+ self.out_channels = out_channels
61
+
62
+ self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
63
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
64
+ self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
65
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
66
+ if self.in_channels != self.out_channels:
67
+ self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
68
+
69
+ def forward(self, x):
70
+ h = x
71
+ h = self.norm1(h)
72
+ h = swish(h)
73
+ h = self.conv1(h)
74
+
75
+ h = self.norm2(h)
76
+ h = swish(h)
77
+ h = self.conv2(h)
78
+
79
+ if self.in_channels != self.out_channels:
80
+ x = self.nin_shortcut(x)
81
+
82
+ return x + h
83
+
84
+
85
+ class Downsample(nn.Module):
86
+ def __init__(self, in_channels: int):
87
+ super().__init__()
88
+ # no asymmetric padding in torch conv, must do it ourselves
89
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
90
+
91
+ def forward(self, x: Tensor):
92
+ pad = (0, 1, 0, 1)
93
+ x = nn.functional.pad(x, pad, mode="constant", value=0)
94
+ x = self.conv(x)
95
+ return x
96
+
97
+
98
+ class Upsample(nn.Module):
99
+ def __init__(self, in_channels: int):
100
+ super().__init__()
101
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
102
+
103
+ def forward(self, x: Tensor):
104
+ x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
105
+ x = self.conv(x)
106
+ return x
107
+
108
+
109
+ class Encoder(nn.Module):
110
+ def __init__(
111
+ self,
112
+ resolution: int,
113
+ in_channels: int,
114
+ ch: int,
115
+ ch_mult: list[int],
116
+ num_res_blocks: int,
117
+ z_channels: int,
118
+ ):
119
+ super().__init__()
120
+ self.ch = ch
121
+ self.num_resolutions = len(ch_mult)
122
+ self.num_res_blocks = num_res_blocks
123
+ self.resolution = resolution
124
+ self.in_channels = in_channels
125
+ # downsampling
126
+ self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
127
+
128
+ curr_res = resolution
129
+ in_ch_mult = (1,) + tuple(ch_mult)
130
+ self.in_ch_mult = in_ch_mult
131
+ self.down = nn.ModuleList()
132
+ block_in = self.ch
133
+ for i_level in range(self.num_resolutions):
134
+ block = nn.ModuleList()
135
+ attn = nn.ModuleList()
136
+ block_in = ch * in_ch_mult[i_level]
137
+ block_out = ch * ch_mult[i_level]
138
+ for _ in range(self.num_res_blocks):
139
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
140
+ block_in = block_out
141
+ down = nn.Module()
142
+ down.block = block
143
+ down.attn = attn
144
+ if i_level != self.num_resolutions - 1:
145
+ down.downsample = Downsample(block_in)
146
+ curr_res = curr_res // 2
147
+ self.down.append(down)
148
+
149
+ # middle
150
+ self.mid = nn.Module()
151
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
152
+ self.mid.attn_1 = AttnBlock(block_in)
153
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
154
+
155
+ # end
156
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
157
+ self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
158
+
159
+ def forward(self, x: Tensor) -> Tensor:
160
+ # downsampling
161
+ hs = [self.conv_in(x)]
162
+ for i_level in range(self.num_resolutions):
163
+ for i_block in range(self.num_res_blocks):
164
+ h = self.down[i_level].block[i_block](hs[-1])
165
+ if len(self.down[i_level].attn) > 0:
166
+ h = self.down[i_level].attn[i_block](h)
167
+ hs.append(h)
168
+ if i_level != self.num_resolutions - 1:
169
+ hs.append(self.down[i_level].downsample(hs[-1]))
170
+
171
+ # middle
172
+ h = hs[-1]
173
+ h = self.mid.block_1(h)
174
+ h = self.mid.attn_1(h)
175
+ h = self.mid.block_2(h)
176
+ # end
177
+ h = self.norm_out(h)
178
+ h = swish(h)
179
+ h = self.conv_out(h)
180
+ return h
181
+
182
+
183
+ class Decoder(nn.Module):
184
+ def __init__(
185
+ self,
186
+ ch: int,
187
+ out_ch: int,
188
+ ch_mult: list[int],
189
+ num_res_blocks: int,
190
+ in_channels: int,
191
+ resolution: int,
192
+ z_channels: int,
193
+ ):
194
+ super().__init__()
195
+ self.ch = ch
196
+ self.num_resolutions = len(ch_mult)
197
+ self.num_res_blocks = num_res_blocks
198
+ self.resolution = resolution
199
+ self.in_channels = in_channels
200
+ self.ffactor = 2 ** (self.num_resolutions - 1)
201
+
202
+ # compute in_ch_mult, block_in and curr_res at lowest res
203
+ block_in = ch * ch_mult[self.num_resolutions - 1]
204
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
205
+ self.z_shape = (1, z_channels, curr_res, curr_res)
206
+
207
+ # z to block_in
208
+ self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
209
+
210
+ # middle
211
+ self.mid = nn.Module()
212
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
213
+ self.mid.attn_1 = AttnBlock(block_in)
214
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
215
+
216
+ # upsampling
217
+ self.up = nn.ModuleList()
218
+ for i_level in reversed(range(self.num_resolutions)):
219
+ block = nn.ModuleList()
220
+ attn = nn.ModuleList()
221
+ block_out = ch * ch_mult[i_level]
222
+ for _ in range(self.num_res_blocks + 1):
223
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
224
+ block_in = block_out
225
+ up = nn.Module()
226
+ up.block = block
227
+ up.attn = attn
228
+ if i_level != 0:
229
+ up.upsample = Upsample(block_in)
230
+ curr_res = curr_res * 2
231
+ self.up.insert(0, up) # prepend to get consistent order
232
+
233
+ # end
234
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
235
+ self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
236
+
237
+ def forward(self, z: Tensor) -> Tensor:
238
+ # z to block_in
239
+ h = self.conv_in(z)
240
+
241
+ # middle
242
+ h = self.mid.block_1(h)
243
+ h = self.mid.attn_1(h)
244
+ h = self.mid.block_2(h)
245
+
246
+ # upsampling
247
+ for i_level in reversed(range(self.num_resolutions)):
248
+ for i_block in range(self.num_res_blocks + 1):
249
+ h = self.up[i_level].block[i_block](h)
250
+ if len(self.up[i_level].attn) > 0:
251
+ h = self.up[i_level].attn[i_block](h)
252
+ if i_level != 0:
253
+ h = self.up[i_level].upsample(h)
254
+
255
+ # end
256
+ h = self.norm_out(h)
257
+ h = swish(h)
258
+ h = self.conv_out(h)
259
+ return h
260
+
261
+
262
+ class DiagonalGaussian(nn.Module):
263
+ def __init__(self, sample: bool = True, chunk_dim: int = 1):
264
+ super().__init__()
265
+ self.sample = sample
266
+ self.chunk_dim = chunk_dim
267
+
268
+ def forward(self, z: Tensor) -> Tensor:
269
+ mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
270
+ if self.sample:
271
+ std = torch.exp(0.5 * logvar)
272
+ return mean + std * torch.randn_like(mean)
273
+ else:
274
+ return mean
275
+
276
+
277
+ class AutoEncoder(nn.Module):
278
+ def __init__(self, params: AutoEncoderParams):
279
+ super().__init__()
280
+ self.encoder = Encoder(
281
+ resolution=params.resolution,
282
+ in_channels=params.in_channels,
283
+ ch=params.ch,
284
+ ch_mult=params.ch_mult,
285
+ num_res_blocks=params.num_res_blocks,
286
+ z_channels=params.z_channels,
287
+ )
288
+ self.decoder = Decoder(
289
+ resolution=params.resolution,
290
+ in_channels=params.in_channels,
291
+ ch=params.ch,
292
+ out_ch=params.out_ch,
293
+ ch_mult=params.ch_mult,
294
+ num_res_blocks=params.num_res_blocks,
295
+ z_channels=params.z_channels,
296
+ )
297
+ self.reg = DiagonalGaussian()
298
+
299
+ self.scale_factor = params.scale_factor
300
+ self.shift_factor = params.shift_factor
301
+
302
+ def encode(self, x: Tensor) -> Tensor:
303
+ z = self.reg(self.encoder(x))
304
+ z = self.scale_factor * (z - self.shift_factor)
305
+ return z
306
+
307
+ def decode(self, z: Tensor) -> Tensor:
308
+ z = z / self.scale_factor + self.shift_factor
309
+ return self.decoder(z)
310
+
311
+ def forward(self, x: Tensor) -> Tensor:
312
+ return self.decode(self.encode(x))
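
A small round-trip sketch of the autoencoder above. The configuration is deliberately tiny and made up so it runs quickly on CPU (the released FLUX autoencoder is much larger), but the scale/shift handling in encode/decode is exercised the same way; the import path is an assumption.

    import torch
    from src.flux.modules.autoencoder import AutoEncoder, AutoEncoderParams  # import path is an assumption

    # Tiny, made-up configuration (channel counts stay multiples of 32 for GroupNorm).
    params = AutoEncoderParams(
        resolution=64, in_channels=3, ch=32, out_ch=3, ch_mult=[1, 2],
        num_res_blocks=1, z_channels=4, scale_factor=0.3611, shift_factor=0.1159,
    )
    ae = AutoEncoder(params).eval()

    x = torch.randn(1, 3, 64, 64)
    with torch.no_grad():
        z = ae.encode(x)      # (1, 4, 32, 32): sampled latent, scaled and shifted
        x_hat = ae.decode(z)  # (1, 3, 64, 64): reconstruction
    print(z.shape, x_hat.shape)
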
src/flux/modules/conditioner.py ADDED
@@ -0,0 +1,38 @@
1
+ from torch import Tensor, nn
2
+ from transformers import (CLIPTextModel, CLIPTokenizer, T5EncoderModel,
3
+ T5Tokenizer)
4
+
5
+
6
+ class HFEmbedder(nn.Module):
7
+ def __init__(self, version: str, max_length: int, **hf_kwargs):
8
+ super().__init__()
9
+ self.is_clip = version.startswith("openai")
10
+ self.max_length = max_length
11
+ self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
12
+
13
+ if self.is_clip:
14
+ self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(version, max_length=max_length)
15
+ self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(version, **hf_kwargs)
16
+ else:
17
+ self.tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(version, max_length=max_length)
18
+ self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(version, **hf_kwargs)
19
+
20
+ self.hf_module = self.hf_module.eval().requires_grad_(False)
21
+
22
+ def forward(self, text: list[str]) -> Tensor:
23
+ batch_encoding = self.tokenizer(
24
+ text,
25
+ truncation=True,
26
+ max_length=self.max_length,
27
+ return_length=False,
28
+ return_overflowing_tokens=False,
29
+ padding="max_length",
30
+ return_tensors="pt",
31
+ )
32
+
33
+ outputs = self.hf_module(
34
+ input_ids=batch_encoding["input_ids"].to(self.hf_module.device),
35
+ attention_mask=None,
36
+ output_hidden_states=False,
37
+ )
38
+ return outputs[self.output_key]
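
A usage sketch of the embedder above. The checkpoint names are assumptions chosen for illustration (the repo's util.py selects the real ones, and the T5 encoder here is a small stand-in); note that the "openai" prefix is what switches HFEmbedder into CLIP mode and makes it return the pooled output instead of per-token features.

    import torch
    from src.flux.modules.conditioner import HFEmbedder  # import path is an assumption

    clip = HFEmbedder("openai/clip-vit-large-patch14", max_length=77)
    t5 = HFEmbedder("google/t5-v1_1-small", max_length=256)  # small stand-in for the xxl encoder

    prompts = ["a misty forest at dawn"]
    with torch.no_grad():
        vec = clip(prompts)  # pooled CLIP embedding, shape (1, 768)
        txt = t5(prompts)    # per-token T5 features, shape (1, 256, d_model)
    print(vec.shape, txt.shape)
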
src/flux/modules/layers.py ADDED
@@ -0,0 +1,567 @@
1
+ import math
2
+ from dataclasses import dataclass
3
+
4
+ import torch
5
+ from einops import rearrange
6
+ from torch import Tensor, nn
7
+
8
+ from ..math import attention, rope
9
+ import torch.nn.functional as F
10
+
11
+ class EmbedND(nn.Module):
12
+ def __init__(self, dim: int, theta: int, axes_dim: list[int]):
13
+ super().__init__()
14
+ self.dim = dim
15
+ self.theta = theta
16
+ self.axes_dim = axes_dim
17
+
18
+ def forward(self, ids: Tensor) -> Tensor:
19
+ n_axes = ids.shape[-1]
20
+ emb = torch.cat(
21
+ [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
22
+ dim=-3,
23
+ )
24
+
25
+ return emb.unsqueeze(1)
26
+
27
+
28
+ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
29
+ """
30
+ Create sinusoidal timestep embeddings.
31
+ :param t: a 1-D Tensor of N indices, one per batch element.
32
+ These may be fractional.
33
+ :param dim: the dimension of the output.
34
+ :param max_period: controls the minimum frequency of the embeddings.
35
+ :return: an (N, D) Tensor of positional embeddings.
36
+ """
37
+ t = time_factor * t
38
+ half = dim // 2
39
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
40
+ t.device
41
+ )
42
+
43
+ args = t[:, None].float() * freqs[None]
44
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
45
+ if dim % 2:
46
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
47
+ if torch.is_floating_point(t):
48
+ embedding = embedding.to(t)
49
+ return embedding
50
+
51
+
52
+ class MLPEmbedder(nn.Module):
53
+ def __init__(self, in_dim: int, hidden_dim: int):
54
+ super().__init__()
55
+ self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
56
+ self.silu = nn.SiLU()
57
+ self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
58
+
59
+ def forward(self, x: Tensor) -> Tensor:
60
+ return self.out_layer(self.silu(self.in_layer(x)))
61
+
62
+
63
+ class RMSNorm(torch.nn.Module):
64
+ def __init__(self, dim: int):
65
+ super().__init__()
66
+ self.scale = nn.Parameter(torch.ones(dim))
67
+
68
+ def forward(self, x: Tensor):
69
+ x_dtype = x.dtype
70
+ x = x.float()
71
+ rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
72
+ return (x * rrms).to(dtype=x_dtype) * self.scale
73
+
74
+
75
+ class QKNorm(torch.nn.Module):
76
+ def __init__(self, dim: int):
77
+ super().__init__()
78
+ self.query_norm = RMSNorm(dim)
79
+ self.key_norm = RMSNorm(dim)
80
+
81
+ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
82
+ q = self.query_norm(q)
83
+ k = self.key_norm(k)
84
+ return q.to(v), k.to(v)
85
+
86
+ class LoRALinearLayer(nn.Module):
87
+ def __init__(self, in_features, out_features, rank=4, network_alpha=None, device=None, dtype=None):
88
+ super().__init__()
89
+
90
+ self.down = nn.Linear(in_features, rank, bias=False, device=device, dtype=dtype)
91
+ self.up = nn.Linear(rank, out_features, bias=False, device=device, dtype=dtype)
92
+ # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
93
+ # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
94
+ self.network_alpha = network_alpha
95
+ self.rank = rank
96
+
97
+ nn.init.normal_(self.down.weight, std=1 / rank)
98
+ nn.init.zeros_(self.up.weight)
99
+
100
+ def forward(self, hidden_states):
101
+ orig_dtype = hidden_states.dtype
102
+ dtype = self.down.weight.dtype
103
+
104
+ down_hidden_states = self.down(hidden_states.to(dtype))
105
+ up_hidden_states = self.up(down_hidden_states)
106
+
107
+ if self.network_alpha is not None:
108
+ up_hidden_states *= self.network_alpha / self.rank
109
+
110
+ return up_hidden_states.to(orig_dtype)
111
+
112
+ class FLuxSelfAttnProcessor:
113
+ def __call__(self, attn, x, pe, **attention_kwargs):
114
+ print('2' * 30)
115
+
116
+ qkv = attn.qkv(x)
117
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
118
+ q, k = attn.norm(q, k, v)
119
+ x = attention(q, k, v, pe=pe)
120
+ x = attn.proj(x)
121
+ return x
122
+
123
+ class LoraFluxAttnProcessor(nn.Module):
124
+
125
+ def __init__(self, dim: int, rank=4, network_alpha=None, lora_weight=1):
126
+ super().__init__()
127
+ self.qkv_lora = LoRALinearLayer(dim, dim * 3, rank, network_alpha)
128
+ self.proj_lora = LoRALinearLayer(dim, dim, rank, network_alpha)
129
+ self.lora_weight = lora_weight
130
+
131
+
132
+ def __call__(self, attn, x, pe, **attention_kwargs):
133
+ qkv = attn.qkv(x) + self.qkv_lora(x) * self.lora_weight
134
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
135
+ q, k = attn.norm(q, k, v)
136
+ x = attention(q, k, v, pe=pe)
137
+ x = attn.proj(x) + self.proj_lora(x) * self.lora_weight
138
+ print('1' * 30)
139
+ print(x.norm(), (self.proj_lora(x) * self.lora_weight).norm(), 'norm')
140
+ return x
141
+
142
+ class SelfAttention(nn.Module):
143
+ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
144
+ super().__init__()
145
+ self.num_heads = num_heads
146
+ head_dim = dim // num_heads
147
+
148
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
149
+ self.norm = QKNorm(head_dim)
150
+ self.proj = nn.Linear(dim, dim)
151
+ def forward():
152
+ pass
153
+
154
+
155
+ @dataclass
156
+ class ModulationOut:
157
+ shift: Tensor
158
+ scale: Tensor
159
+ gate: Tensor
160
+
161
+
162
+ class Modulation(nn.Module):
163
+ def __init__(self, dim: int, double: bool):
164
+ super().__init__()
165
+ self.is_double = double
166
+ self.multiplier = 6 if double else 3
167
+ self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
168
+
169
+ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
170
+ out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
171
+
172
+ return (
173
+ ModulationOut(*out[:3]),
174
+ ModulationOut(*out[3:]) if self.is_double else None,
175
+ )
176
+
177
+ class DoubleStreamBlockLoraProcessor(nn.Module):
178
+ def __init__(self, dim: int, rank=4, network_alpha=None, lora_weight=1):
179
+ super().__init__()
180
+ self.qkv_lora1 = LoRALinearLayer(dim, dim * 3, rank, network_alpha)
181
+ self.proj_lora1 = LoRALinearLayer(dim, dim, rank, network_alpha)
182
+ self.qkv_lora2 = LoRALinearLayer(dim, dim * 3, rank, network_alpha)
183
+ self.proj_lora2 = LoRALinearLayer(dim, dim, rank, network_alpha)
184
+ self.lora_weight = lora_weight
185
+
186
+ def forward(self, attn, img, txt, vec, pe, **attention_kwargs):
187
+ img_mod1, img_mod2 = attn.img_mod(vec)
188
+ txt_mod1, txt_mod2 = attn.txt_mod(vec)
189
+
190
+ # prepare image for attention
191
+ img_modulated = attn.img_norm1(img)
192
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
193
+ img_qkv = attn.img_attn.qkv(img_modulated) + self.qkv_lora1(img_modulated) * self.lora_weight
194
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)
195
+ img_q, img_k = attn.img_attn.norm(img_q, img_k, img_v)
196
+
197
+ # prepare txt for attention
198
+ txt_modulated = attn.txt_norm1(txt)
199
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
200
+ txt_qkv = attn.txt_attn.qkv(txt_modulated) + self.qkv_lora2(txt_modulated) * self.lora_weight
201
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)
202
+ txt_q, txt_k = attn.txt_attn.norm(txt_q, txt_k, txt_v)
203
+
204
+ # run actual attention
205
+ q = torch.cat((txt_q, img_q), dim=2)
206
+ k = torch.cat((txt_k, img_k), dim=2)
207
+ v = torch.cat((txt_v, img_v), dim=2)
208
+
209
+ attn1 = attention(q, k, v, pe=pe)
210
+ txt_attn, img_attn = attn1[:, : txt.shape[1]], attn1[:, txt.shape[1] :]
211
+
212
+ # calculate the img blocks
213
+ img = img + img_mod1.gate * attn.img_attn.proj(img_attn) + img_mod1.gate * self.proj_lora1(img_attn) * self.lora_weight
214
+ img = img + img_mod2.gate * attn.img_mlp((1 + img_mod2.scale) * attn.img_norm2(img) + img_mod2.shift)
215
+
216
+ # calculate the txt blocks
217
+ txt = txt + txt_mod1.gate * attn.txt_attn.proj(txt_attn) + txt_mod1.gate * self.proj_lora2(txt_attn) * self.lora_weight
218
+ txt = txt + txt_mod2.gate * attn.txt_mlp((1 + txt_mod2.scale) * attn.txt_norm2(txt) + txt_mod2.shift)
219
+ return img, txt
220
+
221
+ class IPDoubleStreamBlockProcessor(nn.Module):
222
+ """Attention processor for handling IP-adapter with double stream block."""
223
+
224
+ def __init__(self, context_dim, hidden_dim):
225
+ super().__init__()
226
+ if not hasattr(F, "scaled_dot_product_attention"):
227
+ raise ImportError(
228
+ "IPDoubleStreamBlockProcessor requires PyTorch 2.0 or higher. Please upgrade PyTorch."
229
+ )
230
+
231
+ # Ensure context_dim matches the dimension of image_proj
232
+ self.context_dim = context_dim
233
+ self.hidden_dim = hidden_dim
234
+
235
+ # Initialize projections for IP-adapter
236
+ self.ip_adapter_double_stream_k_proj = nn.Linear(context_dim, hidden_dim, bias=True)
237
+ self.ip_adapter_double_stream_v_proj = nn.Linear(context_dim, hidden_dim, bias=True)
238
+
239
+ nn.init.zeros_(self.ip_adapter_double_stream_k_proj.weight)
240
+ nn.init.zeros_(self.ip_adapter_double_stream_k_proj.bias)
241
+
242
+ nn.init.zeros_(self.ip_adapter_double_stream_v_proj.weight)
243
+ nn.init.zeros_(self.ip_adapter_double_stream_v_proj.bias)
244
+
245
+ def __call__(self, attn, img, txt, vec, pe, image_proj, ip_scale=1.0, **attention_kwargs):
246
+
247
+ # Prepare image for attention
248
+ img_mod1, img_mod2 = attn.img_mod(vec)
249
+ txt_mod1, txt_mod2 = attn.txt_mod(vec)
250
+
251
+ img_modulated = attn.img_norm1(img)
252
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
253
+ img_qkv = attn.img_attn.qkv(img_modulated)
254
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim)
255
+ img_q, img_k = attn.img_attn.norm(img_q, img_k, img_v)
256
+
257
+ txt_modulated = attn.txt_norm1(txt)
258
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
259
+ txt_qkv = attn.txt_attn.qkv(txt_modulated)
260
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim)
261
+ txt_q, txt_k = attn.txt_attn.norm(txt_q, txt_k, txt_v)
262
+
263
+ q = torch.cat((txt_q, img_q), dim=2)
264
+ k = torch.cat((txt_k, img_k), dim=2)
265
+ v = torch.cat((txt_v, img_v), dim=2)
266
+
267
+ attn1 = attention(q, k, v, pe=pe)
268
+ txt_attn, img_attn = attn1[:, :txt.shape[1]], attn1[:, txt.shape[1]:]
269
+
270
+ # print(f"txt_attn shape: {txt_attn.size()}")
271
+ # print(f"img_attn shape: {img_attn.size()}")
272
+
273
+ img = img + img_mod1.gate * attn.img_attn.proj(img_attn)
274
+ img = img + img_mod2.gate * attn.img_mlp((1 + img_mod2.scale) * attn.img_norm2(img) + img_mod2.shift)
275
+
276
+ txt = txt + txt_mod1.gate * attn.txt_attn.proj(txt_attn)
277
+ txt = txt + txt_mod2.gate * attn.txt_mlp((1 + txt_mod2.scale) * attn.txt_norm2(txt) + txt_mod2.shift)
278
+
279
+
280
+ # IP-adapter processing
281
+ ip_query = img_q # latent sample query
282
+ ip_key = self.ip_adapter_double_stream_k_proj(image_proj)
283
+ ip_value = self.ip_adapter_double_stream_v_proj(image_proj)
284
+
285
+ # Reshape projections for multi-head attention
286
+ ip_key = rearrange(ip_key, 'B L (H D) -> B H L D', H=attn.num_heads, D=attn.head_dim)
287
+ ip_value = rearrange(ip_value, 'B L (H D) -> B H L D', H=attn.num_heads, D=attn.head_dim)
288
+
289
+ # Compute attention between IP projections and the latent query
290
+ ip_attention = F.scaled_dot_product_attention(
291
+ ip_query,
292
+ ip_key,
293
+ ip_value,
294
+ dropout_p=0.0,
295
+ is_causal=False
296
+ )
297
+ ip_attention = rearrange(ip_attention, "B H L D -> B L (H D)", H=attn.num_heads, D=attn.head_dim)
298
+
299
+ img = img + ip_scale * ip_attention
300
+
301
+ return img, txt
302
+
303
+ class DoubleStreamBlockProcessor:
304
+ def __call__(self, attn, img, txt, vec, pe, **attention_kwargs):
305
+ img_mod1, img_mod2 = attn.img_mod(vec)
306
+ txt_mod1, txt_mod2 = attn.txt_mod(vec)
307
+
308
+ # prepare image for attention
309
+ img_modulated = attn.img_norm1(img)
310
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
311
+ img_qkv = attn.img_attn.qkv(img_modulated)
312
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim)
313
+ img_q, img_k = attn.img_attn.norm(img_q, img_k, img_v)
314
+
315
+ # prepare txt for attention
316
+ txt_modulated = attn.txt_norm1(txt)
317
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
318
+ txt_qkv = attn.txt_attn.qkv(txt_modulated)
319
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim)
320
+ txt_q, txt_k = attn.txt_attn.norm(txt_q, txt_k, txt_v)
321
+
322
+ # run actual attention
323
+ q = torch.cat((txt_q, img_q), dim=2)
324
+ k = torch.cat((txt_k, img_k), dim=2)
325
+ v = torch.cat((txt_v, img_v), dim=2)
326
+
327
+ attn1 = attention(q, k, v, pe=pe)
328
+ txt_attn, img_attn = attn1[:, : txt.shape[1]], attn1[:, txt.shape[1] :]
329
+
330
+ # calculate the img blocks
331
+ img = img + img_mod1.gate * attn.img_attn.proj(img_attn)
332
+ img = img + img_mod2.gate * attn.img_mlp((1 + img_mod2.scale) * attn.img_norm2(img) + img_mod2.shift)
333
+
334
+ # calculate the txt blocks
335
+ txt = txt + txt_mod1.gate * attn.txt_attn.proj(txt_attn)
336
+ txt = txt + txt_mod2.gate * attn.txt_mlp((1 + txt_mod2.scale) * attn.txt_norm2(txt) + txt_mod2.shift)
337
+ return img, txt
338
+
339
+ class DoubleStreamBlock(nn.Module):
340
+ def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False):
341
+ super().__init__()
342
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
343
+ self.num_heads = num_heads
344
+ self.hidden_size = hidden_size
345
+ self.head_dim = hidden_size // num_heads
346
+
347
+ self.img_mod = Modulation(hidden_size, double=True)
348
+ self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
349
+ self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
350
+
351
+ self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
352
+ self.img_mlp = nn.Sequential(
353
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
354
+ nn.GELU(approximate="tanh"),
355
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
356
+ )
357
+
358
+ self.txt_mod = Modulation(hidden_size, double=True)
359
+ self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
360
+ self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
361
+
362
+ self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
363
+ self.txt_mlp = nn.Sequential(
364
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
365
+ nn.GELU(approximate="tanh"),
366
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
367
+ )
368
+ processor = DoubleStreamBlockProcessor()
369
+ self.set_processor(processor)
370
+
371
+ def set_processor(self, processor) -> None:
372
+ self.processor = processor
373
+
374
+ def get_processor(self):
375
+ return self.processor
376
+
377
+ def forward(
378
+ self,
379
+ img: Tensor,
380
+ txt: Tensor,
381
+ vec: Tensor,
382
+ pe: Tensor,
383
+ image_proj: Tensor = None,
384
+ ip_scale: float =1.0,
385
+ ) -> tuple[Tensor, Tensor]:
386
+ if image_proj is None:
387
+ return self.processor(self, img, txt, vec, pe)
388
+ else:
389
+ return self.processor(self, img, txt, vec, pe, image_proj, ip_scale)
390
+
391
+ class IPSingleStreamBlockProcessor(nn.Module):
392
+ """Attention processor for handling IP-adapter with single stream block."""
393
+ def __init__(self, context_dim, hidden_dim):
394
+ super().__init__()
395
+ if not hasattr(F, "scaled_dot_product_attention"):
396
+ raise ImportError(
397
+ "IPSingleStreamBlockProcessor requires PyTorch 2.0 or higher. Please upgrade PyTorch."
398
+ )
399
+
400
+ # Ensure context_dim matches the dimension of image_proj
401
+ self.context_dim = context_dim
402
+ self.hidden_dim = hidden_dim
403
+
404
+ # Initialize projections for IP-adapter
405
+ self.ip_adapter_single_stream_k_proj = nn.Linear(context_dim, hidden_dim, bias=False)
406
+ self.ip_adapter_single_stream_v_proj = nn.Linear(context_dim, hidden_dim, bias=False)
407
+
408
+ nn.init.zeros_(self.ip_adapter_single_stream_k_proj.weight)
409
+ nn.init.zeros_(self.ip_adapter_single_stream_v_proj.weight)
410
+
411
+ def __call__(
412
+ self,
413
+ attn: nn.Module,
414
+ x: Tensor,
415
+ vec: Tensor,
416
+ pe: Tensor,
417
+ image_proj: Tensor | None = None,
418
+ ip_scale: float = 1.0
419
+ ) -> Tensor:
420
+
421
+ mod, _ = attn.modulation(vec)
422
+ x_mod = (1 + mod.scale) * attn.pre_norm(x) + mod.shift
423
+ qkv, mlp = torch.split(attn.linear1(x_mod), [3 * attn.hidden_size, attn.mlp_hidden_dim], dim=-1)
424
+
425
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim)
426
+ q, k = attn.norm(q, k, v)
427
+
428
+ # compute attention
429
+ attn_1 = attention(q, k, v, pe=pe)
430
+
431
+ # IP-adapter processing
432
+ ip_query = q
433
+ ip_key = self.ip_adapter_single_stream_k_proj(image_proj)
434
+ ip_value = self.ip_adapter_single_stream_v_proj(image_proj)
435
+
436
+ # Reshape projections for multi-head attention
437
+ ip_key = rearrange(ip_key, 'B L (H D) -> B H L D', H=attn.num_heads, D=attn.head_dim)
438
+ ip_value = rearrange(ip_value, 'B L (H D) -> B H L D', H=attn.num_heads, D=attn.head_dim)
439
+
440
+
441
+ # Compute attention between IP projections and the latent query
442
+ ip_attention = F.scaled_dot_product_attention(
443
+ ip_query,
444
+ ip_key,
445
+ ip_value
446
+ )
447
+ ip_attention = rearrange(ip_attention, "B H L D -> B L (H D)")
448
+
449
+ attn_out = attn_1 + ip_scale * ip_attention
450
+
451
+ # compute activation in mlp stream, cat again and run second linear layer
452
+ output = attn.linear2(torch.cat((attn_out, attn.mlp_act(mlp)), 2))
453
+ out = x + mod.gate * output
454
+
455
+ return out
456
+
457
+ class SingleStreamBlockProcessor:
458
+ def __call__(self, attn: nn.Module, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
459
+
460
+ mod, _ = attn.modulation(vec)
461
+ x_mod = (1 + mod.scale) * attn.pre_norm(x) + mod.shift
462
+ qkv, mlp = torch.split(attn.linear1(x_mod), [3 * attn.hidden_size, attn.mlp_hidden_dim], dim=-1)
463
+
464
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)
465
+ q, k = attn.norm(q, k, v)
466
+
467
+ # compute attention
468
+ attn_1 = attention(q, k, v, pe=pe)
469
+
470
+ # compute activation in mlp stream, cat again and run second linear layer
471
+ output = attn.linear2(torch.cat((attn_1, attn.mlp_act(mlp)), 2))
472
+ output = x + mod.gate * output
473
+ return output
474
+
475
+ class SingleStreamBlock(nn.Module):
476
+ """
477
+ A DiT block with parallel linear layers as described in
478
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
479
+ """
480
+
481
+ def __init__(
482
+ self,
483
+ hidden_size: int,
484
+ num_heads: int,
485
+ mlp_ratio: float = 4.0,
486
+ qk_scale: float | None = None,
487
+ ):
488
+ super().__init__()
489
+ self.hidden_dim = hidden_size
490
+ self.num_heads = num_heads
491
+ self.head_dim = hidden_size // num_heads
492
+ self.scale = qk_scale or self.head_dim**-0.5
493
+
494
+ self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
495
+ # qkv and mlp_in
496
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
497
+ # proj and mlp_out
498
+ self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
499
+
500
+ self.norm = QKNorm(self.head_dim)
501
+
502
+ self.hidden_size = hidden_size
503
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
504
+
505
+ self.mlp_act = nn.GELU(approximate="tanh")
506
+ self.modulation = Modulation(hidden_size, double=False)
507
+
508
+ processor = SingleStreamBlockProcessor()
509
+ self.set_processor(processor)
510
+
511
+
512
+ def set_processor(self, processor) -> None:
513
+ self.processor = processor
514
+
515
+ def get_processor(self):
516
+ return self.processor
517
+
518
+ def forward(
519
+ self,
520
+ x: Tensor,
521
+ vec: Tensor,
522
+ pe: Tensor,
523
+ image_proj: Tensor | None = None,
524
+ ip_scale: float = 1.0
525
+ ) -> Tensor:
526
+ if image_proj is None:
527
+ return self.processor(self, x, vec, pe)
528
+ else:
529
+ return self.processor(self, x, vec, pe, image_proj, ip_scale)
530
+
531
+
532
+
533
+ class LastLayer(nn.Module):
534
+ def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
535
+ super().__init__()
536
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
537
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
538
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
539
+
540
+ def forward(self, x: Tensor, vec: Tensor) -> Tensor:
541
+ shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
542
+ x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
543
+ x = self.linear(x)
544
+ return x
545
+
546
+ class ImageProjModel(torch.nn.Module):
547
+ """Projection Model
548
+ https://github.com/tencent-ailab/IP-Adapter/blob/main/ip_adapter/ip_adapter.py#L28
549
+ """
550
+
551
+ def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4):
552
+ super().__init__()
553
+
554
+ self.generator = None
555
+ self.cross_attention_dim = cross_attention_dim
556
+ self.clip_extra_context_tokens = clip_extra_context_tokens
557
+ self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)
558
+ self.norm = torch.nn.LayerNorm(cross_attention_dim)
559
+
560
+ def forward(self, image_embeds):
561
+ embeds = image_embeds
562
+ clip_extra_context_tokens = self.proj(embeds).reshape(
563
+ -1, self.clip_extra_context_tokens, self.cross_attention_dim
564
+ )
565
+ clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
566
+ return clip_extra_context_tokens
567
+
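The processor indirection above is what lets the LoRA and IP-Adapter variants be swapped in without touching the block itself. The sketch below wires a DoubleStreamBlockLoraProcessor into one tiny block (made-up sizes, import path assumed) and runs a single forward pass; the LoRA "up" weights start at zero, so the output initially matches the stock processor.

    import torch
    from src.flux.modules.layers import (DoubleStreamBlock,  # import path is an assumption
                                         DoubleStreamBlockLoraProcessor, EmbedND)

    hidden_size, num_heads = 64, 4  # tiny, made-up sizes
    block = DoubleStreamBlock(hidden_size, num_heads, mlp_ratio=4.0, qkv_bias=True)
    block.set_processor(DoubleStreamBlockLoraProcessor(dim=hidden_size, rank=4, lora_weight=1.0))

    B, L_txt, L_img = 1, 8, 16
    ids = torch.zeros(B, L_txt + L_img, 3)
    pe = EmbedND(dim=hidden_size // num_heads, theta=10_000, axes_dim=[4, 6, 6])(ids)

    img = torch.randn(B, L_img, hidden_size)
    txt = torch.randn(B, L_txt, hidden_size)
    vec = torch.randn(B, hidden_size)
    with torch.no_grad():
        img_out, txt_out = block(img=img, txt=txt, vec=vec, pe=pe)
    print(img_out.shape, txt_out.shape)  # unchanged shapes; LoRA deltas start at zero
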
src/flux/sampling.py ADDED
@@ -0,0 +1,242 @@
+ import math
+ from typing import Callable
+
+ import torch
+ from einops import rearrange, repeat
+ from torch import Tensor
+
+ from .model import Flux
+ from .modules.conditioner import HFEmbedder
+
+
+ def get_noise(
+     num_samples: int,
+     height: int,
+     width: int,
+     device: torch.device,
+     dtype: torch.dtype,
+     seed: int,
+ ):
+     return torch.randn(
+         num_samples,
+         16,
+         # allow for packing
+         2 * math.ceil(height / 16),
+         2 * math.ceil(width / 16),
+         device=device,
+         dtype=dtype,
+         generator=torch.Generator(device=device).manual_seed(seed),
+     )
+
+
+ def prepare(t5: HFEmbedder, clip: HFEmbedder, img: Tensor, prompt: str | list[str]) -> dict[str, Tensor]:
+     bs, c, h, w = img.shape
+     if bs == 1 and not isinstance(prompt, str):
+         bs = len(prompt)
+
+     img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
+     if img.shape[0] == 1 and bs > 1:
+         img = repeat(img, "1 ... -> bs ...", bs=bs)
+
+     img_ids = torch.zeros(h // 2, w // 2, 3)
+     img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2)[:, None]
+     img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2)[None, :]
+     img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
+
+     if isinstance(prompt, str):
+         prompt = [prompt]
+     txt = t5(prompt)
+     if txt.shape[0] == 1 and bs > 1:
+         txt = repeat(txt, "1 ... -> bs ...", bs=bs)
+     txt_ids = torch.zeros(bs, txt.shape[1], 3)
+
+     vec = clip(prompt)
+     if vec.shape[0] == 1 and bs > 1:
+         vec = repeat(vec, "1 ... -> bs ...", bs=bs)
+
+     return {
+         "img": img,
+         "img_ids": img_ids.to(img.device),
+         "txt": txt.to(img.device),
+         "txt_ids": txt_ids.to(img.device),
+         "vec": vec.to(img.device),
+     }
+
+
+ def time_shift(mu: float, sigma: float, t: Tensor):
+     return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
+
+
+ def get_lin_function(
+     x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
+ ) -> Callable[[float], float]:
+     m = (y2 - y1) / (x2 - x1)
+     b = y1 - m * x1
+     return lambda x: m * x + b
+
+
+ def get_schedule(
+     num_steps: int,
+     image_seq_len: int,
+     base_shift: float = 0.5,
+     max_shift: float = 1.15,
+     shift: bool = True,
+ ) -> list[float]:
+     # extra step for zero
+     timesteps = torch.linspace(1, 0, num_steps + 1)
+
+     # shifting the schedule to favor high timesteps for higher signal images
+     if shift:
+         # estimate mu via linear interpolation between two reference points
+         mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
+         timesteps = time_shift(mu, 1.0, timesteps)
+
+     return timesteps.tolist()
+
+
+ def denoise(
+     model: Flux,
+     # model input
+     img: Tensor,
+     img_ids: Tensor,
+     txt: Tensor,
+     txt_ids: Tensor,
+     vec: Tensor,
+     neg_txt: Tensor,
+     neg_txt_ids: Tensor,
+     neg_vec: Tensor,
+     # sampling parameters
+     timesteps: list[float],
+     guidance: float = 4.0,
+     true_gs: float = 1,
+     timestep_to_start_cfg: int = 0,
+     # ip-adapter parameters
+     image_proj: Tensor | None = None,
+     neg_image_proj: Tensor | None = None,
+     ip_scale: Tensor | float = 1.0,
+     neg_ip_scale: Tensor | float = 1.0,
+ ):
+     i = 0
+     # this is ignored for schnell
+     guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
+     for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
+         t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
+         pred = model(
+             img=img,
+             img_ids=img_ids,
+             txt=txt,
+             txt_ids=txt_ids,
+             y=vec,
+             timesteps=t_vec,
+             guidance=guidance_vec,
+             image_proj=image_proj,
+             ip_scale=ip_scale,
+         )
+         if i >= timestep_to_start_cfg:
+             neg_pred = model(
+                 img=img,
+                 img_ids=img_ids,
+                 txt=neg_txt,
+                 txt_ids=neg_txt_ids,
+                 y=neg_vec,
+                 timesteps=t_vec,
+                 guidance=guidance_vec,
+                 image_proj=neg_image_proj,
+                 ip_scale=neg_ip_scale,
+             )
+             pred = neg_pred + true_gs * (pred - neg_pred)
+         img = img + (t_prev - t_curr) * pred
+         i += 1
+     return img
+
+
+ def denoise_controlnet(
+     model: Flux,
+     controlnet,  # ControlNetFlux instance (see controlnet.py)
+     # model input
+     img: Tensor,
+     img_ids: Tensor,
+     txt: Tensor,
+     txt_ids: Tensor,
+     vec: Tensor,
+     neg_txt: Tensor,
+     neg_txt_ids: Tensor,
+     neg_vec: Tensor,
+     controlnet_cond,
+     # sampling parameters
+     timesteps: list[float],
+     guidance: float = 4.0,
+     true_gs: float = 1,
+     controlnet_gs: float = 0.7,
+     timestep_to_start_cfg: int = 0,
+     # ip-adapter parameters
+     image_proj: Tensor | None = None,
+     neg_image_proj: Tensor | None = None,
+     ip_scale: Tensor | float = 1,
+     neg_ip_scale: Tensor | float = 1,
+ ):
+     # this is ignored for schnell
+     i = 0
+     guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
+     for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
+         t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
+         block_res_samples = controlnet(
+             img=img,
+             img_ids=img_ids,
+             controlnet_cond=controlnet_cond,
+             txt=txt,
+             txt_ids=txt_ids,
+             y=vec,
+             timesteps=t_vec,
+             guidance=guidance_vec,
+         )
+         pred = model(
+             img=img,
+             img_ids=img_ids,
+             txt=txt,
+             txt_ids=txt_ids,
+             y=vec,
+             timesteps=t_vec,
+             guidance=guidance_vec,
+             block_controlnet_hidden_states=[sample * controlnet_gs for sample in block_res_samples],
+             image_proj=image_proj,
+             ip_scale=ip_scale,
+         )
+         if i >= timestep_to_start_cfg:
+             neg_block_res_samples = controlnet(
+                 img=img,
+                 img_ids=img_ids,
+                 controlnet_cond=controlnet_cond,
+                 txt=neg_txt,
+                 txt_ids=neg_txt_ids,
+                 y=neg_vec,
+                 timesteps=t_vec,
+                 guidance=guidance_vec,
+             )
+             neg_pred = model(
+                 img=img,
+                 img_ids=img_ids,
+                 txt=neg_txt,
+                 txt_ids=neg_txt_ids,
+                 y=neg_vec,
+                 timesteps=t_vec,
+                 guidance=guidance_vec,
+                 block_controlnet_hidden_states=[sample * controlnet_gs for sample in neg_block_res_samples],
+                 image_proj=neg_image_proj,
+                 ip_scale=neg_ip_scale,
+             )
+             pred = neg_pred + true_gs * (pred - neg_pred)
+
+         img = img + (t_prev - t_curr) * pred
+
+         i += 1
+     return img
+
+
+ def unpack(x: Tensor, height: int, width: int) -> Tensor:
+     return rearrange(
+         x,
+         "b (h w) (c ph pw) -> b c (h ph) (w pw)",
+         h=math.ceil(height / 16),
+         w=math.ceil(width / 16),
+         ph=2,
+         pw=2,
+     )
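Note: get_schedule starts from num_steps + 1 linearly spaced timesteps in [1, 0] and, when shift is enabled, warps them with time_shift so that longer image token sequences spend more of the trajectory at high noise levels. A standalone sketch of that warp (the two helpers are re-implemented here so the snippet runs on its own; a sequence length of 4096 corresponds to a 1024x1024 image given the 16x reduction in get_noise and the 2x2 packing in prepare):

import math
import torch

# Standalone re-implementation of the two schedule helpers above, for illustration only.
def get_lin_function(x1=256, y1=0.5, x2=4096, y2=1.15):
    m = (y2 - y1) / (x2 - x1)
    return lambda x: m * x + (y1 - m * x1)

def time_shift(mu, sigma, t):
    return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)

num_steps, image_seq_len = 4, 4096
linear = torch.linspace(1, 0, num_steps + 1)
mu = get_lin_function()(image_seq_len)     # 1.15 for a 4096-token image
shifted = time_shift(mu, 1.0, linear)

print(linear.tolist())    # [1.0, 0.75, 0.5, 0.25, 0.0]
print(shifted.tolist())   # intermediate steps are pushed toward 1.0, i.e. more time at high noise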
src/flux/util.py ADDED
@@ -0,0 +1,383 @@
+ import os
+ from dataclasses import dataclass
+
+ import torch
+ import json
+ import cv2
+ import numpy as np
+ from PIL import Image
+ from einops import rearrange  # used by WatermarkEmbedder below
+ from imwatermark import WatermarkEncoder  # used by WatermarkEmbedder below
+ from huggingface_hub import hf_hub_download
+ from safetensors import safe_open
+ from safetensors.torch import load_file as load_sft
+
+ from optimum.quanto import requantize
+
+ from .model import Flux, FluxParams
+ from .controlnet import ControlNetFlux
+ from .modules.autoencoder import AutoEncoder, AutoEncoderParams
+ from .modules.conditioner import HFEmbedder
+ from .annotator.dwpose import DWposeDetector
+ from .annotator.mlsd import MLSDdetector
+ from .annotator.canny import CannyDetector
+ from .annotator.midas import MidasDetector
+ from .annotator.hed import HEDdetector
+ from .annotator.tile import TileDetector
+
+
+ def load_safetensors(path):
+     tensors = {}
+     with safe_open(path, framework="pt", device="cpu") as f:
+         for key in f.keys():
+             tensors[key] = f.get_tensor(key)
+     return tensors
+
+
+ def get_lora_rank(checkpoint):
+     for k in checkpoint.keys():
+         if k.endswith(".down.weight"):
+             return checkpoint[k].shape[0]
+
+
+ def load_checkpoint(local_path, repo_id, name):
+     if local_path is not None:
+         if '.safetensors' in local_path:
+             print("Loading .safetensors checkpoint...")
+             checkpoint = load_safetensors(local_path)
+         else:
+             print("Loading checkpoint...")
+             checkpoint = torch.load(local_path, map_location='cpu')
+     elif repo_id is not None and name is not None:
+         print("Loading checkpoint from repo id...")
+         checkpoint = load_from_repo_id(repo_id, name)
+     else:
+         raise ValueError(
+             "LOADING ERROR: you must specify local_path or repo_id with name in HF to download"
+         )
+     return checkpoint
+
+
+ def c_crop(image):
+     width, height = image.size
+     new_size = min(width, height)
+     left = (width - new_size) / 2
+     top = (height - new_size) / 2
+     right = (width + new_size) / 2
+     bottom = (height + new_size) / 2
+     return image.crop((left, top, right, bottom))
+
+
+ class Annotator:
+     def __init__(self, name: str, device: str):
+         if name == "canny":
+             processor = CannyDetector()
+         elif name == "openpose":
+             processor = DWposeDetector(device)
+         elif name == "depth":
+             processor = MidasDetector()
+         elif name == "hed":
+             processor = HEDdetector()
+         elif name == "hough":
+             processor = MLSDdetector()
+         elif name == "tile":
+             processor = TileDetector()
+         else:
+             raise ValueError(f"Unknown annotator name: {name}")
+         self.name = name
+         self.processor = processor
+
+     def __call__(self, image: Image, width: int, height: int):
+         image = c_crop(image)
+         image = image.resize((width, height))
+         image = np.array(image)
+         if self.name == "canny":
+             result = self.processor(image, low_threshold=100, high_threshold=200)
+         elif self.name == "hough":
+             result = self.processor(image, thr_v=0.05, thr_d=5)
+         elif self.name == "depth":
+             result = self.processor(image)
+             result, _ = result
+         else:
+             result = self.processor(image)
+
+         if result.ndim != 3:
+             result = result[:, :, None]
+             result = np.concatenate([result, result, result], axis=2)
+         return result
+
+
+ @dataclass
+ class ModelSpec:
+     params: FluxParams
+     ae_params: AutoEncoderParams
+     ckpt_path: str | None
+     ae_path: str | None
+     repo_id: str | None
+     repo_flow: str | None
+     repo_ae: str | None
+     repo_id_ae: str | None
+
+
+ configs = {
+     "flux-dev": ModelSpec(
+         repo_id="black-forest-labs/FLUX.1-dev",
+         repo_id_ae="black-forest-labs/FLUX.1-dev",
+         repo_flow="flux1-dev.safetensors",
+         repo_ae="ae.safetensors",
+         ckpt_path=os.getenv("FLUX_DEV"),
+         params=FluxParams(
+             in_channels=64,
+             vec_in_dim=768,
+             context_in_dim=4096,
+             hidden_size=3072,
+             mlp_ratio=4.0,
+             num_heads=24,
+             depth=19,
+             depth_single_blocks=38,
+             axes_dim=[16, 56, 56],
+             theta=10_000,
+             qkv_bias=True,
+             guidance_embed=True,
+         ),
+         ae_path=os.getenv("AE"),
+         ae_params=AutoEncoderParams(
+             resolution=256,
+             in_channels=3,
+             ch=128,
+             out_ch=3,
+             ch_mult=[1, 2, 4, 4],
+             num_res_blocks=2,
+             z_channels=16,
+             scale_factor=0.3611,
+             shift_factor=0.1159,
+         ),
+     ),
+     "flux-dev-fp8": ModelSpec(
+         repo_id="XLabs-AI/flux-dev-fp8",
+         repo_id_ae="black-forest-labs/FLUX.1-dev",
+         repo_flow="flux-dev-fp8.safetensors",
+         repo_ae="ae.safetensors",
+         ckpt_path=os.getenv("FLUX_DEV_FP8"),
+         params=FluxParams(
+             in_channels=64,
+             vec_in_dim=768,
+             context_in_dim=4096,
+             hidden_size=3072,
+             mlp_ratio=4.0,
+             num_heads=24,
+             depth=19,
+             depth_single_blocks=38,
+             axes_dim=[16, 56, 56],
+             theta=10_000,
+             qkv_bias=True,
+             guidance_embed=True,
+         ),
+         ae_path=os.getenv("AE"),
+         ae_params=AutoEncoderParams(
+             resolution=256,
+             in_channels=3,
+             ch=128,
+             out_ch=3,
+             ch_mult=[1, 2, 4, 4],
+             num_res_blocks=2,
+             z_channels=16,
+             scale_factor=0.3611,
+             shift_factor=0.1159,
+         ),
+     ),
+     "flux-schnell": ModelSpec(
+         repo_id="black-forest-labs/FLUX.1-schnell",
+         repo_id_ae="black-forest-labs/FLUX.1-dev",
+         repo_flow="flux1-schnell.safetensors",
+         repo_ae="ae.safetensors",
+         ckpt_path=os.getenv("FLUX_SCHNELL"),
+         params=FluxParams(
+             in_channels=64,
+             vec_in_dim=768,
+             context_in_dim=4096,
+             hidden_size=3072,
+             mlp_ratio=4.0,
+             num_heads=24,
+             depth=19,
+             depth_single_blocks=38,
+             axes_dim=[16, 56, 56],
+             theta=10_000,
+             qkv_bias=True,
+             guidance_embed=False,
+         ),
+         ae_path=os.getenv("AE"),
+         ae_params=AutoEncoderParams(
+             resolution=256,
+             in_channels=3,
+             ch=128,
+             out_ch=3,
+             ch_mult=[1, 2, 4, 4],
+             num_res_blocks=2,
+             z_channels=16,
+             scale_factor=0.3611,
+             shift_factor=0.1159,
+         ),
+     ),
+ }
+
+
+ def print_load_warning(missing: list[str], unexpected: list[str]) -> None:
+     if len(missing) > 0 and len(unexpected) > 0:
+         print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
+         print("\n" + "-" * 79 + "\n")
+         print(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))
+     elif len(missing) > 0:
+         print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
+     elif len(unexpected) > 0:
+         print(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))
+
+
+ def load_from_repo_id(repo_id, checkpoint_name):
+     ckpt_path = hf_hub_download(repo_id, checkpoint_name)
+     sd = load_sft(ckpt_path, device='cpu')
+     return sd
+
+
+ def load_flow_model(name: str, device: str | torch.device = "cuda", hf_download: bool = True):
+     # Loading Flux
+     print("Init model")
+     ckpt_path = configs[name].ckpt_path
+     if (
+         ckpt_path is None
+         and configs[name].repo_id is not None
+         and configs[name].repo_flow is not None
+         and hf_download
+     ):
+         ckpt_path = hf_hub_download(configs[name].repo_id, configs[name].repo_flow)
+
+     with torch.device("meta" if ckpt_path is not None else device):
+         model = Flux(configs[name].params).to(torch.bfloat16)
+
+     if ckpt_path is not None:
+         print("Loading checkpoint")
+         # load_sft doesn't support torch.device
+         sd = load_sft(ckpt_path, device=str(device))
+         missing, unexpected = model.load_state_dict(sd, strict=False, assign=True)
+         print_load_warning(missing, unexpected)
+     return model
+
+
+ def load_flow_model2(name: str, device: str | torch.device = "cuda", hf_download: bool = True):
+     # Loading Flux
+     print("Init model")
+     ckpt_path = configs[name].ckpt_path
+     if (
+         ckpt_path is None
+         and configs[name].repo_id is not None
+         and configs[name].repo_flow is not None
+         and hf_download
+     ):
+         ckpt_path = hf_hub_download(configs[name].repo_id, configs[name].repo_flow.replace("sft", "safetensors"))
+
+     with torch.device("meta" if ckpt_path is not None else device):
+         model = Flux(configs[name].params)
+
+     if ckpt_path is not None:
+         print("Loading checkpoint")
+         # load_sft doesn't support torch.device
+         sd = load_sft(ckpt_path, device=str(device))
+         missing, unexpected = model.load_state_dict(sd, strict=False, assign=True)
+         print_load_warning(missing, unexpected)
+     return model
+
+
+ def load_flow_model_quintized(name: str, device: str | torch.device = "cuda", hf_download: bool = True):
+     # Loading Flux
+     print("Init model")
+     ckpt_path = configs[name].ckpt_path
+     if (
+         ckpt_path is None
+         and configs[name].repo_id is not None
+         and configs[name].repo_flow is not None
+         and hf_download
+     ):
+         ckpt_path = hf_hub_download(configs[name].repo_id, configs[name].repo_flow)
+         json_path = hf_hub_download(configs[name].repo_id, 'flux_dev_quantization_map.json')
+
+     model = Flux(configs[name].params).to(torch.bfloat16)
+
+     print("Loading checkpoint")
+     # load_sft doesn't support torch.device
+     sd = load_sft(ckpt_path, device='cpu')
+     with open(json_path, "r") as f:
+         quantization_map = json.load(f)
+     print("Start a quantization process...")
+     requantize(model, sd, quantization_map, device=device)
+     print("Model is quantized!")
+     return model
+
+
+ def load_controlnet(name, device, transformer=None):
+     with torch.device(device):
+         controlnet = ControlNetFlux(configs[name].params)
+     if transformer is not None:
+         controlnet.load_state_dict(transformer.state_dict(), strict=False)
+     return controlnet
+
+
+ def load_t5(device: str | torch.device = "cuda", max_length: int = 512) -> HFEmbedder:
+     # max length 64, 128, 256 and 512 should work (if your sequence is short enough)
+     return HFEmbedder("xlabs-ai/xflux_text_encoders", max_length=max_length, torch_dtype=torch.bfloat16).to(device)
+
+
+ def load_clip(device: str | torch.device = "cuda") -> HFEmbedder:
+     return HFEmbedder("openai/clip-vit-large-patch14", max_length=77, torch_dtype=torch.bfloat16).to(device)
+
+
+ def load_ae(name: str, device: str | torch.device = "cuda", hf_download: bool = True) -> AutoEncoder:
+     ckpt_path = configs[name].ae_path
+     if (
+         ckpt_path is None
+         and configs[name].repo_id is not None
+         and configs[name].repo_ae is not None
+         and hf_download
+     ):
+         ckpt_path = hf_hub_download(configs[name].repo_id_ae, configs[name].repo_ae)
+
+     # Loading the autoencoder
+     print("Init AE")
+     with torch.device("meta" if ckpt_path is not None else device):
+         ae = AutoEncoder(configs[name].ae_params)
+
+     if ckpt_path is not None:
+         sd = load_sft(ckpt_path, device=str(device))
+         missing, unexpected = ae.load_state_dict(sd, strict=False, assign=True)
+         print_load_warning(missing, unexpected)
+     return ae
+
+
+ class WatermarkEmbedder:
+     def __init__(self, watermark):
+         self.watermark = watermark
+         self.num_bits = len(WATERMARK_BITS)
+         self.encoder = WatermarkEncoder()
+         self.encoder.set_watermark("bits", self.watermark)
+
+     def __call__(self, image: torch.Tensor) -> torch.Tensor:
+         """
+         Adds a predefined watermark to the input image
+
+         Args:
+             image: ([N,] B, RGB, H, W) in range [-1, 1]
+
+         Returns:
+             same as input but watermarked
+         """
+         image = 0.5 * image + 0.5
+         squeeze = len(image.shape) == 4
+         if squeeze:
+             image = image[None, ...]
+         n = image.shape[0]
+         image_np = rearrange((255 * image).detach().cpu(), "n b c h w -> (n b) h w c").numpy()[:, :, :, ::-1]
+         # torch (b, c, h, w) in [0, 1] -> numpy (b, h, w, c) [0, 255]
+         # watermarking library expects input as cv2 BGR format
+         for k in range(image_np.shape[0]):
+             image_np[k] = self.encoder.encode(image_np[k], "dwtDct")
+         image = torch.from_numpy(rearrange(image_np[:, :, :, ::-1], "(n b) h w c -> n b c h w", n=n)).to(
+             image.device
+         )
+         image = torch.clamp(image / 255, min=0.0, max=1.0)
+         if squeeze:
+             image = image[0]
+         image = 2 * image - 1
+         return image
+
+
+ # A fixed 48-bit message that was chosen at random
+ WATERMARK_MESSAGE = 0b001010101111111010000111100111001111010100101110
+ # bin(x)[2:] gives bits of x as str, use int to convert them to 0/1
+ WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]]
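Note: the loaders above are typically wired together before calling the sampling helpers from src/flux/sampling.py. A minimal sketch of that wiring, assuming the repository root is on PYTHONPATH (so the package imports as `src.flux`), a CUDA device with enough memory, and access to the gated FLUX.1-dev weights on Hugging Face; the prompt and step count are arbitrary:

import torch

# Sketch only: downloads the FLUX.1-dev weights and text encoders on first run.
from src.flux.util import load_ae, load_clip, load_flow_model, load_t5
from src.flux.sampling import get_noise, get_schedule, prepare

device = "cuda"
t5 = load_t5(device, max_length=512)
clip = load_clip(device)
model = load_flow_model("flux-dev", device=device)
ae = load_ae("flux-dev", device=device)

width, height = 1024, 1024
x = get_noise(1, height, width, device=device, dtype=torch.bfloat16, seed=0)
inp = prepare(t5, clip, img=x, prompt="a photo of a forest with mist")
timesteps = get_schedule(25, inp["img"].shape[1], shift=True)
# `denoise` (or `denoise_controlnet`) from sampling.py runs the integration loop;
# `unpack` and the autoencoder's decoder then turn the latents back into pixels.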