Sunday01 committed
Commit: 9dce458
Parent: c7e2109
This view is limited to 50 files because the commit contains too many changes.
Files changed (50):
  1. .dockerignore +9 -0
  2. .gitattributes +2 -0
  3. .gitignore +41 -0
  4. CHANGELOG.md +111 -0
  5. CHANGELOG_CN.md +111 -0
  6. Dockerfile +65 -0
  7. LICENSE +674 -0
  8. Makefile +13 -0
  9. README_CN.md +413 -0
  10. devscripts/make_readme.py +98 -0
  11. devscripts/utils.py +42 -0
  12. docker_prepare.py +28 -0
  13. fonts/Arial-Unicode-Regular.ttf +3 -0
  14. fonts/anime_ace.ttf +3 -0
  15. fonts/anime_ace_3.ttf +3 -0
  16. fonts/comic shanns 2.ttf +3 -0
  17. fonts/msgothic.ttc +3 -0
  18. fonts/msyh.ttc +3 -0
  19. manga_translator/__init__.py +7 -0
  20. manga_translator/__main__.py +79 -0
  21. manga_translator/args.py +182 -0
  22. manga_translator/colorization/__init__.py +28 -0
  23. manga_translator/colorization/common.py +24 -0
  24. manga_translator/colorization/manga_colorization_v2.py +74 -0
  25. manga_translator/colorization/manga_colorization_v2_utils/denoising/denoiser.py +118 -0
  26. manga_translator/colorization/manga_colorization_v2_utils/denoising/functions.py +102 -0
  27. manga_translator/colorization/manga_colorization_v2_utils/denoising/models.py +100 -0
  28. manga_translator/colorization/manga_colorization_v2_utils/denoising/utils.py +66 -0
  29. manga_translator/colorization/manga_colorization_v2_utils/networks/extractor.py +127 -0
  30. manga_translator/colorization/manga_colorization_v2_utils/networks/models.py +319 -0
  31. manga_translator/colorization/manga_colorization_v2_utils/utils/utils.py +44 -0
  32. manga_translator/detection/__init__.py +37 -0
  33. manga_translator/detection/common.py +146 -0
  34. manga_translator/detection/craft.py +200 -0
  35. manga_translator/detection/craft_utils/refiner.py +65 -0
  36. manga_translator/detection/craft_utils/vgg16_bn.py +71 -0
  37. manga_translator/detection/ctd.py +186 -0
  38. manga_translator/detection/ctd_utils/__init__.py +5 -0
  39. manga_translator/detection/ctd_utils/basemodel.py +250 -0
  40. manga_translator/detection/ctd_utils/textmask.py +174 -0
  41. manga_translator/detection/ctd_utils/utils/db_utils.py +706 -0
  42. manga_translator/detection/ctd_utils/utils/imgproc_utils.py +180 -0
  43. manga_translator/detection/ctd_utils/utils/io_utils.py +54 -0
  44. manga_translator/detection/ctd_utils/utils/weight_init.py +103 -0
  45. manga_translator/detection/ctd_utils/utils/yolov5_utils.py +243 -0
  46. manga_translator/detection/ctd_utils/yolov5/common.py +289 -0
  47. manga_translator/detection/ctd_utils/yolov5/yolo.py +311 -0
  48. manga_translator/detection/dbnet_convnext.py +596 -0
  49. manga_translator/detection/default.py +103 -0
  50. manga_translator/detection/default_utils/CRAFT_resnet34.py +153 -0
.dockerignore ADDED
@@ -0,0 +1,9 @@
+ result
+ *.ckpt
+ *.pt
+ .vscode
+ *.onnx
+ __pycache__
+ ocrs
+ models/*
+ test/testdata/bboxes
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.ttc filter=lfs diff=lfs merge=lfs -text
+ *.ttf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,41 @@
+ result
+ *.ckpt
+ *.pt
+ .vscode
+ *.onnx
+ __pycache__
+ ocrs
+ Manga
+ Manga-translated
+ /models
+ .env
+ *.local
+ *.local.*
+ test/testdata
+ .idea
+ pyvenv.cfg
+ Scripts
+ Lib
+ include
+ share
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+ .history
CHANGELOG.md ADDED
@@ -0,0 +1,111 @@
+ # Changelogs
+
+ ### 2023-11-11
+
+ 1. Added new OCR model `48px`
+
+ ### 2023-05-08
+
+ 1. Added [4x-UltraSharp](https://mega.nz/folder/qZRBmaIY#nIG8KyWFcGNTuMX_XNbJ_g) upscaler
+
+ ### 2023-04-30
+
+ 1. Countless bug fixes and refactors
+ 2. Added the [CRAFT](https://github.com/clovaai/CRAFT-pytorch) detector, enabled with `--detector craft`
+
+ ### 2022-06-15
+
+ 1. Added the new inpainting model LaMa MPE by [dmMaze](https://github.com/dmMaze) and set it as default
+
+ ### 2022-04-23
+
+ Project version is now at beta-0.3
+
+ 1. Added an English text renderer by [dmMaze](https://github.com/dmMaze)
+ 2. Added a new CTC-based OCR engine with a significant speed improvement
+ 3. The new OCR model now supports Korean
+
+ ### 2022-03-19
+
+ 1. Switched to a new font rendering method by [pokedexter](https://github.com/pokedexter)
+ 2. Added a manual translation UI by [rspreet92](https://github.com/rspreet92)
+
+ ### 2022-01-24
+
+ 1. Added a text detection model by [dmMaze](https://github.com/dmMaze)
+
+ ### 2021-08-21
+
+ 1. New MST-based text region merge algorithm, a huge improvement to text region merging
+ 2. Added Baidu translator in demo mode
+ 3. Added Google translator in demo mode
+ 4. Various bug fixes
+
+ ### 2021-07-29
+
+ 1. Web demo adds translator, detection resolution and target language options
+ 2. Slight text color extraction improvement
+
+ ### 2021-07-26
+
+ Major upgrades for all components, we are now in beta! \
+ Note that in this version all English text is detected as capital letters. \
+ You need Python >= 3.8 for `cached_property` to work.
+
+ 1. Detection model upgrade
+ 2. OCR model upgrade, better at text color extraction
+ 3. Inpainting model upgrade
+ 4. Major text rendering improvement: faster rendering and higher quality text with shadow
+ 5. Slight mask generation improvement
+ 6. Various bug fixes
+ 7. Default detection resolution has been dialed back to 1536 from 2048
+
+ ### 2021-07-09
+
+ 1. Fixed erroneous image rendering when inpainting is not used
+
+ ### 2021-06-18
+
+ 1. Support manual translation
+ 2. Support detection and rendering of angled text
+
+ ### 2021-06-13
+
+ 1. Text mask completion is now based on CRF; mask quality is drastically improved
+
+ ### 2021-06-10
+
+ 1. Improved text rendering
+
+ ### 2021-06-09
+
+ 1. New text-region-based text direction detection method
+ 2. Support running the demo as a web service
+
+ ### 2021-05-20
+
+ 1. Text detection model is now based on DBNet with a ResNet34 backbone
+ 2. OCR model is now trained with more English sentences
+ 3. Inpainting model is now based on [AOT](https://arxiv.org/abs/2104.01431), which requires far less memory
+ 4. Default inpainting resolution has been increased to 2048, thanks to the new inpainting model
+ 5. Support merging hyphenated English words
+
+ ### 2021-05-11
+
+ 1. Added Youdao translate and set it as the default translator
+
+ ### 2021-05-06
+
+ 1. Text detection model is now based on DBNet with a ResNet101 backbone
+ 2. OCR model is now deeper
+ 3. Default detection resolution has been increased to 2048 from 1536
+
+ Note this version is slightly better at handling English text; other than that it is worse in every other way
+
+ ### 2021-03-04
+
+ 1. Added inpainting model
+
+ ### 2021-02-17
+
+ 1. First version launched
CHANGELOG_CN.md ADDED
@@ -0,0 +1,111 @@
+ # Changelog (Chinese)
+
+ ### 2023-11-11
+
+ 1. Added the new OCR model `48px`
+
+ ### 2023-05-08
+
+ 1. Added the [4x-UltraSharp](https://mega.nz/folder/qZRBmaIY#nIG8KyWFcGNTuMX_XNbJ_g) upscaler
+
+ ### 2023-04-30
+
+ 1. Countless bug fixes and refactoring
+ 2. Added the [CRAFT](https://github.com/clovaai/CRAFT-pytorch) text detector, enabled with `--detector craft`
+
+ ### 2022-06-15
+
+ 1. Added the LaMa MPE inpainting model by [dmMaze](https://github.com/dmMaze)
+
+ ### 2022-04-23
+
+ Project version bumped to beta-0.3
+
+ 1. Added an English text renderer by [dmMaze](https://github.com/dmMaze)
+ 2. Added a CTC-based OCR model with greatly improved recognition speed
+ 3. The new OCR model adds Korean recognition support
+
+ ### 2022-03-19
+
+ 1. Added a new text renderer by [pokedexter](https://github.com/pokedexter)
+ 2. Added a manual translation page by [rspreet92](https://github.com/rspreet92)
+
+ ### 2022-01-24
+
+ 1. Added a text detection model by [dmMaze](https://github.com/dmMaze)
+
+ ### 2021-08-21
+
+ 1. Updated the text region merge algorithm; text line merging is now close to perfect
+ 2. Added Baidu translation support in demo mode
+ 3. Added Google translation support in demo mode
+ 4. Various bug fixes
+
+ ### 2021-07-29
+
+ 1. The web version adds translator, resolution and target language options
+ 2. Slight improvement to text color extraction
+
+ ### 2021-07-26
+
+ All components have been upgraded substantially; the program is now in beta! \
+ Note: in this version all English detection outputs capital letters only. \
+ You need Python >= 3.8 to run it.
+
+ 1. Detection model upgrade
+ 2. OCR model upgrade, with much better text color extraction
+ 3. Inpainting model upgrade
+ 4. Text rendering upgrade: faster rendering with higher-quality text and text shadows
+ 5. Slight improvement to the text mask completion algorithm
+ 6. Various bug fixes
+ 7. Default detection resolution is 1536
+
+ ### 2021-07-09
+
+ 1. Fixed broken image output when inpainting is not used
+
+ ### 2021-06-18
+
+ 1. Added a manual translation option
+ 2. Support recognition and rendering of tilted text
+
+ ### 2021-06-13
+
+ 1. Text mask completion is now CRF-based, with greatly improved quality
+
+ ### 2021-06-10
+
+ 1. Improved text rendering
+
+ ### 2021-06-09
+
+ 1. Region-based text direction detection, greatly improving direction detection
+ 2. Added web service functionality
+
+ ### 2021-05-20
+
+ 1. Detection model updated to DBNet with a ResNet34 backbone
+ 2. OCR model trained with more English corpus data
+ 3. Inpainting model upgraded to an [AOT](https://arxiv.org/abs/2104.01431)-based model that uses less VRAM
+ 4. Default inpainting resolution increased to 2048
+ 5. Support merging English words split across lines
+
+ ### 2021-05-11
+
+ 1. Added Youdao translation and made it the default
+
+ ### 2021-05-06
+
+ 1. Detection model updated to DBNet with a ResNet101 backbone
+ 2. OCR model made deeper
+ 3. Default detection resolution increased to 2048
+
+ Note: apart from slightly better English detection, this version is worse than the previous one in every other way
+
+ ### 2021-03-04
+
+ 1. Added an inpainting model
+
+ ### 2021-02-17
+
+ 1. Initial release
Dockerfile ADDED
@@ -0,0 +1,65 @@
+ FROM pytorch/pytorch:latest
+
+ RUN useradd -m -u 1000 user
+
+ WORKDIR /app
+
+ RUN apt-get update
+ RUN DEBIAN_FRONTEND=noninteractive TZ=asia/shanghai apt-get -y install tzdata
+ # Set cache environment variables
+ ENV TRANSFORMERS_CACHE=/app/cache
+ ENV DEEPL_AUTH_KEY="6e4907cd-8926-42e7-aa5d-7561363c82b1:fx"
+ ENV OPENAI_API_KEY="sk-yuBWvBk2lTQoJFYP24A03515D46041429f907dE81cC3F04e"
+ ENV OPENAI_HTTP_PROXY="https://www.ygxdapi.top"
+ RUN mkdir -p /app/cache
+ # Assume root to install required dependencies
+ RUN apt-get install -y git g++ ffmpeg libsm6 libxext6 libvulkan-dev
+
+
+ # Install pip dependencies
+
+ COPY --chown=user requirements.txt /app/requirements.txt
+
+ RUN pip install -r /app/requirements.txt
+ RUN pip install torchvision --force-reinstall
+ RUN pip install "numpy<2.0"
+ # RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+
+ RUN apt-get remove -y g++ && \
+     apt-get autoremove -y
+
+ # Copy app
+ COPY --chown=user . /app
+
+ # Prepare models
+ RUN python -u docker_prepare.py
+
+ RUN rm -rf /tmp
+
+ # Add /app to Python module path
+ ENV PYTHONPATH="${PYTHONPATH}:/app"
+
+ WORKDIR /app
+ RUN mkdir -p /app/result && chmod 777 /app/result
+ RUN mkdir -p /app/models/translators && chmod 777 /app/models/translators
+ RUN mkdir -p /app/models/upscaling && chmod 777 /app/models/upscaling
+ RUN mkdir -p /app/cache/models && chmod 777 /app/cache/models
+ RUN mkdir -p /app/cache/.locks && chmod 777 /app/cache/.locks
+ RUN mkdir -p /app/cache/models--kha-white--manga-ocr-base && chmod 777 /app/cache/models--kha-white--manga-ocr-base
+ RUN mkdir -p /app && chmod 777 /app
+
+ ENTRYPOINT ["python", "-m", "manga_translator", "-v", "--mode", "web", "--host", "0.0.0.0", "--port", "7860", "--font-size", "28", "--font-size-offset", "5", "--unclip-ratio", "1.1", "--det-invert"]
+ # # ENTRYPOINT ["python", "-m", "manga_translator", "-v", "--mode", "web", "--host", "0.0.0.0", "--port", "7860", "--use-cuda", "--use-inpainting"]
+
+
+ # Use the specified base image
+ # FROM zyddnys/manga-image-translator:main
+
+ # Copy the required files into the container
+ # COPY ./../../translate_demo.py /app/translate_demo.py
+
+ # # Expose the port
+ # EXPOSE 7860
+
+ # # Run command
+ # CMD ["--verbose", "--log-web", "--mode", "web", "--use-inpainting", "--use-cuda", "--host=0.0.0.0", "--port=7860"]
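For local testing, a minimal build-and-run sketch consistent with the ENTRYPOINT above could look like this; the image tag is an arbitrary placeholder:

```bash
# Build the image defined by this Dockerfile and expose the web UI on port 7860.
docker build -t manga-image-translator-space .
docker run --rm -p 7860:7860 manga-image-translator-space
```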
LICENSE ADDED
@@ -0,0 +1,674 @@
1
+ GNU GENERAL PUBLIC LICENSE
2
+ Version 3, 29 June 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+ Preamble
9
+
10
+ The GNU General Public License is a free, copyleft license for
11
+ software and other kinds of works.
12
+
13
+ The licenses for most software and other practical works are designed
14
+ to take away your freedom to share and change the works. By contrast,
15
+ the GNU General Public License is intended to guarantee your freedom to
16
+ share and change all versions of a program--to make sure it remains free
17
+ software for all its users. We, the Free Software Foundation, use the
18
+ GNU General Public License for most of our software; it applies also to
19
+ any other work released this way by its authors. You can apply it to
20
+ your programs, too.
21
+
22
+ When we speak of free software, we are referring to freedom, not
23
+ price. Our General Public Licenses are designed to make sure that you
24
+ have the freedom to distribute copies of free software (and charge for
25
+ them if you wish), that you receive source code or can get it if you
26
+ want it, that you can change the software or use pieces of it in new
27
+ free programs, and that you know you can do these things.
28
+
29
+ To protect your rights, we need to prevent others from denying you
30
+ these rights or asking you to surrender the rights. Therefore, you have
31
+ certain responsibilities if you distribute copies of the software, or if
32
+ you modify it: responsibilities to respect the freedom of others.
33
+
34
+ For example, if you distribute copies of such a program, whether
35
+ gratis or for a fee, you must pass on to the recipients the same
36
+ freedoms that you received. You must make sure that they, too, receive
37
+ or can get the source code. And you must show them these terms so they
38
+ know their rights.
39
+
40
+ Developers that use the GNU GPL protect your rights with two steps:
41
+ (1) assert copyright on the software, and (2) offer you this License
42
+ giving you legal permission to copy, distribute and/or modify it.
43
+
44
+ For the developers' and authors' protection, the GPL clearly explains
45
+ that there is no warranty for this free software. For both users' and
46
+ authors' sake, the GPL requires that modified versions be marked as
47
+ changed, so that their problems will not be attributed erroneously to
48
+ authors of previous versions.
49
+
50
+ Some devices are designed to deny users access to install or run
51
+ modified versions of the software inside them, although the manufacturer
52
+ can do so. This is fundamentally incompatible with the aim of
53
+ protecting users' freedom to change the software. The systematic
54
+ pattern of such abuse occurs in the area of products for individuals to
55
+ use, which is precisely where it is most unacceptable. Therefore, we
56
+ have designed this version of the GPL to prohibit the practice for those
57
+ products. If such problems arise substantially in other domains, we
58
+ stand ready to extend this provision to those domains in future versions
59
+ of the GPL, as needed to protect the freedom of users.
60
+
61
+ Finally, every program is threatened constantly by software patents.
62
+ States should not allow patents to restrict development and use of
63
+ software on general-purpose computers, but in those that do, we wish to
64
+ avoid the special danger that patents applied to a free program could
65
+ make it effectively proprietary. To prevent this, the GPL assures that
66
+ patents cannot be used to render the program non-free.
67
+
68
+ The precise terms and conditions for copying, distribution and
69
+ modification follow.
70
+
71
+ TERMS AND CONDITIONS
72
+
73
+ 0. Definitions.
74
+
75
+ "This License" refers to version 3 of the GNU General Public License.
76
+
77
+ "Copyright" also means copyright-like laws that apply to other kinds of
78
+ works, such as semiconductor masks.
79
+
80
+ "The Program" refers to any copyrightable work licensed under this
81
+ License. Each licensee is addressed as "you". "Licensees" and
82
+ "recipients" may be individuals or organizations.
83
+
84
+ To "modify" a work means to copy from or adapt all or part of the work
85
+ in a fashion requiring copyright permission, other than the making of an
86
+ exact copy. The resulting work is called a "modified version" of the
87
+ earlier work or a work "based on" the earlier work.
88
+
89
+ A "covered work" means either the unmodified Program or a work based
90
+ on the Program.
91
+
92
+ To "propagate" a work means to do anything with it that, without
93
+ permission, would make you directly or secondarily liable for
94
+ infringement under applicable copyright law, except executing it on a
95
+ computer or modifying a private copy. Propagation includes copying,
96
+ distribution (with or without modification), making available to the
97
+ public, and in some countries other activities as well.
98
+
99
+ To "convey" a work means any kind of propagation that enables other
100
+ parties to make or receive copies. Mere interaction with a user through
101
+ a computer network, with no transfer of a copy, is not conveying.
102
+
103
+ An interactive user interface displays "Appropriate Legal Notices"
104
+ to the extent that it includes a convenient and prominently visible
105
+ feature that (1) displays an appropriate copyright notice, and (2)
106
+ tells the user that there is no warranty for the work (except to the
107
+ extent that warranties are provided), that licensees may convey the
108
+ work under this License, and how to view a copy of this License. If
109
+ the interface presents a list of user commands or options, such as a
110
+ menu, a prominent item in the list meets this criterion.
111
+
112
+ 1. Source Code.
113
+
114
+ The "source code" for a work means the preferred form of the work
115
+ for making modifications to it. "Object code" means any non-source
116
+ form of a work.
117
+
118
+ A "Standard Interface" means an interface that either is an official
119
+ standard defined by a recognized standards body, or, in the case of
120
+ interfaces specified for a particular programming language, one that
121
+ is widely used among developers working in that language.
122
+
123
+ The "System Libraries" of an executable work include anything, other
124
+ than the work as a whole, that (a) is included in the normal form of
125
+ packaging a Major Component, but which is not part of that Major
126
+ Component, and (b) serves only to enable use of the work with that
127
+ Major Component, or to implement a Standard Interface for which an
128
+ implementation is available to the public in source code form. A
129
+ "Major Component", in this context, means a major essential component
130
+ (kernel, window system, and so on) of the specific operating system
131
+ (if any) on which the executable work runs, or a compiler used to
132
+ produce the work, or an object code interpreter used to run it.
133
+
134
+ The "Corresponding Source" for a work in object code form means all
135
+ the source code needed to generate, install, and (for an executable
136
+ work) run the object code and to modify the work, including scripts to
137
+ control those activities. However, it does not include the work's
138
+ System Libraries, or general-purpose tools or generally available free
139
+ programs which are used unmodified in performing those activities but
140
+ which are not part of the work. For example, Corresponding Source
141
+ includes interface definition files associated with source files for
142
+ the work, and the source code for shared libraries and dynamically
143
+ linked subprograms that the work is specifically designed to require,
144
+ such as by intimate data communication or control flow between those
145
+ subprograms and other parts of the work.
146
+
147
+ The Corresponding Source need not include anything that users
148
+ can regenerate automatically from other parts of the Corresponding
149
+ Source.
150
+
151
+ The Corresponding Source for a work in source code form is that
152
+ same work.
153
+
154
+ 2. Basic Permissions.
155
+
156
+ All rights granted under this License are granted for the term of
157
+ copyright on the Program, and are irrevocable provided the stated
158
+ conditions are met. This License explicitly affirms your unlimited
159
+ permission to run the unmodified Program. The output from running a
160
+ covered work is covered by this License only if the output, given its
161
+ content, constitutes a covered work. This License acknowledges your
162
+ rights of fair use or other equivalent, as provided by copyright law.
163
+
164
+ You may make, run and propagate covered works that you do not
165
+ convey, without conditions so long as your license otherwise remains
166
+ in force. You may convey covered works to others for the sole purpose
167
+ of having them make modifications exclusively for you, or provide you
168
+ with facilities for running those works, provided that you comply with
169
+ the terms of this License in conveying all material for which you do
170
+ not control copyright. Those thus making or running the covered works
171
+ for you must do so exclusively on your behalf, under your direction
172
+ and control, on terms that prohibit them from making any copies of
173
+ your copyrighted material outside their relationship with you.
174
+
175
+ Conveying under any other circumstances is permitted solely under
176
+ the conditions stated below. Sublicensing is not allowed; section 10
177
+ makes it unnecessary.
178
+
179
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180
+
181
+ No covered work shall be deemed part of an effective technological
182
+ measure under any applicable law fulfilling obligations under article
183
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184
+ similar laws prohibiting or restricting circumvention of such
185
+ measures.
186
+
187
+ When you convey a covered work, you waive any legal power to forbid
188
+ circumvention of technological measures to the extent such circumvention
189
+ is effected by exercising rights under this License with respect to
190
+ the covered work, and you disclaim any intention to limit operation or
191
+ modification of the work as a means of enforcing, against the work's
192
+ users, your or third parties' legal rights to forbid circumvention of
193
+ technological measures.
194
+
195
+ 4. Conveying Verbatim Copies.
196
+
197
+ You may convey verbatim copies of the Program's source code as you
198
+ receive it, in any medium, provided that you conspicuously and
199
+ appropriately publish on each copy an appropriate copyright notice;
200
+ keep intact all notices stating that this License and any
201
+ non-permissive terms added in accord with section 7 apply to the code;
202
+ keep intact all notices of the absence of any warranty; and give all
203
+ recipients a copy of this License along with the Program.
204
+
205
+ You may charge any price or no price for each copy that you convey,
206
+ and you may offer support or warranty protection for a fee.
207
+
208
+ 5. Conveying Modified Source Versions.
209
+
210
+ You may convey a work based on the Program, or the modifications to
211
+ produce it from the Program, in the form of source code under the
212
+ terms of section 4, provided that you also meet all of these conditions:
213
+
214
+ a) The work must carry prominent notices stating that you modified
215
+ it, and giving a relevant date.
216
+
217
+ b) The work must carry prominent notices stating that it is
218
+ released under this License and any conditions added under section
219
+ 7. This requirement modifies the requirement in section 4 to
220
+ "keep intact all notices".
221
+
222
+ c) You must license the entire work, as a whole, under this
223
+ License to anyone who comes into possession of a copy. This
224
+ License will therefore apply, along with any applicable section 7
225
+ additional terms, to the whole of the work, and all its parts,
226
+ regardless of how they are packaged. This License gives no
227
+ permission to license the work in any other way, but it does not
228
+ invalidate such permission if you have separately received it.
229
+
230
+ d) If the work has interactive user interfaces, each must display
231
+ Appropriate Legal Notices; however, if the Program has interactive
232
+ interfaces that do not display Appropriate Legal Notices, your
233
+ work need not make them do so.
234
+
235
+ A compilation of a covered work with other separate and independent
236
+ works, which are not by their nature extensions of the covered work,
237
+ and which are not combined with it such as to form a larger program,
238
+ in or on a volume of a storage or distribution medium, is called an
239
+ "aggregate" if the compilation and its resulting copyright are not
240
+ used to limit the access or legal rights of the compilation's users
241
+ beyond what the individual works permit. Inclusion of a covered work
242
+ in an aggregate does not cause this License to apply to the other
243
+ parts of the aggregate.
244
+
245
+ 6. Conveying Non-Source Forms.
246
+
247
+ You may convey a covered work in object code form under the terms
248
+ of sections 4 and 5, provided that you also convey the
249
+ machine-readable Corresponding Source under the terms of this License,
250
+ in one of these ways:
251
+
252
+ a) Convey the object code in, or embodied in, a physical product
253
+ (including a physical distribution medium), accompanied by the
254
+ Corresponding Source fixed on a durable physical medium
255
+ customarily used for software interchange.
256
+
257
+ b) Convey the object code in, or embodied in, a physical product
258
+ (including a physical distribution medium), accompanied by a
259
+ written offer, valid for at least three years and valid for as
260
+ long as you offer spare parts or customer support for that product
261
+ model, to give anyone who possesses the object code either (1) a
262
+ copy of the Corresponding Source for all the software in the
263
+ product that is covered by this License, on a durable physical
264
+ medium customarily used for software interchange, for a price no
265
+ more than your reasonable cost of physically performing this
266
+ conveying of source, or (2) access to copy the
267
+ Corresponding Source from a network server at no charge.
268
+
269
+ c) Convey individual copies of the object code with a copy of the
270
+ written offer to provide the Corresponding Source. This
271
+ alternative is allowed only occasionally and noncommercially, and
272
+ only if you received the object code with such an offer, in accord
273
+ with subsection 6b.
274
+
275
+ d) Convey the object code by offering access from a designated
276
+ place (gratis or for a charge), and offer equivalent access to the
277
+ Corresponding Source in the same way through the same place at no
278
+ further charge. You need not require recipients to copy the
279
+ Corresponding Source along with the object code. If the place to
280
+ copy the object code is a network server, the Corresponding Source
281
+ may be on a different server (operated by you or a third party)
282
+ that supports equivalent copying facilities, provided you maintain
283
+ clear directions next to the object code saying where to find the
284
+ Corresponding Source. Regardless of what server hosts the
285
+ Corresponding Source, you remain obligated to ensure that it is
286
+ available for as long as needed to satisfy these requirements.
287
+
288
+ e) Convey the object code using peer-to-peer transmission, provided
289
+ you inform other peers where the object code and Corresponding
290
+ Source of the work are being offered to the general public at no
291
+ charge under subsection 6d.
292
+
293
+ A separable portion of the object code, whose source code is excluded
294
+ from the Corresponding Source as a System Library, need not be
295
+ included in conveying the object code work.
296
+
297
+ A "User Product" is either (1) a "consumer product", which means any
298
+ tangible personal property which is normally used for personal, family,
299
+ or household purposes, or (2) anything designed or sold for incorporation
300
+ into a dwelling. In determining whether a product is a consumer product,
301
+ doubtful cases shall be resolved in favor of coverage. For a particular
302
+ product received by a particular user, "normally used" refers to a
303
+ typical or common use of that class of product, regardless of the status
304
+ of the particular user or of the way in which the particular user
305
+ actually uses, or expects or is expected to use, the product. A product
306
+ is a consumer product regardless of whether the product has substantial
307
+ commercial, industrial or non-consumer uses, unless such uses represent
308
+ the only significant mode of use of the product.
309
+
310
+ "Installation Information" for a User Product means any methods,
311
+ procedures, authorization keys, or other information required to install
312
+ and execute modified versions of a covered work in that User Product from
313
+ a modified version of its Corresponding Source. The information must
314
+ suffice to ensure that the continued functioning of the modified object
315
+ code is in no case prevented or interfered with solely because
316
+ modification has been made.
317
+
318
+ If you convey an object code work under this section in, or with, or
319
+ specifically for use in, a User Product, and the conveying occurs as
320
+ part of a transaction in which the right of possession and use of the
321
+ User Product is transferred to the recipient in perpetuity or for a
322
+ fixed term (regardless of how the transaction is characterized), the
323
+ Corresponding Source conveyed under this section must be accompanied
324
+ by the Installation Information. But this requirement does not apply
325
+ if neither you nor any third party retains the ability to install
326
+ modified object code on the User Product (for example, the work has
327
+ been installed in ROM).
328
+
329
+ The requirement to provide Installation Information does not include a
330
+ requirement to continue to provide support service, warranty, or updates
331
+ for a work that has been modified or installed by the recipient, or for
332
+ the User Product in which it has been modified or installed. Access to a
333
+ network may be denied when the modification itself materially and
334
+ adversely affects the operation of the network or violates the rules and
335
+ protocols for communication across the network.
336
+
337
+ Corresponding Source conveyed, and Installation Information provided,
338
+ in accord with this section must be in a format that is publicly
339
+ documented (and with an implementation available to the public in
340
+ source code form), and must require no special password or key for
341
+ unpacking, reading or copying.
342
+
343
+ 7. Additional Terms.
344
+
345
+ "Additional permissions" are terms that supplement the terms of this
346
+ License by making exceptions from one or more of its conditions.
347
+ Additional permissions that are applicable to the entire Program shall
348
+ be treated as though they were included in this License, to the extent
349
+ that they are valid under applicable law. If additional permissions
350
+ apply only to part of the Program, that part may be used separately
351
+ under those permissions, but the entire Program remains governed by
352
+ this License without regard to the additional permissions.
353
+
354
+ When you convey a copy of a covered work, you may at your option
355
+ remove any additional permissions from that copy, or from any part of
356
+ it. (Additional permissions may be written to require their own
357
+ removal in certain cases when you modify the work.) You may place
358
+ additional permissions on material, added by you to a covered work,
359
+ for which you have or can give appropriate copyright permission.
360
+
361
+ Notwithstanding any other provision of this License, for material you
362
+ add to a covered work, you may (if authorized by the copyright holders of
363
+ that material) supplement the terms of this License with terms:
364
+
365
+ a) Disclaiming warranty or limiting liability differently from the
366
+ terms of sections 15 and 16 of this License; or
367
+
368
+ b) Requiring preservation of specified reasonable legal notices or
369
+ author attributions in that material or in the Appropriate Legal
370
+ Notices displayed by works containing it; or
371
+
372
+ c) Prohibiting misrepresentation of the origin of that material, or
373
+ requiring that modified versions of such material be marked in
374
+ reasonable ways as different from the original version; or
375
+
376
+ d) Limiting the use for publicity purposes of names of licensors or
377
+ authors of the material; or
378
+
379
+ e) Declining to grant rights under trademark law for use of some
380
+ trade names, trademarks, or service marks; or
381
+
382
+ f) Requiring indemnification of licensors and authors of that
383
+ material by anyone who conveys the material (or modified versions of
384
+ it) with contractual assumptions of liability to the recipient, for
385
+ any liability that these contractual assumptions directly impose on
386
+ those licensors and authors.
387
+
388
+ All other non-permissive additional terms are considered "further
389
+ restrictions" within the meaning of section 10. If the Program as you
390
+ received it, or any part of it, contains a notice stating that it is
391
+ governed by this License along with a term that is a further
392
+ restriction, you may remove that term. If a license document contains
393
+ a further restriction but permits relicensing or conveying under this
394
+ License, you may add to a covered work material governed by the terms
395
+ of that license document, provided that the further restriction does
396
+ not survive such relicensing or conveying.
397
+
398
+ If you add terms to a covered work in accord with this section, you
399
+ must place, in the relevant source files, a statement of the
400
+ additional terms that apply to those files, or a notice indicating
401
+ where to find the applicable terms.
402
+
403
+ Additional terms, permissive or non-permissive, may be stated in the
404
+ form of a separately written license, or stated as exceptions;
405
+ the above requirements apply either way.
406
+
407
+ 8. Termination.
408
+
409
+ You may not propagate or modify a covered work except as expressly
410
+ provided under this License. Any attempt otherwise to propagate or
411
+ modify it is void, and will automatically terminate your rights under
412
+ this License (including any patent licenses granted under the third
413
+ paragraph of section 11).
414
+
415
+ However, if you cease all violation of this License, then your
416
+ license from a particular copyright holder is reinstated (a)
417
+ provisionally, unless and until the copyright holder explicitly and
418
+ finally terminates your license, and (b) permanently, if the copyright
419
+ holder fails to notify you of the violation by some reasonable means
420
+ prior to 60 days after the cessation.
421
+
422
+ Moreover, your license from a particular copyright holder is
423
+ reinstated permanently if the copyright holder notifies you of the
424
+ violation by some reasonable means, this is the first time you have
425
+ received notice of violation of this License (for any work) from that
426
+ copyright holder, and you cure the violation prior to 30 days after
427
+ your receipt of the notice.
428
+
429
+ Termination of your rights under this section does not terminate the
430
+ licenses of parties who have received copies or rights from you under
431
+ this License. If your rights have been terminated and not permanently
432
+ reinstated, you do not qualify to receive new licenses for the same
433
+ material under section 10.
434
+
435
+ 9. Acceptance Not Required for Having Copies.
436
+
437
+ You are not required to accept this License in order to receive or
438
+ run a copy of the Program. Ancillary propagation of a covered work
439
+ occurring solely as a consequence of using peer-to-peer transmission
440
+ to receive a copy likewise does not require acceptance. However,
441
+ nothing other than this License grants you permission to propagate or
442
+ modify any covered work. These actions infringe copyright if you do
443
+ not accept this License. Therefore, by modifying or propagating a
444
+ covered work, you indicate your acceptance of this License to do so.
445
+
446
+ 10. Automatic Licensing of Downstream Recipients.
447
+
448
+ Each time you convey a covered work, the recipient automatically
449
+ receives a license from the original licensors, to run, modify and
450
+ propagate that work, subject to this License. You are not responsible
451
+ for enforcing compliance by third parties with this License.
452
+
453
+ An "entity transaction" is a transaction transferring control of an
454
+ organization, or substantially all assets of one, or subdividing an
455
+ organization, or merging organizations. If propagation of a covered
456
+ work results from an entity transaction, each party to that
457
+ transaction who receives a copy of the work also receives whatever
458
+ licenses to the work the party's predecessor in interest had or could
459
+ give under the previous paragraph, plus a right to possession of the
460
+ Corresponding Source of the work from the predecessor in interest, if
461
+ the predecessor has it or can get it with reasonable efforts.
462
+
463
+ You may not impose any further restrictions on the exercise of the
464
+ rights granted or affirmed under this License. For example, you may
465
+ not impose a license fee, royalty, or other charge for exercise of
466
+ rights granted under this License, and you may not initiate litigation
467
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
468
+ any patent claim is infringed by making, using, selling, offering for
469
+ sale, or importing the Program or any portion of it.
470
+
471
+ 11. Patents.
472
+
473
+ A "contributor" is a copyright holder who authorizes use under this
474
+ License of the Program or a work on which the Program is based. The
475
+ work thus licensed is called the contributor's "contributor version".
476
+
477
+ A contributor's "essential patent claims" are all patent claims
478
+ owned or controlled by the contributor, whether already acquired or
479
+ hereafter acquired, that would be infringed by some manner, permitted
480
+ by this License, of making, using, or selling its contributor version,
481
+ but do not include claims that would be infringed only as a
482
+ consequence of further modification of the contributor version. For
483
+ purposes of this definition, "control" includes the right to grant
484
+ patent sublicenses in a manner consistent with the requirements of
485
+ this License.
486
+
487
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
488
+ patent license under the contributor's essential patent claims, to
489
+ make, use, sell, offer for sale, import and otherwise run, modify and
490
+ propagate the contents of its contributor version.
491
+
492
+ In the following three paragraphs, a "patent license" is any express
493
+ agreement or commitment, however denominated, not to enforce a patent
494
+ (such as an express permission to practice a patent or covenant not to
495
+ sue for patent infringement). To "grant" such a patent license to a
496
+ party means to make such an agreement or commitment not to enforce a
497
+ patent against the party.
498
+
499
+ If you convey a covered work, knowingly relying on a patent license,
500
+ and the Corresponding Source of the work is not available for anyone
501
+ to copy, free of charge and under the terms of this License, through a
502
+ publicly available network server or other readily accessible means,
503
+ then you must either (1) cause the Corresponding Source to be so
504
+ available, or (2) arrange to deprive yourself of the benefit of the
505
+ patent license for this particular work, or (3) arrange, in a manner
506
+ consistent with the requirements of this License, to extend the patent
507
+ license to downstream recipients. "Knowingly relying" means you have
508
+ actual knowledge that, but for the patent license, your conveying the
509
+ covered work in a country, or your recipient's use of the covered work
510
+ in a country, would infringe one or more identifiable patents in that
511
+ country that you have reason to believe are valid.
512
+
513
+ If, pursuant to or in connection with a single transaction or
514
+ arrangement, you convey, or propagate by procuring conveyance of, a
515
+ covered work, and grant a patent license to some of the parties
516
+ receiving the covered work authorizing them to use, propagate, modify
517
+ or convey a specific copy of the covered work, then the patent license
518
+ you grant is automatically extended to all recipients of the covered
519
+ work and works based on it.
520
+
521
+ A patent license is "discriminatory" if it does not include within
522
+ the scope of its coverage, prohibits the exercise of, or is
523
+ conditioned on the non-exercise of one or more of the rights that are
524
+ specifically granted under this License. You may not convey a covered
525
+ work if you are a party to an arrangement with a third party that is
526
+ in the business of distributing software, under which you make payment
527
+ to the third party based on the extent of your activity of conveying
528
+ the work, and under which the third party grants, to any of the
529
+ parties who would receive the covered work from you, a discriminatory
530
+ patent license (a) in connection with copies of the covered work
531
+ conveyed by you (or copies made from those copies), or (b) primarily
532
+ for and in connection with specific products or compilations that
533
+ contain the covered work, unless you entered into that arrangement,
534
+ or that patent license was granted, prior to 28 March 2007.
535
+
536
+ Nothing in this License shall be construed as excluding or limiting
537
+ any implied license or other defenses to infringement that may
538
+ otherwise be available to you under applicable patent law.
539
+
540
+ 12. No Surrender of Others' Freedom.
541
+
542
+ If conditions are imposed on you (whether by court order, agreement or
543
+ otherwise) that contradict the conditions of this License, they do not
544
+ excuse you from the conditions of this License. If you cannot convey a
545
+ covered work so as to satisfy simultaneously your obligations under this
546
+ License and any other pertinent obligations, then as a consequence you may
547
+ not convey it at all. For example, if you agree to terms that obligate you
548
+ to collect a royalty for further conveying from those to whom you convey
549
+ the Program, the only way you could satisfy both those terms and this
550
+ License would be to refrain entirely from conveying the Program.
551
+
552
+ 13. Use with the GNU Affero General Public License.
553
+
554
+ Notwithstanding any other provision of this License, you have
555
+ permission to link or combine any covered work with a work licensed
556
+ under version 3 of the GNU Affero General Public License into a single
557
+ combined work, and to convey the resulting work. The terms of this
558
+ License will continue to apply to the part which is the covered work,
559
+ but the special requirements of the GNU Affero General Public License,
560
+ section 13, concerning interaction through a network will apply to the
561
+ combination as such.
562
+
563
+ 14. Revised Versions of this License.
564
+
565
+ The Free Software Foundation may publish revised and/or new versions of
566
+ the GNU General Public License from time to time. Such new versions will
567
+ be similar in spirit to the present version, but may differ in detail to
568
+ address new problems or concerns.
569
+
570
+ Each version is given a distinguishing version number. If the
571
+ Program specifies that a certain numbered version of the GNU General
572
+ Public License "or any later version" applies to it, you have the
573
+ option of following the terms and conditions either of that numbered
574
+ version or of any later version published by the Free Software
575
+ Foundation. If the Program does not specify a version number of the
576
+ GNU General Public License, you may choose any version ever published
577
+ by the Free Software Foundation.
578
+
579
+ If the Program specifies that a proxy can decide which future
580
+ versions of the GNU General Public License can be used, that proxy's
581
+ public statement of acceptance of a version permanently authorizes you
582
+ to choose that version for the Program.
583
+
584
+ Later license versions may give you additional or different
585
+ permissions. However, no additional obligations are imposed on any
586
+ author or copyright holder as a result of your choosing to follow a
587
+ later version.
588
+
589
+ 15. Disclaimer of Warranty.
590
+
591
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599
+
600
+ 16. Limitation of Liability.
601
+
602
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610
+ SUCH DAMAGES.
611
+
612
+ 17. Interpretation of Sections 15 and 16.
613
+
614
+ If the disclaimer of warranty and limitation of liability provided
615
+ above cannot be given local legal effect according to their terms,
616
+ reviewing courts shall apply local law that most closely approximates
617
+ an absolute waiver of all civil liability in connection with the
618
+ Program, unless a warranty or assumption of liability accompanies a
619
+ copy of the Program in return for a fee.
620
+
621
+ END OF TERMS AND CONDITIONS
622
+
623
+ How to Apply These Terms to Your New Programs
624
+
625
+ If you develop a new program, and you want it to be of the greatest
626
+ possible use to the public, the best way to achieve this is to make it
627
+ free software which everyone can redistribute and change under these terms.
628
+
629
+ To do so, attach the following notices to the program. It is safest
630
+ to attach them to the start of each source file to most effectively
631
+ state the exclusion of warranty; and each file should have at least
632
+ the "copyright" line and a pointer to where the full notice is found.
633
+
634
+ <one line to give the program's name and a brief idea of what it does.>
635
+ Copyright (C) <year> <name of author>
636
+
637
+ This program is free software: you can redistribute it and/or modify
638
+ it under the terms of the GNU General Public License as published by
639
+ the Free Software Foundation, either version 3 of the License, or
640
+ (at your option) any later version.
641
+
642
+ This program is distributed in the hope that it will be useful,
643
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
644
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645
+ GNU General Public License for more details.
646
+
647
+ You should have received a copy of the GNU General Public License
648
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
649
+
650
+ Also add information on how to contact you by electronic and paper mail.
651
+
652
+ If the program does terminal interaction, make it output a short
653
+ notice like this when it starts in an interactive mode:
654
+
655
+ <program> Copyright (C) <year> <name of author>
656
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657
+ This is free software, and you are welcome to redistribute it
658
+ under certain conditions; type `show c' for details.
659
+
660
+ The hypothetical commands `show w' and `show c' should show the appropriate
661
+ parts of the General Public License. Of course, your program's commands
662
+ might be different; for a GUI interface, you would use an "about box".
663
+
664
+ You should also get your employer (if you work as a programmer) or school,
665
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
666
+ For more information on this, and how to apply and follow the GNU GPL, see
667
+ <https://www.gnu.org/licenses/>.
668
+
669
+ The GNU General Public License does not permit incorporating your program
670
+ into proprietary programs. If your program is a subroutine library, you
671
+ may consider it more useful to permit linking proprietary applications with
672
+ the library. If this is what you want to do, use the GNU Lesser General
673
+ Public License instead of this License. But first, please read
674
+ <https://www.gnu.org/licenses/why-not-lgpl.html>.
Makefile ADDED
@@ -0,0 +1,13 @@
+ build-image:
+ 	docker rmi manga-image-translator || true
+ 	docker build . --tag=manga-image-translator
+
+ run-web-server:
+ 	docker run --gpus all -p 5003:5003 --ipc=host --rm zyddnys/manga-image-translator:main \
+ 		--target-lang=ENG \
+ 		--manga2eng \
+ 		--verbose \
+ 		--mode=web \
+ 		--use-gpu \
+ 		--host=0.0.0.0 \
+ 		--port=5003
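For reference, the two targets are typically invoked as follows; note that `run-web-server`, as written above, runs the published `zyddnys/manga-image-translator:main` image rather than the locally built tag:

```bash
# Build the local image, then start the dockerized web server on port 5003.
make build-image
make run-web-server
```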
README_CN.md ADDED
@@ -0,0 +1,413 @@
+ # Manga Image Translator (Chinese README)
+
+ > Translate the text in all kinds of images with one click\
+ > [English](README.md) | [Changelog](CHANGELOG_CN.md) \
+ > Join our Discord <https://discord.gg/Ak8APNy4vb>
+
+ Designed for the flood of images in group chats and on image boards that are unlikely to ever be translated by a human, so that a Japanese beginner like me can roughly understand them\
+ Mainly supports Japanese, plus Chinese, English and Korean\
+ Supports inpainting and text rendering\
+ This project is the v2 version of [求闻转译志](https://github.com/PatchyVideo/MMDOCR-HighPerformance)
+
+ **This is only an early version and we need your help to improve it**\
+ The project is still a simple demo with plenty of rough edges; we need your help to make it better!
+
+ ## Support us
+
+ Please help us pay for GPU servers, thank you!
+
+ - Ko-fi: <https://ko-fi.com/voilelabs>
+ - Patreon: <https://www.patreon.com/voilelabs>
+ - Afdian (爱发电): <https://afdian.net/@voilelabs>
+
+ ## Online version
+
+ Official demo (maintained by zyddnys): <https://cotrans.touhou.ai/>\
+ Mirror (maintained by Eidenz): <https://manga.eidenz.com/>\
+ Userscript (maintained by QiroNT): <https://greasyfork.org/scripts/437569>
+
+ - Note: if the online version is unreachable, Google GCP is probably restarting my server again; please wait for the service to come back up.
+ - The online version runs the latest code from the main branch.
+
+ ## Usage
+
+ ```bash
+ # First, make sure Python 3.8 or later and the corresponding build tools are installed
+ $ python --version
+ Python 3.8.13
+
+ # Clone the repository
+ $ git clone https://github.com/zyddnys/manga-image-translator.git
+
+ # Install dependencies
+ $ pip install -r requirements.txt
+ ```
+
+ Note: `pydensecrf` and some other pip packages may require your operating system's build tools (such as build-essential on Debian).
+
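On Debian or Ubuntu, for example, installing the usual compiler toolchain before running `pip install` typically looks like the sketch below; package names on other distributions will differ:

```bash
# Debian/Ubuntu example; adjust the package name for your distribution.
sudo apt-get update
sudo apt-get install -y build-essential
```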
+ [Optional if you use Google Translate]\
+ Apply for a Youdao Translate or DeepL API account and write your `APP_KEY` and `APP_SECRET` or `AUTH_KEY` into `translators/key.py` (see the sketch after the table below).
+
+ ### Translator list
+
+ | Name | Needs API key | Works offline | Notes |
+ | -------------- | ------- | ------- | ----------------------------------------------------- |
+ | google | | | |
+ | youdao | ✔️ | | Requires `YOUDAO_APP_KEY` and `YOUDAO_SECRET_KEY` |
+ | baidu | ✔️ | | Requires `BAIDU_APP_ID` and `BAIDU_SECRET_KEY` |
+ | deepl | ✔️ | | Requires `DEEPL_AUTH_KEY` |
+ | caiyun | ✔️ | | Requires `CAIYUN_TOKEN` |
+ | gpt3 | ✔️ | | Implements text-davinci-003. Requires `OPENAI_API_KEY` |
+ | gpt3.5 | ✔️ | | Implements gpt-3.5-turbo. Requires `OPENAI_API_KEY` |
+ | gpt4 | ✔️ | | Implements gpt-4. Requires `OPENAI_API_KEY` |
+ | papago | | | |
+ | sakura | | | Requires `SAKURA_API_BASE` |
+ | offline | | ✔️ | Automatically selects an available offline model; it is only a selector |
+ | sugoi | | ✔️ | Can only translate into English |
+ | m2m100 | | ✔️ | Can translate between all supported languages |
+ | m2m100_big | | ✔️ | The `big` variant is the full-size model; the plain one is the slimmed-down version |
+ | none | | ✔️ | Outputs empty text |
+ | mbart50 | | ✔️ | |
+ | original | | ✔️ | Outputs the source text unchanged |
+
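The Dockerfile in this commit supplies `DEEPL_AUTH_KEY` and `OPENAI_API_KEY` through environment variables, so a sketch for local use might look like the following; the key names come from the table above and the values are placeholders:

```bash
# Placeholder credentials; substitute the keys issued by each service you enable.
export YOUDAO_APP_KEY="your-youdao-app-key"
export YOUDAO_SECRET_KEY="your-youdao-secret-key"
export DEEPL_AUTH_KEY="your-deepl-auth-key"
export OPENAI_API_KEY="your-openai-api-key"
```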
+ ### Language code list
+
+ These codes can be passed to the `--target-lang` option; a sample invocation follows the list
+
+ ```yaml
+ CHS: Chinese (Simplified)
+ CHT: Chinese (Traditional)
+ CSY: Czech
+ NLD: Dutch
+ ENG: English
+ FRA: French
+ DEU: German
+ HUN: Hungarian
+ ITA: Italian
+ JPN: Japanese
+ KOR: Korean
+ PLK: Polish
+ PTB: Portuguese (Brazil)
+ ROM: Romanian
+ RUS: Russian
+ ESP: Spanish
+ TRK: Turkish
+ VIN: Vietnamese
+ ARA: Arabic
+ SRP: Serbian
+ HRV: Croatian
+ THA: Thai
+ IND: Indonesian
+ FIL: Filipino (Tagalog)
+ ```
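Putting the pieces together, a minimal single-image run using flags from the option list below might look like this; the input path is a placeholder:

```bash
# Translate one image into simplified Chinese with Google Translate.
python -m manga_translator --mode demo --translator google --target-lang CHS -i path/to/image.png
```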
103
+
104
+ <!-- Auto generated start (See devscripts/make_readme.py) -->
105
+ ## 选项
106
+
107
+ ```text
108
+ -h, --help show this help message and exit
109
+ -m, --mode {demo,batch,web,web_client,ws,api}
110
+ Run in single image demo mode (demo), batch
111
+ translation mode (batch), or web service mode (web)
112
+ -i, --input INPUT [INPUT ...] Path to an image file if using demo mode, or path to an
113
+ image folder if using batch mode
114
+ -o, --dest DEST Path to the destination folder for translated images in
115
+ batch mode
116
+ -l, --target-lang {CHS,CHT,CSY,NLD,ENG,FRA,DEU,HUN,ITA,JPN,KOR,PLK,PTB,ROM,RUS,ESP,TRK,UKR,VIN,ARA,CNR,SRP,HRV,THA,IND,FIL}
117
+ Destination language
118
+ -v, --verbose Print debug info and save intermediate images in result
119
+ folder
120
+ -f, --format {png,webp,jpg,xcf,psd,pdf} Output format of the translation.
121
+ --attempts ATTEMPTS Retry attempts on encountered error. -1 means infinite
122
+ times.
123
+ --ignore-errors Skip image on encountered error.
124
+ --overwrite Overwrite already translated images in batch mode.
125
+ --skip-no-text Skip image without text (Will not be saved).
126
+ --model-dir MODEL_DIR Model directory (by default ./models in project root)
127
+ --use-gpu Turn on/off gpu (automatic selection between mps or cuda)
128
+ --use-gpu-limited Turn on/off gpu (excluding offline translator)
129
+ --detector {default,ctd,craft,none} Text detector used for creating a text mask from an
130
+ image, DO NOT use craft for manga, it's not designed
131
+ for it
132
+ --ocr {32px,48px,48px_ctc,mocr} Optical character recognition (OCR) model to use
133
+ --use-mocr-merge Use bbox merge when Manga OCR inference.
134
+ --inpainter {default,lama_large,lama_mpe,sd,none,original}
135
+ Inpainting model to use
136
+ --upscaler {waifu2x,esrgan,4xultrasharp} Upscaler to use. --upscale-ratio has to be set for it
137
+ to take effect
138
+ --upscale-ratio UPSCALE_RATIO Image upscale ratio applied before detection. Can
139
+ improve text detection.
140
+ --colorizer {mc2} Colorization model to use.
141
+ --translator {google,youdao,baidu,deepl,papago,caiyun,gpt3,gpt3.5,gpt4,none,original,offline,nllb,nllb_big,sugoi,jparacrawl,jparacrawl_big,m2m100,sakura}
142
+ Language translator to use
143
+ --translator-chain TRANSLATOR_CHAIN Output of one translator goes in another. Example:
144
+ --translator-chain "google:JPN;sugoi:ENG".
145
+ --selective-translation SELECTIVE_TRANSLATION
146
+ Select a translator based on detected language in
147
+ image. Note the first translation service acts as
148
+ default if the language isn't defined. Example:
149
+ --translator-chain "google:JPN;sugoi:ENG".
150
+ --revert-upscaling Downscales the previously upscaled image after
151
+ translation back to original size (Use with --upscale-
152
+ ratio).
153
+ --detection-size DETECTION_SIZE Size of image used for detection
154
+ --det-rotate Rotate the image for detection. Might improve
155
+ detection.
156
+ --det-auto-rotate Rotate the image for detection to prefer vertical
157
+ textlines. Might improve detection.
158
+ --det-invert Invert the image colors for detection. Might improve
159
+ detection.
160
+ --det-gamma-correct Applies gamma correction for detection. Might improve
161
+ detection.
162
+ --unclip-ratio UNCLIP_RATIO How much to extend text skeleton to form bounding box
163
+ --box-threshold BOX_THRESHOLD Threshold for bbox generation
164
+ --text-threshold TEXT_THRESHOLD Threshold for text detection
165
+ --min-text-length MIN_TEXT_LENGTH Minimum text length of a text region
166
+ --no-text-lang-skip Don't skip text that is seemingly already in the target
167
+ language.
168
+ --inpainting-size INPAINTING_SIZE Size of image used for inpainting (too large will
169
+ result in OOM)
170
+ --inpainting-precision {fp32,fp16,bf16} Inpainting precision for lama, use bf16 while you can.
171
+ --colorization-size COLORIZATION_SIZE Size of image used for colorization. Set to -1 to use
172
+ full image size
173
+ --denoise-sigma DENOISE_SIGMA Used by colorizer and affects color strength, range
174
+ from 0 to 255 (default 30). -1 turns it off.
175
+ --mask-dilation-offset MASK_DILATION_OFFSET By how much to extend the text mask to remove left-over
176
+ text pixels of the original image.
177
+ --font-size FONT_SIZE Use fixed font size for rendering
178
+ --font-size-offset FONT_SIZE_OFFSET Offset font size by a given amount, positive number
179
+ increase font size and vice versa
180
+ --font-size-minimum FONT_SIZE_MINIMUM Minimum output font size. Default is
181
+ image_sides_sum/200
182
+ --font-color FONT_COLOR Overwrite the text fg/bg color detected by the OCR
183
+ model. Use hex string without the "#" such as FFFFFF
184
+ for a white foreground or FFFFFF:000000 to also have a
185
+ black background around the text.
186
+ --line-spacing LINE_SPACING Line spacing is font_size * this value. Default is 0.01
187
+ for horizontal text and 0.2 for vertical.
188
+ --force-horizontal Force text to be rendered horizontally
189
+ --force-vertical Force text to be rendered vertically
190
+ --align-left Align rendered text left
191
+ --align-center Align rendered text centered
192
+ --align-right Align rendered text right
193
+ --uppercase Change text to uppercase
194
+ --lowercase Change text to lowercase
195
+ --no-hyphenation If renderer should be splitting up words using a hyphen
196
+ character (-)
197
+ --manga2eng Render english text translated from manga with some
198
+ additional typesetting. Ignores some other argument
199
+ options
200
+ --gpt-config GPT_CONFIG Path to GPT config file, more info in README
201
+ --use-mtpe Turn on/off machine translation post editing (MTPE) on
202
+ the command line (works only on linux right now)
203
+ --save-text Save extracted text and translations into a text file.
204
+ --save-text-file SAVE_TEXT_FILE Like --save-text but with a specified file path.
205
+ --filter-text FILTER_TEXT Filter regions by their text with a regex. Example
206
+ usage: --text-filter ".*badtext.*"
207
+ --skip-lang Skip translation if source image is one of the provided languages,
208
+ use comma to separate multiple languages. Example: JPN,ENG
209
+ --prep-manual Prepare for manual typesetting by outputting blank,
210
+ inpainted images, plus copies of the original for
211
+ reference
212
+ --font-path FONT_PATH Path to font file
213
+ --gimp-font GIMP_FONT Font family to use for gimp rendering.
214
+ --host HOST Used by web module to decide which host to attach to
215
+ --port PORT Used by web module to decide which port to attach to
216
+ --nonce NONCE Used by web module as secret for securing internal web
217
+ server communication
218
+ --ws-url WS_URL Server URL for WebSocket mode
219
+ --save-quality SAVE_QUALITY Quality of saved JPEG image, range from 0 to 100 with
220
+ 100 being best
221
+ --ignore-bubble IGNORE_BUBBLE The threshold for ignoring text in non bubble areas,
222
+ with valid values ranging from 1 to 50, does not ignore
223
+ others. Recommendation 5 to 10. If it is too low,
224
+ normal bubble areas may be ignored, and if it is too
225
+ large, non bubble areas may be considered normal
226
+ bubbles
227
+ ```
228
+
229
+ <!-- Auto generated end -->
230
+
231
+ ### 使用命令行执行
232
+
233
+ ```bash
234
+ # 如果机器有支持 CUDA 的 NVIDIA GPU,可以添加 `--use-gpu` 参数
235
+ # 使用 `--use-gpu-limited` 将需要使用大量显存的翻译交由CPU执行,这样可以减少显存占用
236
+ # 使用 `--translator=<翻译器名称>` 来指定翻译器
237
+ # 使用 `--target-lang=<语言代码>` 来指定目标语言
238
+ # 将 <图片文件路径> 替换为图片的路径
239
+ # 如果要翻译的图片比较小或者模糊,可以通过 --upscale-ratio 启用 upscaler 放大图像、提升质量,从而改善检测与翻译效果
240
+ $ python -m manga_translator --verbose --use-gpu --translator=google --target-lang=CHS -i <图片文件路径>
241
+ # 结果会存放到 result 文件夹里
242
+ ```
243
+
244
+ #### 使用命令行批量翻译
245
+
246
+ ```bash
247
+ # 其它参数如上
248
+ # 使用 `--mode batch` 开启批量翻译模式
249
+ # 将 <图片文件夹路径> 替换为图片文件夹的路径
250
+ $ python -m manga_translator --verbose --mode batch --use-gpu --translator=google --target-lang=CHS -i <图片文件夹路径>
251
+ # 结果会存放到 `<图片文件夹路径>-translated` 文件夹里
252
+ ```
253
+
254
+ ### 使用浏览器 (Web 服务器)
255
+
256
+ ```bash
257
+ # 其它参数如上
258
+ # 使用 `--mode web` 开启 Web 服务器模式
259
+ $ python -m manga_translator --verbose --mode web --use-gpu
260
+ # 程序服务会开启在 http://127.0.0.1:5003
261
+ ```
262
+
263
+ 程序提供两种请求模式:同步模式和异步模式。\
264
+ 同步模式下你的 HTTP POST 请求会一直等待直到翻译完成。\
265
+ 异步模式下你的 HTTP POST 会立刻返回一个 `task_id`,你可以使用这个 `task_id` 去定期轮询得到翻译的状态。
266
+
267
+ #### 同步模式
268
+
269
+ 1. POST 提交一个包含图片、字段名为 file 的 form 到 <http://127.0.0.1:5003/run>
270
+ 2. 等待返回
271
+ 3. 从得到的 `task_id` 去 result 文件夹里取结果,例如通过 Nginx 暴露 result 下的内容
272
+
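+ 下面给出一个同步模式的最小示例脚本(仅作示意:以 `image.png` 为文件名举例,并假设返回的 JSON 中包含 `task_id` 字段,实际字段请以服务端返回为准):
+
+ ```python
+ # 同步模式示例:请求会一直阻塞,直到翻译完成才返回
+ import requests
+
+ with open('image.png', 'rb') as f:
+     resp = requests.post('http://127.0.0.1:5003/run', files={'file': f}).json()
+
+ print(resp)  # 之后按返回的 task_id 去 result 文件夹取结果
+ ```
+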
273
+ #### 异步模式
274
+
275
+ 1. POST 提交一个包含图片、字段名为 file 的 form 到 <http://127.0.0.1:5003/submit>
276
+ 2. 你会得到一个 `task_id`
277
+ 3. 通过这个 `task_id` 你可以定期发送 POST 轮询请求 JSON `{"taskid": <task_id>}` 到 <http://127.0.0.1:5003/task-state>
278
+ 4. 当返回的状态是 `finished`、`error` 或 `error-lang` 时代表翻译完成
279
+ 5. 去 result 文件夹里取结果,例如通过 Nginx 暴露 result 下的内容
280
+
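+ 下面是一个异步模式的轮询示例脚本(仅作示意:以 `image.png` 为例;假设 `/submit` 返回的 JSON 含 `task_id` 字段、`/task-state` 返回的 JSON 含 `state` 字段,实际字段名请以服务端返回为准):
+
+ ```python
+ # 异步模式示例:先提交任务,再定期轮询任务状态
+ import time
+ import requests
+
+ BASE = 'http://127.0.0.1:5003'
+
+ with open('image.png', 'rb') as f:
+     task_id = requests.post(f'{BASE}/submit', files={'file': f}).json()['task_id']
+
+ while True:
+     resp = requests.post(f'{BASE}/task-state', json={'taskid': task_id}).json()
+     print(resp)
+     if resp.get('state') in ('finished', 'error', 'error-lang'):
+         break
+     time.sleep(2)
+ # 翻译完成后,按 task_id 去 result 文件夹(或通过 Nginx 暴露的地址)取结果
+ ```
+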
281
+ #### 人工翻译
282
+
283
+ 人工翻译模式允许跳过机器翻译,由人工填入翻译后的文本
284
+
285
+ POST 提交一个包含图片、字段名为 file 的 form 到 <http://127.0.0.1:5003/manual-translate>,并等待返回
286
+
287
+ 你会得到一个 JSON 数组,例如:
288
+
289
+ ```json
290
+ {
291
+ "task_id": "12c779c9431f954971cae720eb104499",
292
+ "status": "pending",
293
+ "trans_result": [
294
+ {
295
+ "s": "☆上司来ちゃった……",
296
+ "t": ""
297
+ }
298
+ ]
299
+ }
300
+ ```
301
+
302
+ 将翻译后的内容填入 `t` 字段:
303
+
304
+ ```json
305
+ {
306
+ "task_id": "12c779c9431f954971cae720eb104499",
307
+ "status": "pending",
308
+ "trans_result": [
309
+ {
310
+ "s": "☆上司来ちゃった……",
311
+ "t": "☆上司来了..."
312
+ }
313
+ ]
314
+ }
315
+ ```
316
+
317
+ 将该 JSON 发送到 <http://127.0.0.1:5003/post-manual-result>,并等待返回\
318
+ 之后就可以从得到的 `task_id` 去 result 文件夹里取结果,例如通过 Nginx 暴露 result 下的内容
319
+
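+ 以下是一个演示上述人工翻译流程的示例脚本(仅作示意:以 `image.png` 为例,并假设响应结构与上文示例一致):
+
+ ```python
+ # 人工翻译示例:取出识别到的原文,填入译文后回传
+ import requests
+
+ BASE = 'http://127.0.0.1:5003'
+
+ with open('image.png', 'rb') as f:
+     task = requests.post(f'{BASE}/manual-translate', files={'file': f}).json()
+
+ for region in task['trans_result']:
+     print('原文:', region['s'])
+     region['t'] = input('请输入译文: ')  # 实际使用时由人工填写
+
+ requests.post(f'{BASE}/post-manual-result', json=task)
+ # 之后按 task_id 去 result 文件夹取结果
+ ```
+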
320
+ ## 下一步
321
+
322
+ 列一下以后完善这个项目需要做的事,欢迎贡献!
323
+
324
+ 1. 使用基于扩散模型的图像修补算法,不过这样图像修补会慢很多
325
+ 2. ~~【重要,请求帮助】目前的文字渲染引擎只能勉强看,和 Adobe 的渲染引擎差距明显,我们需要您的帮助完善文本渲染!~~
326
+ 3. ~~我尝试了在 OCR 模型里提取文字颜色,均以失败告终,现在只能用 DPGMM 凑活提取文字颜色,但是效果欠佳,我会尽量完善文字颜色提取,如果您有好的建议请尽管提 issue~~
327
+ 4. ~~文本检测目前不能很好处理英语和韩语,等图片修补模型训练好了我就会训练新版的文字检测模型。~~ ~~韩语支持在做了~~
328
+ 5. 文本渲染区域是根据检测到的文本,而不是气泡决定的,这样可以处理没有气泡的图片,但不能很好地进行英语嵌字,目前没有想到好的解决方案。
329
+ 6. [Ryota et al.](https://arxiv.org/abs/2012.14271) 提出了获取配对漫画作为训练数据,训练可以结合图片内容进行翻译的模型,未来可以考虑把大量图片 VQVAE 化,输入 nmt 的 encoder 辅助翻译,而不是分框提取 tag 辅助翻译,这样可以处理范围更广的图片。这需要我们也获取大量配对翻译漫画/图片数据,以及训练 VQVAE 模型。
330
+ 7. 求闻转译志针对视频设计,未来这个项目要能优化到可以处理视频,提取文本颜色用于生成 ass 字幕,进一步辅助东方视频字幕组工作。甚至可以涂改视频内容,去掉视频内字幕。
331
+ 8. ~~结合传统算法的 mask 生成优化,目前在测试 CRF 相关算法。~~
332
+ 9. ~~尚不支持倾斜文本区域合并~~
333
+
334
+ ## 效果图
335
+
336
+ 以下样例不一定会经常更新,未必能代表当前主分支版本的效果。
337
+
338
+ <table>
339
+ <thead>
340
+ <tr>
341
+ <th align="center" width="50%">原始图片</th>
342
+ <th align="center" width="50%">翻译后图片</th>
343
+ </tr>
344
+ </thead>
345
+ <tbody>
346
+ <tr>
347
+ <td align="center" width="50%">
348
+ <a href="https://user-images.githubusercontent.com/31543482/232265329-6a560438-e887-4f7f-b6a1-a61b8648f781.png">
349
+ <img alt="佐藤さんは知っていた - 猫麦" src="https://user-images.githubusercontent.com/31543482/232265329-6a560438-e887-4f7f-b6a1-a61b8648f781.png" />
350
+ </a>
351
+ <br />
352
+ <a href="https://twitter.com/09ra_19ra/status/1647079591109103617/photo/1">(Source @09ra_19ra)</a>
353
+ </td>
354
+ <td align="center" width="50%">
355
+ <a href="https://user-images.githubusercontent.com/31543482/232265339-514c843a-0541-4a24-b3bc-1efa6915f757.png">
356
+ <img alt="Output" src="https://user-images.githubusercontent.com/31543482/232265339-514c843a-0541-4a24-b3bc-1efa6915f757.png" />
357
+ </a>
358
+ <br />
359
+ <a href="https://user-images.githubusercontent.com/31543482/232265376-01a4557d-8120-4b6b-b062-f271df177770.png">(Mask)</a>
360
+ </td>
361
+ </tr>
362
+ <tr>
363
+ <td align="center" width="50%">
364
+ <a href="https://user-images.githubusercontent.com/31543482/232265479-a15c43b5-0f00-489c-9b04-5dfbcd48c432.png">
365
+ <img alt="Gris finds out she's of royal blood - VERTI" src="https://user-images.githubusercontent.com/31543482/232265479-a15c43b5-0f00-489c-9b04-5dfbcd48c432.png" />
366
+ </a>
367
+ <br />
368
+ <a href="https://twitter.com/VERTIGRIS_ART/status/1644365184142647300/photo/1">(Source @VERTIGRIS_ART)</a>
369
+ </td>
370
+ <td align="center" width="50%">
371
+ <a href="https://user-images.githubusercontent.com/31543482/232265480-f8ba7a28-846f-46e7-8041-3dcb1afe3f67.png">
372
+ <img alt="Output" src="https://user-images.githubusercontent.com/31543482/232265480-f8ba7a28-846f-46e7-8041-3dcb1afe3f67.png" />
373
+ </a>
374
+ <br />
375
+ <code>--detector ctd</code>
376
+ <a href="https://user-images.githubusercontent.com/31543482/232265483-99ad20af-dca8-4b78-90f9-a6599eb0e70b.png">(Mask)</a>
377
+ </td>
378
+ </tr>
379
+ <tr>
380
+ <td align="center" width="50%">
381
+ <a href="https://user-images.githubusercontent.com/31543482/232264684-5a7bcf8e-707b-4925-86b0-4212382f1680.png">
382
+ <img alt="陰キャお嬢様の新学期🏫📔🌸 (#3) - ひづき夜宵🎀💜" src="https://user-images.githubusercontent.com/31543482/232264684-5a7bcf8e-707b-4925-86b0-4212382f1680.png" />
383
+ </a>
384
+ <br />
385
+ <a href="https://twitter.com/hiduki_yayoi/status/1645186427712573440/photo/2">(Source @hiduki_yayoi)</a>
386
+ </td>
387
+ <td align="center" width="50%">
388
+ <a href="https://user-images.githubusercontent.com/31543482/232264644-39db36c8-a8d9-4009-823d-bf85ca0609bf.png">
389
+ <img alt="Output" src="https://user-images.githubusercontent.com/31543482/232264644-39db36c8-a8d9-4009-823d-bf85ca0609bf.png" />
390
+ </a>
391
+ <br />
392
+ <code>--translator none</code>
393
+ <a href="https://user-images.githubusercontent.com/31543482/232264671-bc8dd9d0-8675-4c6d-8f86-0d5b7a342233.png">(Mask)</a>
394
+ </td>
395
+ </tr>
396
+ <tr>
397
+ <td align="center" width="50%">
398
+ <a href="https://user-images.githubusercontent.com/31543482/232265794-5ea8a0cb-42fe-4438-80b7-3bf7eaf0ff2c.png">
399
+ <img alt="幼なじみの高校デビューの癖がすごい (#1) - 神吉李花☪️🐧" src="https://user-images.githubusercontent.com/31543482/232265794-5ea8a0cb-42fe-4438-80b7-3bf7eaf0ff2c.png" />
400
+ </a>
401
+ <br />
402
+ <a href="https://twitter.com/rikak/status/1642727617886556160/photo/1">(Source @rikak)</a>
403
+ </td>
404
+ <td align="center" width="50%">
405
+ <a href="https://user-images.githubusercontent.com/31543482/232265795-4bc47589-fd97-4073-8cf4-82ae216a88bc.png">
406
+ <img alt="Output" src="https://user-images.githubusercontent.com/31543482/232265795-4bc47589-fd97-4073-8cf4-82ae216a88bc.png" />
407
+ </a>
408
+ <br />
409
+ <a href="https://user-images.githubusercontent.com/31543482/232265800-6bdc7973-41fe-4d7e-a554-98ea7ca7a137.png">(Mask)</a>
410
+ </td>
411
+ </tr>
412
+ </tbody>
413
+ </table>
devscripts/make_readme.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # Adapted from https://github.com/yt-dlp/yt-dlp/tree/master/devscripts
4
+
5
+ import os
6
+ import sys
7
+
8
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
9
+
10
+ import functools
11
+ import re
12
+
13
+ from devscripts.utils import read_file, write_file
14
+ from manga_translator.args import HelpFormatter, parser
15
+
16
+ READMES = (
17
+ [
18
+ 'README.md',
19
+ '## Options',
20
+ '<!-- Auto generated end -->',
21
+ ],
22
+ [
23
+ 'README_CN.md',
24
+ '## 选项',
25
+ '<!-- Auto generated end -->',
26
+ ],
27
+ )
28
+
29
+ ALLOWED_OVERSHOOT = 2
30
+ DISABLE_PATCH = object()
31
+
32
+ HelpFormatter.INDENT_INCREMENT = 0
33
+ HelpFormatter.MAX_HELP_POSITION = 45
34
+ HelpFormatter.WIDTH = 100
35
+
36
+ def take_section(text, start=None, end=None, *, shift=0):
37
+ return text[
38
+ text.index(start) + shift if start else None:
39
+ text.index(end) + shift if end else None
40
+ ]
41
+
42
+
43
+ def apply_patch(text, patch):
44
+ return text if patch[0] is DISABLE_PATCH else re.sub(*patch, text)
45
+
46
+
47
+ options = take_section(parser.format_help(), '\noptions:', shift=len('\noptions:'))
48
+
49
+ max_width = max(map(len, options.split('\n')))
50
+ switch_col_width = len(re.search(r'(?m)^\s{5,}', options).group())
51
+ delim = f'\n{" " * switch_col_width}'
52
+
53
+ PATCHES = (
54
+ # ( # Headings
55
+ # r'(?m)^ (\w.+\n)( (?=\w))?',
56
+ # r'## \1'
57
+ # ),
58
+ ( # Fixup `--date` formatting
59
+ rf'(?m)( --date DATE.+({delim}[^\[]+)*)\[.+({delim}.+)*$',
60
+ (rf'\1[now|today|yesterday][-N[day|week|month|year]].{delim}'
61
+ f'E.g. "--date today-2weeks" downloads only{delim}'
62
+ 'videos uploaded on the same day two weeks ago'),
63
+ ),
64
+ ( # Do not split URLs
65
+ rf'({delim[:-1]})? (?P<label>\[\S+\] )?(?P<url>https?({delim})?:({delim})?/({delim})?/(({delim})?\S+)+)\s',
66
+ lambda mobj: ''.join((delim, mobj.group('label') or '', re.sub(r'\s+', '', mobj.group('url')), '\n'))
67
+ ),
68
+ ( # Do not split "words"
69
+ rf'(?m)({delim}\S+)+$',
70
+ lambda mobj: ''.join((delim, mobj.group(0).replace(delim, '')))
71
+ ),
72
+ # ( # Allow overshooting last line
73
+ # rf'(?m)^(?P<prev>.+)${delim}(?P<current>.+)$(?!{delim})',
74
+ # lambda mobj: (mobj.group().replace(delim, ' ')
75
+ # if len(mobj.group()) - len(delim) + 1 <= max_width + ALLOWED_OVERSHOOT
76
+ # else mobj.group())
77
+ # ),
78
+ # ( # Avoid newline when a space is available b/w switch and description
79
+ # DISABLE_PATCH, # This creates issues with prepare_manpage
80
+ # r'(?m)^(\s{4}-.{%d})(%s)' % (switch_col_width - 6, delim),
81
+ # r'\1 '
82
+ # ),
83
+ # ( # Replace brackets with a Markdown link
84
+ # r'SponsorBlock API \((http.+)\)',
85
+ # r'[SponsorBlock API](\1)'
86
+ # ),
87
+ )
88
+
89
+ for file, options_start, options_end in READMES:
90
+ readme = read_file(file)
91
+
92
+ write_file(file, ''.join((
93
+ take_section(readme, end=options_start, shift=len(options_start)),
94
+ '\n\n```text',
95
+ functools.reduce(apply_patch, PATCHES, options),
96
+ '```\n\n',
97
+ take_section(readme, options_end),
98
+ )))
devscripts/utils.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/yt-dlp/yt-dlp/tree/master/devscripts
2
+
3
+ import argparse
4
+ import functools
5
+ import subprocess
6
+
7
+
8
+ def read_file(fname):
9
+ with open(fname, encoding='utf-8') as f:
10
+ return f.read()
11
+
12
+
13
+ def write_file(fname, content, mode='w'):
14
+ with open(fname, mode, encoding='utf-8') as f:
15
+ return f.write(content)
16
+
17
+
18
+ def get_filename_args(has_infile=False, default_outfile=None):
19
+ parser = argparse.ArgumentParser()
20
+ if has_infile:
21
+ parser.add_argument('infile', help='Input file')
22
+ kwargs = {'nargs': '?', 'default': default_outfile} if default_outfile else {}
23
+ parser.add_argument('outfile', **kwargs, help='Output file')
24
+
25
+ opts = parser.parse_args()
26
+ if has_infile:
27
+ return opts.infile, opts.outfile
28
+ return opts.outfile
29
+
30
+
31
+ def compose_functions(*functions):
32
+ return lambda x: functools.reduce(lambda y, f: f(y), functions, x)
33
+
34
+
35
+ def run_process(*args, **kwargs):
36
+ kwargs.setdefault('text', True)
37
+ kwargs.setdefault('check', True)
38
+ kwargs.setdefault('capture_output', True)
39
+ if kwargs['text']:
40
+ kwargs.setdefault('encoding', 'utf-8')
41
+ kwargs.setdefault('errors', 'replace')
42
+ return subprocess.run(args, **kwargs)
docker_prepare.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+
3
+ from manga_translator.utils import ModelWrapper
4
+ from manga_translator.detection import DETECTORS
5
+ from manga_translator.ocr import OCRS
6
+ from manga_translator.inpainting import INPAINTERS
7
+
8
+ async def download(models):
9
+ for key, value in models.items():
10
+ if issubclass(value, ModelWrapper):
11
+ print(' -- Downloading', key)
12
+ try:
13
+ inst = value()
14
+ await inst.download()
15
+ except Exception as e:
16
+ print('Failed to download', key, value)
17
+ print(e)
18
+
19
+ async def main():
20
+ await download(DETECTORS)
21
+ await download(OCRS)
22
+ await download({
23
+ k: v for k, v in INPAINTERS.items()
24
+ if k not in ['sd']
25
+ })
26
+
27
+ if __name__ == '__main__':
28
+ asyncio.run(main())
fonts/Arial-Unicode-Regular.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14f28249244f00c13348cb211c8a83c3e6e44dcf1874ebcb083efbfc0b9d5387
3
+ size 23892708
fonts/anime_ace.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3e311d48c305e79757cc0051aca591b735eb57002f78035969cbfc5ca4a5125
3
+ size 108036
fonts/anime_ace_3.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9b7c40b5389c511a950234fe0add8a11da9563b468e0e8a88219ccbf2257f83
3
+ size 58236
fonts/comic shanns 2.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64590b794cab741937889d379b205ae126ca4f3ed5cbe4f19839d2bfac246da6
3
+ size 73988
fonts/msgothic.ttc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef9044f54896c6d045a425e62e38b3232d49facc5549a12837d077ff0bf74298
3
+ size 9176636
fonts/msyh.ttc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4b3b9d058750fb80899c24f68e35beda606ca92694eff0e9f7f91eec7a846aa
3
+ size 19647736
manga_translator/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import colorama
2
+ from dotenv import load_dotenv
3
+
4
+ colorama.init(autoreset=True)
5
+ load_dotenv()
6
+
7
+ from .manga_translator import *
manga_translator/__main__.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio
3
+ import logging
4
+ from argparse import Namespace
5
+
6
+ from .manga_translator import (
7
+ MangaTranslator,
8
+ MangaTranslatorWeb,
9
+ MangaTranslatorWS,
10
+ MangaTranslatorAPI,
11
+ set_main_logger,
12
+ )
13
+ from .args import parser
14
+ from .utils import (
15
+ BASE_PATH,
16
+ init_logging,
17
+ get_logger,
18
+ set_log_level,
19
+ natural_sort,
20
+ )
21
+
22
+ # TODO: Dynamic imports to reduce ram usage in web(-server) mode. Will require dealing with args.py imports.
23
+
24
+ async def dispatch(args: Namespace):
25
+ args_dict = vars(args)
26
+
27
+ logger.info(f'Running in {args.mode} mode')
28
+
29
+ if args.mode in ('demo', 'batch'):
30
+ if not args.input:
31
+ raise Exception('No input image was supplied. Use -i <image_path>')
32
+ translator = MangaTranslator(args_dict)
33
+ if args.mode == 'demo':
34
+ if len(args.input) != 1 or not os.path.isfile(args.input[0]):
35
+ raise FileNotFoundError(f'Invalid single image file path for demo mode: "{" ".join(args.input)}". Use `-m batch`.')
36
+ dest = os.path.join(BASE_PATH, 'result/final.png')
37
+ args.overwrite = True # Do overwrite result/final.png file
38
+ await translator.translate_path(args.input[0], dest, args_dict)
39
+ else: # batch
40
+ dest = args.dest
41
+ for path in natural_sort(args.input):
42
+ await translator.translate_path(path, dest, args_dict)
43
+
44
+ elif args.mode == 'web':
45
+ from .server.web_main import dispatch
46
+ await dispatch(args.host, args.port, translation_params=args_dict)
47
+
48
+ elif args.mode == 'web_client':
49
+ translator = MangaTranslatorWeb(args_dict)
50
+ await translator.listen(args_dict)
51
+
52
+ elif args.mode == 'ws':
53
+ translator = MangaTranslatorWS(args_dict)
54
+ await translator.listen(args_dict)
55
+
56
+ elif args.mode == 'api':
57
+ translator = MangaTranslatorAPI(args_dict)
58
+ await translator.listen(args_dict)
59
+
60
+ if __name__ == '__main__':
61
+ args = None
62
+ init_logging()
63
+ try:
64
+ args = parser.parse_args()
65
+ set_log_level(level=logging.DEBUG if args.verbose else logging.INFO)
66
+ logger = get_logger(args.mode)
67
+ set_main_logger(logger)
68
+ if args.mode != 'web':
69
+ logger.debug(args)
70
+
71
+ loop = asyncio.new_event_loop()
72
+ asyncio.set_event_loop(loop)
73
+ loop.run_until_complete(dispatch(args))
74
+ except KeyboardInterrupt:
75
+ if not args or args.mode != 'web':
76
+ print()
77
+ except Exception as e:
78
+ logger.error(f'{e.__class__.__name__}: {e}',
79
+ exc_info=e if args and args.verbose else None)
manga_translator/args.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ from urllib.parse import unquote
4
+
5
+ from .detection import DETECTORS
6
+ from .ocr import OCRS
7
+ from .inpainting import INPAINTERS
8
+ from .translators import VALID_LANGUAGES, TRANSLATORS, TranslatorChain
9
+ from .upscaling import UPSCALERS
10
+ from .colorization import COLORIZERS
11
+ from .save import OUTPUT_FORMATS
12
+
13
+ def url_decode(s):
14
+ s = unquote(s)
15
+ if s.startswith('file:///'):
16
+ s = s[len('file://'):]
17
+ return s
18
+
19
+ # Additional argparse types
20
+ def path(string):
21
+ if not string:
22
+ return ''
23
+ s = url_decode(os.path.expanduser(string))
24
+ if not os.path.exists(s):
25
+ raise argparse.ArgumentTypeError(f'No such file or directory: "{string}"')
26
+ return s
27
+
28
+ def file_path(string):
29
+ if not string:
30
+ return ''
31
+ s = url_decode(os.path.expanduser(string))
32
+ if not os.path.exists(s):
33
+ raise argparse.ArgumentTypeError(f'No such file: "{string}"')
34
+ return s
35
+
36
+ def dir_path(string):
37
+ if not string:
38
+ return ''
39
+ s = url_decode(os.path.expanduser(string))
40
+ if not os.path.exists(s):
41
+ raise argparse.ArgumentTypeError(f'No such directory: "{string}"')
42
+ return s
43
+
44
+ # def choice_chain(choices):
45
+ # """Argument type for string chains from choices separated by ':'. Example: 'choice1:choice2:choice3'"""
46
+ # def _func(string):
47
+ # if choices is not None:
48
+ # for s in string.split(':') or ['']:
49
+ # if s not in choices:
50
+ # raise argparse.ArgumentTypeError(f'Invalid choice: %s (choose from %s)' % (s, ', '.join(map(repr, choices))))
51
+ # return string
52
+ # return _func
53
+
54
+ def translator_chain(string):
55
+ try:
56
+ return TranslatorChain(string)
57
+ except ValueError as e:
58
+ raise argparse.ArgumentTypeError(e)
59
+ except Exception:
60
+ raise argparse.ArgumentTypeError(f'Invalid translator_chain value: "{string}". Example usage: --translator "google:sugoi" -l "JPN:ENG"')
61
+
62
+
63
+ class HelpFormatter(argparse.HelpFormatter):
64
+ INDENT_INCREMENT = 2
65
+ MAX_HELP_POSITION = 24
66
+ WIDTH = None
67
+
68
+ def __init__(self, prog: str, indent_increment: int = 2, max_help_position: int = 24, width: int = None):
69
+ super().__init__(prog, self.INDENT_INCREMENT, self.MAX_HELP_POSITION, self.WIDTH)
70
+
71
+ def _format_action_invocation(self, action: argparse.Action) -> str:
72
+ if action.option_strings:
73
+
74
+ # if the Optional doesn't take a value, format is:
75
+ # -s, --long
76
+ if action.nargs == 0:
77
+ return ', '.join(action.option_strings)
78
+
79
+ # if the Optional takes a value, format is:
80
+ # -s, --long ARGS
81
+ else:
82
+ default = self._get_default_metavar_for_optional(action)
83
+ args_string = self._format_args(action, default)
84
+ return ', '.join(action.option_strings) + ' ' + args_string
85
+ else:
86
+ return super()._format_action_invocation(action)
87
+
88
+
89
+ parser = argparse.ArgumentParser(prog='manga_translator', description='Seamlessly translate mangas into a chosen language', formatter_class=HelpFormatter)
90
+ parser.add_argument('-m', '--mode', default='batch', type=str, choices=['demo', 'batch', 'web', 'web_client', 'ws', 'api'], help='Run in single image demo mode (demo), batch translation mode (batch), or web service mode (web)')
91
+ parser.add_argument('-i', '--input', default=None, type=path, nargs='+', help='Path to an image file if using demo mode, or path to an image folder if using batch mode')
92
+ parser.add_argument('-o', '--dest', default='', type=str, help='Path to the destination folder for translated images in batch mode')
93
+ parser.add_argument('-l', '--target-lang', default='CHS', type=str, choices=VALID_LANGUAGES, help='Destination language')
94
+ parser.add_argument('-v', '--verbose', action='store_true', help='Print debug info and save intermediate images in result folder')
95
+ parser.add_argument('-f', '--format', default=None, choices=OUTPUT_FORMATS, help='Output format of the translation.')
96
+ parser.add_argument('--attempts', default=0, type=int, help='Retry attempts on encountered error. -1 means infinite times.')
97
+ parser.add_argument('--ignore-errors', action='store_true', help='Skip image on encountered error.')
98
+ parser.add_argument('--overwrite', action='store_true', help='Overwrite already translated images in batch mode.')
99
+ parser.add_argument('--skip-no-text', action='store_true', help='Skip image without text (Will not be saved).')
100
+ parser.add_argument('--model-dir', default=None, type=dir_path, help='Model directory (by default ./models in project root)')
101
+ parser.add_argument('--skip-lang', default=None, type=str, help='Skip translation if source image is one of the provided languages, use comma to separate multiple languages. Example: JPN,ENG')
102
+
103
+ g = parser.add_mutually_exclusive_group()
104
+ g.add_argument('--use-gpu', action='store_true', help='Turn on/off gpu (auto switch between mps and cuda)')
105
+ g.add_argument('--use-gpu-limited', action='store_true', help='Turn on/off gpu (excluding offline translator)')
106
+
107
+ parser.add_argument('--detector', default='default', type=str, choices=DETECTORS, help='Text detector used for creating a text mask from an image, DO NOT use craft for manga, it\'s not designed for it')
108
+ parser.add_argument('--ocr', default='48px', type=str, choices=OCRS, help='Optical character recognition (OCR) model to use')
109
+ parser.add_argument('--use-mocr-merge', action='store_true', help='Use bbox merge when Manga OCR inference.')
110
+ parser.add_argument('--inpainter', default='lama_large', type=str, choices=INPAINTERS, help='Inpainting model to use')
111
+ parser.add_argument('--upscaler', default='esrgan', type=str, choices=UPSCALERS, help='Upscaler to use. --upscale-ratio has to be set for it to take effect')
112
+ parser.add_argument('--upscale-ratio', default=None, type=float, help='Image upscale ratio applied before detection. Can improve text detection.')
113
+ parser.add_argument('--colorizer', default=None, type=str, choices=COLORIZERS, help='Colorization model to use.')
114
+
115
+ g = parser.add_mutually_exclusive_group()
116
+ g.add_argument('--translator', default='google', type=str, choices=TRANSLATORS, help='Language translator to use')
117
+ g.add_argument('--translator-chain', default=None, type=translator_chain, help='Output of one translator goes in another. Example: --translator-chain "google:JPN;sugoi:ENG".')
118
+ g.add_argument('--selective-translation', default=None, type=translator_chain, help='Select a translator based on detected language in image. Note the first translation service acts as default if the language isn\'t defined. Example: --translator-chain "google:JPN;sugoi:ENG".')
119
+
120
+ parser.add_argument('--revert-upscaling', action='store_true', help='Downscales the previously upscaled image after translation back to original size (Use with --upscale-ratio).')
121
+ parser.add_argument('--detection-size', default=1536, type=int, help='Size of image used for detection')
122
+ parser.add_argument('--det-rotate', action='store_true', help='Rotate the image for detection. Might improve detection.')
123
+ parser.add_argument('--det-auto-rotate', action='store_true', help='Rotate the image for detection to prefer vertical textlines. Might improve detection.')
124
+ parser.add_argument('--det-invert', action='store_true', help='Invert the image colors for detection. Might improve detection.')
125
+ parser.add_argument('--det-gamma-correct', action='store_true', help='Applies gamma correction for detection. Might improve detection.')
126
+ parser.add_argument('--unclip-ratio', default=2.3, type=float, help='How much to extend text skeleton to form bounding box')
127
+ parser.add_argument('--box-threshold', default=0.7, type=float, help='Threshold for bbox generation')
128
+ parser.add_argument('--text-threshold', default=0.5, type=float, help='Threshold for text detection')
129
+ parser.add_argument('--min-text-length', default=0, type=int, help='Minimum text length of a text region')
130
+ parser.add_argument('--no-text-lang-skip', action='store_true', help='Don\'t skip text that is seemingly already in the target language.')
131
+ parser.add_argument('--inpainting-size', default=2048, type=int, help='Size of image used for inpainting (too large will result in OOM)')
132
+ parser.add_argument('--inpainting-precision', default='fp32', type=str, help='Inpainting precision for lama, use bf16 while you can.', choices=['fp32', 'fp16', 'bf16'])
133
+ parser.add_argument('--colorization-size', default=576, type=int, help='Size of image used for colorization. Set to -1 to use full image size')
134
+ parser.add_argument('--denoise-sigma', default=30, type=int, help='Used by colorizer and affects color strength, range from 0 to 255 (default 30). -1 turns it off.')
135
+ parser.add_argument('--mask-dilation-offset', default=0, type=int, help='By how much to extend the text mask to remove left-over text pixels of the original image.')
136
+
137
+ parser.add_argument('--disable-font-border', action='store_true', help='Disable font border')
138
+ parser.add_argument('--font-size', default=None, type=int, help='Use fixed font size for rendering')
139
+ parser.add_argument('--font-size-offset', default=0, type=int, help='Offset font size by a given amount, positive number increase font size and vice versa')
140
+ parser.add_argument('--font-size-minimum', default=-1, type=int, help='Minimum output font size. Default is image_sides_sum/200')
141
+ parser.add_argument('--font-color', default=None, type=str, help='Overwrite the text fg/bg color detected by the OCR model. Use hex string without the "#" such as FFFFFF for a white foreground or FFFFFF:000000 to also have a black background around the text.')
142
+ parser.add_argument('--line-spacing', default=None, type=float, help='Line spacing is font_size * this value. Default is 0.01 for horizontal text and 0.2 for vertical.')
143
+
144
+ g = parser.add_mutually_exclusive_group()
145
+ g.add_argument('--force-horizontal', action='store_true', help='Force text to be rendered horizontally')
146
+ g.add_argument('--force-vertical', action='store_true', help='Force text to be rendered vertically')
147
+
148
+ g = parser.add_mutually_exclusive_group()
149
+ g.add_argument('--align-left', action='store_true', help='Align rendered text left')
150
+ g.add_argument('--align-center', action='store_true', help='Align rendered text centered')
151
+ g.add_argument('--align-right', action='store_true', help='Align rendered text right')
152
+
153
+ g = parser.add_mutually_exclusive_group()
154
+ g.add_argument('--uppercase', action='store_true', help='Change text to uppercase')
155
+ g.add_argument('--lowercase', action='store_true', help='Change text to lowercase')
156
+
157
+ parser.add_argument('--no-hyphenation', action='store_true', help='If renderer should be splitting up words using a hyphen character (-)')
158
+ parser.add_argument('--manga2eng', action='store_true', help='Render english text translated from manga with some additional typesetting. Ignores some other argument options')
159
+ parser.add_argument('--gpt-config', type=file_path, help='Path to GPT config file, more info in README')
160
+ parser.add_argument('--use-mtpe', action='store_true', help='Turn on/off machine translation post editing (MTPE) on the command line (works only on linux right now)')
161
+
162
+ g = parser.add_mutually_exclusive_group()
163
+ g.add_argument('--save-text', action='store_true', help='Save extracted text and translations into a text file.')
164
+ g.add_argument('--save-text-file', default='', type=str, help='Like --save-text but with a specified file path.')
165
+
166
+ parser.add_argument('--filter-text', default=None, type=str, help='Filter regions by their text with a regex. Example usage: --text-filter ".*badtext.*"')
167
+ parser.add_argument('--prep-manual', action='store_true', help='Prepare for manual typesetting by outputting blank, inpainted images, plus copies of the original for reference')
168
+ parser.add_argument('--font-path', default='', type=file_path, help='Path to font file')
169
+ parser.add_argument('--gimp-font', default='Sans-serif', type=str, help='Font family to use for gimp rendering.')
170
+ parser.add_argument('--host', default='127.0.0.1', type=str, help='Used by web module to decide which host to attach to')
171
+ parser.add_argument('--port', default=5003, type=int, help='Used by web module to decide which port to attach to')
172
+ parser.add_argument('--nonce', default=os.getenv('MT_WEB_NONCE', ''), type=str, help='Used by web module as secret for securing internal web server communication')
173
+ # parser.add_argument('--log-web', action='store_true', help='Used by web module to decide if web logs should be surfaced')
174
+ parser.add_argument('--ws-url', default='ws://localhost:5000', type=str, help='Server URL for WebSocket mode')
175
+ parser.add_argument('--save-quality', default=100, type=int, help='Quality of saved JPEG image, range from 0 to 100 with 100 being best')
176
+ parser.add_argument('--ignore-bubble', default=0, type=int, help='The threshold for ignoring text in non bubble areas, with valid values ranging from 1 to 50, does not ignore others. Recommendation 5 to 10. If it is too low, normal bubble areas may be ignored, and if it is too large, non bubble areas may be considered normal bubbles')
177
+
178
+ parser.add_argument('--kernel-size', default=3, type=int, help='Set the convolution kernel size of the text erasure area to completely clean up text residues')
179
+
180
+
181
+ # Generates a dict with a default value for each argument
182
+ DEFAULT_ARGS = vars(parser.parse_args([]))
manga_translator/colorization/__init__.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+
3
+ from .common import CommonColorizer, OfflineColorizer
4
+ from .manga_colorization_v2 import MangaColorizationV2
5
+
6
+ COLORIZERS = {
7
+ 'mc2': MangaColorizationV2,
8
+ }
9
+ colorizer_cache = {}
10
+
11
+ def get_colorizer(key: str, *args, **kwargs) -> CommonColorizer:
12
+ if key not in COLORIZERS:
13
+ raise ValueError(f'Could not find colorizer for: "{key}". Choose from the following: %s' % ','.join(COLORIZERS))
14
+ if not colorizer_cache.get(key):
15
+ colorizer = COLORIZERS[key]
16
+ colorizer_cache[key] = colorizer(*args, **kwargs)
17
+ return colorizer_cache[key]
18
+
19
+ async def prepare(key: str):
20
+ colorizer = get_colorizer(key)
21
+ if isinstance(colorizer, OfflineColorizer):
22
+ await colorizer.download()
23
+
24
+ async def dispatch(key: str, device: str = 'cpu', **kwargs) -> Image.Image:
25
+ colorizer = get_colorizer(key)
26
+ if isinstance(colorizer, OfflineColorizer):
27
+ await colorizer.load(device)
28
+ return await colorizer.colorize(**kwargs)
manga_translator/colorization/common.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ from abc import abstractmethod
3
+
4
+ from ..utils import InfererModule, ModelWrapper
5
+
6
+ class CommonColorizer(InfererModule):
7
+ _VALID_UPSCALE_RATIOS = None
8
+
9
+ async def colorize(self, image: Image.Image, colorization_size: int, **kwargs) -> Image.Image:
10
+ return await self._colorize(image, colorization_size, **kwargs)
11
+
12
+ @abstractmethod
13
+ async def _colorize(self, image: Image.Image, colorization_size: int, **kwargs) -> Image.Image:
14
+ pass
15
+
16
+ class OfflineColorizer(CommonColorizer, ModelWrapper):
17
+ _MODEL_SUB_DIR = 'colorization'
18
+
19
+ async def _colorize(self, *args, **kwargs):
20
+ return await self.infer(*args, **kwargs)
21
+
22
+ @abstractmethod
23
+ async def _infer(self, image: Image.Image, colorization_size: int, **kwargs) -> Image.Image:
24
+ pass
manga_translator/colorization/manga_colorization_v2.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ from PIL import Image
5
+ from torchvision.transforms import ToTensor
6
+
7
+ from .common import OfflineColorizer
8
+ from .manga_colorization_v2_utils.networks.models import Colorizer
9
+ from .manga_colorization_v2_utils.denoising.denoiser import FFDNetDenoiser
10
+ from .manga_colorization_v2_utils.utils.utils import resize_pad
11
+
12
+
13
+ # https://github.com/qweasdd/manga-colorization-v2
14
+ class MangaColorizationV2(OfflineColorizer):
15
+ _MODEL_SUB_DIR = os.path.join(OfflineColorizer._MODEL_SUB_DIR, 'manga-colorization-v2')
16
+ _MODEL_MAPPING = {
17
+ # Models were in google drive so had to upload to github
18
+ 'generator': {
19
+ 'url': 'https://github.com/zyddnys/manga-image-translator/releases/download/beta-0.3/manga-colorization-v2-generator.zip',
20
+ 'file': 'generator.zip',
21
+ 'hash': '087e6a0bc02770e732a52f33878b71a272a6123c9ac649e9b5bfb75e39e5c1d5',
22
+ },
23
+ 'denoiser': {
24
+ 'url': 'https://github.com/zyddnys/manga-image-translator/releases/download/beta-0.3/manga-colorization-v2-net_rgb.pth',
25
+ 'file': 'net_rgb.pth',
26
+ 'hash': '0fe98bfd2ac870b15f360661b1c4789eecefc6dc2e4462842a0dd15e149a0433',
27
+ }
28
+ }
29
+
30
+ async def _load(self, device: str):
31
+ self.device = device
32
+ self.colorizer = Colorizer().to(device)
33
+ self.colorizer.generator.load_state_dict(
34
+ torch.load(self._get_file_path('generator.zip'), map_location=self.device))
35
+ self.colorizer = self.colorizer.eval()
36
+ self.denoiser = FFDNetDenoiser(device, _weights_dir=self.model_dir)
37
+
38
+ async def _unload(self):
39
+ del self.colorizer
40
+ del self.denoiser
41
+
42
+ async def _infer(self, image: Image.Image, colorization_size: int, denoise_sigma=25, **kwargs) -> Image.Image:
43
+ # Size has to be multiple of 32
44
+ img = np.array(image.convert('RGBA'))
45
+ max_size = min(*img.shape[:2])
46
+ max_size -= max_size % 32
47
+ if colorization_size > 0:
48
+ size = min(max_size, colorization_size - (colorization_size % 32))
49
+ else:
50
+ # size<=576 gives best results
51
+ size = min(max_size, 576)
52
+
53
+ if 0 <= denoise_sigma <= 255:
54
+ img = self.denoiser.get_denoised_image(img, sigma=denoise_sigma)
55
+
56
+ img, current_pad = resize_pad(img, size)
57
+
58
+ transform = ToTensor()
59
+ current_image = transform(img).unsqueeze(0).to(self.device)
60
+ current_hint = torch.zeros(1, 4, current_image.shape[2], current_image.shape[3]).float().to(self.device)
61
+
62
+ with torch.no_grad():
63
+ fake_color, _ = self.colorizer(torch.cat([current_image, current_hint], 1))
64
+ fake_color = fake_color.detach()
65
+
66
+ result = fake_color[0].detach().cpu().permute(1, 2, 0) * 0.5 + 0.5
67
+
68
+ if current_pad[0] != 0:
69
+ result = result[:-current_pad[0]]
70
+ if current_pad[1] != 0:
71
+ result = result[:, :-current_pad[1]]
72
+
73
+ colored_image = result.numpy() * 255
74
+ return Image.fromarray(colored_image.astype(np.uint8))
manga_translator/colorization/manga_colorization_v2_utils/denoising/denoiser.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Denoise an image with the FFDNet denoising method
3
+
4
+ Copyright (C) 2018, Matias Tassano <matias.tassano@parisdescartes.fr>
5
+
6
+ This program is free software: you can use, modify and/or
7
+ redistribute it under the terms of the GNU General Public
8
+ License as published by the Free Software Foundation, either
9
+ version 3 of the License, or (at your option) any later
10
+ version. You should have received a copy of this license along
11
+ this program. If not, see <http://www.gnu.org/licenses/>.
12
+ """
13
+ import os
14
+ import argparse
15
+ import time
16
+
17
+
18
+ import numpy as np
19
+ import cv2
20
+ import torch
21
+ import torch.nn as nn
22
+ from torch.autograd import Variable
23
+ from .models import FFDNet
24
+ from .utils import normalize, variable_to_cv2_image, remove_dataparallel_wrapper, is_rgb
25
+
26
+ class FFDNetDenoiser:
27
+ def __init__(self, _device, _sigma = 25, _weights_dir = 'denoising/models/', _in_ch = 3):
28
+ self.sigma = _sigma / 255
29
+ self.weights_dir = _weights_dir
30
+ self.channels = _in_ch
31
+ self.device = _device
32
+ self.model = FFDNet(num_input_channels = _in_ch)
33
+ self.load_weights()
34
+ self.model.eval()
35
+
36
+
37
+ def load_weights(self):
38
+ weights_name = 'net_rgb.pth' if self.channels == 3 else 'net_gray.pth'
39
+ weights_path = os.path.join(self.weights_dir, weights_name)
40
+ if self.device == 'cuda':
41
+ # DataParallel is only used for cuda; not needed for mps devices
42
+ state_dict = torch.load(weights_path, map_location=torch.device('cpu'))
43
+ self.model = nn.DataParallel(self.model,device_ids = [0]).to(self.device)
44
+ else:
45
+ # MPS devices don't support DataParallel
46
+ state_dict = torch.load(weights_path, map_location=self.device)
47
+ # CPU/MPS mode: remove the DataParallel wrapper
48
+ state_dict = remove_dataparallel_wrapper(state_dict)
49
+ self.model.load_state_dict(state_dict)
50
+
51
+ def get_denoised_image(self, imorig, sigma = None):
52
+
53
+ if sigma is not None:
54
+ cur_sigma = sigma / 255
55
+ else:
56
+ cur_sigma = self.sigma
57
+
58
+ if len(imorig.shape) < 3 or imorig.shape[2] == 1:
59
+ imorig = np.repeat(np.expand_dims(imorig, 2), 3, 2)
60
+
61
+ imorig = imorig[..., :3]
62
+
63
+ if (max(imorig.shape[0], imorig.shape[1]) > 1200):
64
+ ratio = max(imorig.shape[0], imorig.shape[1]) / 1200
65
+ imorig = cv2.resize(imorig, (int(imorig.shape[1] / ratio), int(imorig.shape[0] / ratio)), interpolation = cv2.INTER_AREA)
66
+
67
+ imorig = imorig.transpose(2, 0, 1)
68
+
69
+ if (imorig.max() > 1.2):
70
+ imorig = normalize(imorig)
71
+ imorig = np.expand_dims(imorig, 0)
72
+
73
+ # Handle odd sizes
74
+ expanded_h = False
75
+ expanded_w = False
76
+ sh_im = imorig.shape
77
+ if sh_im[2]%2 == 1:
78
+ expanded_h = True
79
+ imorig = np.concatenate((imorig, imorig[:, :, -1, :][:, :, np.newaxis, :]), axis=2)
80
+
81
+ if sh_im[3]%2 == 1:
82
+ expanded_w = True
83
+ imorig = np.concatenate((imorig, imorig[:, :, :, -1][:, :, :, np.newaxis]), axis=3)
84
+
85
+
86
+ imorig = torch.Tensor(imorig)
87
+
88
+
89
+ # Sets data type according to CPU or GPU modes
90
+ if self.device == 'cuda':
91
+ dtype = torch.cuda.FloatTensor
92
+ else:
93
+ # for mps and cpu devices it is still FloatTensor
94
+ dtype = torch.FloatTensor
95
+
96
+ imnoisy = imorig#.clone()
97
+
98
+
99
+ with torch.no_grad():
100
+ imorig, imnoisy = imorig.type(dtype), imnoisy.type(dtype)
101
+ nsigma = torch.FloatTensor([cur_sigma]).type(dtype)
102
+
103
+
104
+ # Estimate noise and subtract it from the input image
105
+ im_noise_estim = self.model(imnoisy, nsigma)
106
+ outim = torch.clamp(imnoisy - im_noise_estim, 0., 1.)
107
+
108
+ if expanded_h:
109
+ # imorig = imorig[:, :, :-1, :]
110
+ outim = outim[:, :, :-1, :]
111
+ # imnoisy = imnoisy[:, :, :-1, :]
112
+
113
+ if expanded_w:
114
+ # imorig = imorig[:, :, :, :-1]
115
+ outim = outim[:, :, :, :-1]
116
+ # imnoisy = imnoisy[:, :, :, :-1]
117
+
118
+ return variable_to_cv2_image(outim)
manga_translator/colorization/manga_colorization_v2_utils/denoising/functions.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Functions implementing custom NN layers
3
+
4
+ Copyright (C) 2018, Matias Tassano <matias.tassano@parisdescartes.fr>
5
+
6
+ This program is free software: you can use, modify and/or
7
+ redistribute it under the terms of the GNU General Public
8
+ License as published by the Free Software Foundation, either
9
+ version 3 of the License, or (at your option) any later
10
+ version. You should have received a copy of this license along
11
+ this program. If not, see <http://www.gnu.org/licenses/>.
12
+ """
13
+ import torch
14
+ from torch.autograd import Function, Variable
15
+
16
+ def concatenate_input_noise_map(input, noise_sigma):
17
+ r"""Implements the first layer of FFDNet. This function returns a
18
+ torch.autograd.Variable composed of the concatenation of the downsampled
19
+ input image and the noise map. Each image of the batch of size CxHxW gets
20
+ converted to an array of size 4*CxH/2xW/2. Each of the pixels of the
21
+ non-overlapped 2x2 patches of the input image are placed in the new array
22
+ along the first dimension.
23
+
24
+ Args:
25
+ input: batch containing CxHxW images
26
+ noise_sigma: the value of the pixels of the CxH/2xW/2 noise map
27
+ """
28
+ # noise_sigma is a list of length batch_size
29
+ N, C, H, W = input.size()
30
+ dtype = input.type()
31
+ sca = 2
32
+ sca2 = sca*sca
33
+ Cout = sca2*C
34
+ Hout = H//sca
35
+ Wout = W//sca
36
+ idxL = [[0, 0], [0, 1], [1, 0], [1, 1]]
37
+
38
+ # Fill the downsampled image with zeros
39
+ if 'cuda' in dtype:
40
+ downsampledfeatures = torch.cuda.FloatTensor(N, Cout, Hout, Wout).fill_(0)
41
+ else:
42
+ # cpu and mps are the same
43
+ downsampledfeatures = torch.FloatTensor(N, Cout, Hout, Wout).fill_(0)
44
+
45
+ # Build the CxH/2xW/2 noise map
46
+ noise_map = noise_sigma.view(N, 1, 1, 1).repeat(1, C, Hout, Wout)
47
+
48
+ # Populate output
49
+ for idx in range(sca2):
50
+ downsampledfeatures[:, idx:Cout:sca2, :, :] = \
51
+ input[:, :, idxL[idx][0]::sca, idxL[idx][1]::sca]
52
+
53
+ # concatenate de-interleaved mosaic with noise map
54
+ return torch.cat((noise_map, downsampledfeatures), 1)
55
+
56
+ class UpSampleFeaturesFunction(Function):
57
+ r"""Extends PyTorch's modules by implementing a torch.autograd.Function.
58
+ This class implements the forward and backward methods of the last layer
59
+ of FFDNet. It basically performs the inverse of
60
+ concatenate_input_noise_map(): it converts each of the images of a
61
+ batch of size CxH/2xW/2 to images of size C/4xHxW
62
+ """
63
+ @staticmethod
64
+ def forward(ctx, input):
65
+ N, Cin, Hin, Win = input.size()
66
+ dtype = input.type()
67
+ sca = 2
68
+ sca2 = sca*sca
69
+ Cout = Cin//sca2
70
+ Hout = Hin*sca
71
+ Wout = Win*sca
72
+ idxL = [[0, 0], [0, 1], [1, 0], [1, 1]]
73
+
74
+ assert (Cin%sca2 == 0), 'Invalid input dimensions: number of channels should be divisible by 4'
75
+
76
+ result = torch.zeros((N, Cout, Hout, Wout)).type(dtype)
77
+ for idx in range(sca2):
78
+ result[:, :, idxL[idx][0]::sca, idxL[idx][1]::sca] = input[:, idx:Cin:sca2, :, :]
79
+
80
+ return result
81
+
82
+ @staticmethod
83
+ def backward(ctx, grad_output):
84
+ N, Cg_out, Hg_out, Wg_out = grad_output.size()
85
+ dtype = grad_output.data.type()
86
+ sca = 2
87
+ sca2 = sca*sca
88
+ Cg_in = sca2*Cg_out
89
+ Hg_in = Hg_out//sca
90
+ Wg_in = Wg_out//sca
91
+ idxL = [[0, 0], [0, 1], [1, 0], [1, 1]]
92
+
93
+ # Build output
94
+ grad_input = torch.zeros((N, Cg_in, Hg_in, Wg_in)).type(dtype)
95
+ # Populate output
96
+ for idx in range(sca2):
97
+ grad_input[:, idx:Cg_in:sca2, :, :] = grad_output.data[:, :, idxL[idx][0]::sca, idxL[idx][1]::sca]
98
+
99
+ return Variable(grad_input)
100
+
101
+ # Alias functions
102
+ upsamplefeatures = UpSampleFeaturesFunction.apply
manga_translator/colorization/manga_colorization_v2_utils/denoising/models.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Definition of the FFDNet model and its custom layers
3
+
4
+ Copyright (C) 2018, Matias Tassano <matias.tassano@parisdescartes.fr>
5
+
6
+ This program is free software: you can use, modify and/or
7
+ redistribute it under the terms of the GNU General Public
8
+ License as published by the Free Software Foundation, either
9
+ version 3 of the License, or (at your option) any later
10
+ version. You should have received a copy of this license along
11
+ this program. If not, see <http://www.gnu.org/licenses/>.
12
+ """
13
+ import torch.nn as nn
14
+ from torch.autograd import Variable
15
+ from . import functions
16
+
17
+ class UpSampleFeatures(nn.Module):
18
+ r"""Implements the last layer of FFDNet
19
+ """
20
+ def __init__(self):
21
+ super(UpSampleFeatures, self).__init__()
22
+ def forward(self, x):
23
+ return functions.upsamplefeatures(x)
24
+
25
+ class IntermediateDnCNN(nn.Module):
26
+ r"""Implements the middle part of the FFDNet architecture, which
27
+ is basically a DnCNN net
28
+ """
29
+ def __init__(self, input_features, middle_features, num_conv_layers):
30
+ super(IntermediateDnCNN, self).__init__()
31
+ self.kernel_size = 3
32
+ self.padding = 1
33
+ self.input_features = input_features
34
+ self.num_conv_layers = num_conv_layers
35
+ self.middle_features = middle_features
36
+ if self.input_features == 5:
37
+ self.output_features = 4 #Grayscale image
38
+ elif self.input_features == 15:
39
+ self.output_features = 12 #RGB image
40
+ else:
41
+ raise Exception('Invalid number of input features')
42
+
43
+ layers = []
44
+ layers.append(nn.Conv2d(in_channels=self.input_features,\
45
+ out_channels=self.middle_features,\
46
+ kernel_size=self.kernel_size,\
47
+ padding=self.padding,\
48
+ bias=False))
49
+ layers.append(nn.ReLU(inplace=True))
50
+ for _ in range(self.num_conv_layers-2):
51
+ layers.append(nn.Conv2d(in_channels=self.middle_features,\
52
+ out_channels=self.middle_features,\
53
+ kernel_size=self.kernel_size,\
54
+ padding=self.padding,\
55
+ bias=False))
56
+ layers.append(nn.BatchNorm2d(self.middle_features))
57
+ layers.append(nn.ReLU(inplace=True))
58
+ layers.append(nn.Conv2d(in_channels=self.middle_features,\
59
+ out_channels=self.output_features,\
60
+ kernel_size=self.kernel_size,\
61
+ padding=self.padding,\
62
+ bias=False))
63
+ self.itermediate_dncnn = nn.Sequential(*layers)
64
+ def forward(self, x):
65
+ out = self.itermediate_dncnn(x)
66
+ return out
67
+
68
+ class FFDNet(nn.Module):
69
+ r"""Implements the FFDNet architecture
70
+ """
71
+ def __init__(self, num_input_channels):
72
+ super(FFDNet, self).__init__()
73
+ self.num_input_channels = num_input_channels
74
+ if self.num_input_channels == 1:
75
+ # Grayscale image
76
+ self.num_feature_maps = 64
77
+ self.num_conv_layers = 15
78
+ self.downsampled_channels = 5
79
+ self.output_features = 4
80
+ elif self.num_input_channels == 3:
81
+ # RGB image
82
+ self.num_feature_maps = 96
83
+ self.num_conv_layers = 12
84
+ self.downsampled_channels = 15
85
+ self.output_features = 12
86
+ else:
87
+ raise Exception('Invalid number of input features')
88
+
89
+ self.intermediate_dncnn = IntermediateDnCNN(\
90
+ input_features=self.downsampled_channels,\
91
+ middle_features=self.num_feature_maps,\
92
+ num_conv_layers=self.num_conv_layers)
93
+ self.upsamplefeatures = UpSampleFeatures()
94
+
95
+ def forward(self, x, noise_sigma):
96
+ concat_noise_x = functions.concatenate_input_noise_map(x.data, noise_sigma.data)
97
+ concat_noise_x = Variable(concat_noise_x)
98
+ h_dncnn = self.intermediate_dncnn(concat_noise_x)
99
+ pred_noise = self.upsamplefeatures(h_dncnn)
100
+ return pred_noise
manga_translator/colorization/manga_colorization_v2_utils/denoising/utils.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Different utilities such as orthogonalization of weights, initialization of
3
+ loggers, etc
4
+
5
+ Copyright (C) 2018, Matias Tassano <matias.tassano@parisdescartes.fr>
6
+
7
+ This program is free software: you can use, modify and/or
8
+ redistribute it under the terms of the GNU General Public
9
+ License as published by the Free Software Foundation, either
10
+ version 3 of the License, or (at your option) any later
11
+ version. You should have received a copy of this license along
12
+ this program. If not, see <http://www.gnu.org/licenses/>.
13
+ """
14
+ import numpy as np
15
+ import cv2
16
+
17
+
18
+ def variable_to_cv2_image(varim):
19
+ r"""Converts a torch.autograd.Variable to an OpenCV image
20
+
21
+ Args:
22
+ varim: a torch.autograd.Variable
23
+ """
24
+ nchannels = varim.size()[1]
25
+ if nchannels == 1:
26
+ res = (varim.data.cpu().numpy()[0, 0, :]*255.).clip(0, 255).astype(np.uint8)
27
+ elif nchannels == 3:
28
+ res = varim.data.cpu().numpy()[0]
29
+ res = cv2.cvtColor(res.transpose(1, 2, 0), cv2.COLOR_RGB2BGR)
30
+ res = (res*255.).clip(0, 255).astype(np.uint8)
31
+ else:
32
+ raise Exception('Number of color channels not supported')
33
+ return res
34
+
35
+
36
+ def normalize(data):
37
+ return np.float32(data/255.)
38
+
39
+ def remove_dataparallel_wrapper(state_dict):
40
+ r"""Converts a DataParallel model to a normal one by removing the "module."
41
+ wrapper in the module dictionary
42
+
43
+ Args:
44
+ state_dict: a torch.nn.DataParallel state dictionary
45
+ """
46
+ from collections import OrderedDict
47
+
48
+ new_state_dict = OrderedDict()
49
+ for k, vl in state_dict.items():
50
+ name = k[7:] # remove 'module.' of DataParallel
51
+ new_state_dict[name] = vl
52
+
53
+ return new_state_dict
54
+
55
+ def is_rgb(im_path):
56
+ r""" Returns True if the image in im_path is an RGB image
57
+ """
58
+ from skimage.io import imread
59
+ rgb = False
60
+ im = imread(im_path)
61
+ if (len(im.shape) == 3):
62
+ if not(np.allclose(im[...,0], im[...,1]) and np.allclose(im[...,2], im[...,1])):
63
+ rgb = True
64
+ print("rgb: {}".format(rgb))
65
+ print("im shape: {}".format(im.shape))
66
+ return rgb
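remove_dataparallel_wrapper is only needed when a checkpoint was saved from an nn.DataParallel-wrapped model; a tiny illustration of what it does (the keys are made up). As the code above shows, variable_to_cv2_image expects a 4D NCHW tensor with values in [0, 1] and returns a uint8 grayscale or BGR image.

from collections import OrderedDict

# Toy state dict shaped like one saved from an nn.DataParallel-wrapped model.
wrapped = OrderedDict([('module.conv.weight', 0.0), ('module.conv.bias', 0.0)])
clean = remove_dataparallel_wrapper(wrapped)
print(list(clean.keys()))   # ['conv.weight', 'conv.bias']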
manga_translator/colorization/manga_colorization_v2_utils/networks/extractor.py ADDED
@@ -0,0 +1,127 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+
5
+ '''https://github.com/blandocs/Tag2Pix/blob/master/model/pretrained.py'''
6
+
7
+ # Pretrained version
8
+ class Selayer(nn.Module):
9
+ def __init__(self, inplanes):
10
+ super(Selayer, self).__init__()
11
+ self.global_avgpool = nn.AdaptiveAvgPool2d(1)
12
+ self.conv1 = nn.Conv2d(inplanes, inplanes // 16, kernel_size=1, stride=1)
13
+ self.conv2 = nn.Conv2d(inplanes // 16, inplanes, kernel_size=1, stride=1)
14
+ self.relu = nn.ReLU(inplace=True)
15
+ self.sigmoid = nn.Sigmoid()
16
+
17
+ def forward(self, x):
18
+ out = self.global_avgpool(x)
19
+ out = self.conv1(out)
20
+ out = self.relu(out)
21
+ out = self.conv2(out)
22
+ out = self.sigmoid(out)
23
+
24
+ return x * out
25
+
26
+
27
+ class BottleneckX_Origin(nn.Module):
28
+ expansion = 4
29
+
30
+ def __init__(self, inplanes, planes, cardinality, stride=1, downsample=None):
31
+ super(BottleneckX_Origin, self).__init__()
32
+ self.conv1 = nn.Conv2d(inplanes, planes * 2, kernel_size=1, bias=False)
33
+ self.bn1 = nn.BatchNorm2d(planes * 2)
34
+
35
+ self.conv2 = nn.Conv2d(planes * 2, planes * 2, kernel_size=3, stride=stride,
36
+ padding=1, groups=cardinality, bias=False)
37
+ self.bn2 = nn.BatchNorm2d(planes * 2)
38
+
39
+ self.conv3 = nn.Conv2d(planes * 2, planes * 4, kernel_size=1, bias=False)
40
+ self.bn3 = nn.BatchNorm2d(planes * 4)
41
+
42
+ self.selayer = Selayer(planes * 4)
43
+
44
+ self.relu = nn.ReLU(inplace=True)
45
+ self.downsample = downsample
46
+ self.stride = stride
47
+
48
+ def forward(self, x):
49
+ residual = x
50
+
51
+ out = self.conv1(x)
52
+ out = self.bn1(out)
53
+ out = self.relu(out)
54
+
55
+ out = self.conv2(out)
56
+ out = self.bn2(out)
57
+ out = self.relu(out)
58
+
59
+ out = self.conv3(out)
60
+ out = self.bn3(out)
61
+
62
+ out = self.selayer(out)
63
+
64
+ if self.downsample is not None:
65
+ residual = self.downsample(x)
66
+
67
+ out += residual
68
+ out = self.relu(out)
69
+
70
+ return out
71
+
72
+ class SEResNeXt_Origin(nn.Module):
73
+ def __init__(self, block, layers, input_channels=3, cardinality=32, num_classes=1000):
74
+ super(SEResNeXt_Origin, self).__init__()
75
+ self.cardinality = cardinality
76
+ self.inplanes = 64
77
+ self.input_channels = input_channels
78
+
79
+ self.conv1 = nn.Conv2d(input_channels, 64, kernel_size=7, stride=2, padding=3,
80
+ bias=False)
81
+ self.bn1 = nn.BatchNorm2d(64)
82
+ self.relu = nn.ReLU(inplace=True)
83
+
84
+ self.layer1 = self._make_layer(block, 64, layers[0])
85
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
86
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
87
+
88
+ for m in self.modules():
89
+ if isinstance(m, nn.Conv2d):
90
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
91
+ m.weight.data.normal_(0, math.sqrt(2. / n))
92
+ if m.bias is not None:
93
+ m.bias.data.zero_()
94
+ elif isinstance(m, nn.BatchNorm2d):
95
+ m.weight.data.fill_(1)
96
+ m.bias.data.zero_()
97
+
98
+ def _make_layer(self, block, planes, blocks, stride=1):
99
+ downsample = None
100
+ if stride != 1 or self.inplanes != planes * block.expansion:
101
+ downsample = nn.Sequential(
102
+ nn.Conv2d(self.inplanes, planes * block.expansion,
103
+ kernel_size=1, stride=stride, bias=False),
104
+ nn.BatchNorm2d(planes * block.expansion),
105
+ )
106
+
107
+ layers = []
108
+ layers.append(block(self.inplanes, planes, self.cardinality, stride, downsample))
109
+ self.inplanes = planes * block.expansion
110
+ for i in range(1, blocks):
111
+ layers.append(block(self.inplanes, planes, self.cardinality))
112
+
113
+ return nn.Sequential(*layers)
114
+
115
+ def forward(self, x):
116
+
117
+ x = self.conv1(x)
118
+ x = self.bn1(x)
119
+ x1 = self.relu(x)
120
+
121
+ x2 = self.layer1(x1)
122
+
123
+ x3 = self.layer2(x2)
124
+
125
+ x4 = self.layer3(x3)
126
+
127
+ return x1, x2, x3, x4
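The trimmed SEResNeXt encoder above returns four feature maps rather than class logits; a quick shape check with random weights (the 256x256 input size is arbitrary, just chosen divisible by 8):

import torch

encoder = SEResNeXt_Origin(BottleneckX_Origin, [3, 4, 6, 3], input_channels=1)
x = torch.randn(1, 1, 256, 256)
x1, x2, x3, x4 = encoder(x)
print(x1.shape, x2.shape, x3.shape, x4.shape)
# Expected for a 256x256 input: 64@128x128, 256@128x128, 512@64x64, 1024@32x32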
manga_translator/colorization/manga_colorization_v2_utils/networks/models.py ADDED
@@ -0,0 +1,319 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import torchvision.models as M
5
+ import math
6
+ from torch import Tensor
7
+ from torch.nn import Parameter
8
+
9
+ from .extractor import SEResNeXt_Origin, BottleneckX_Origin
10
+
11
+ '''https://github.com/orashi/AlacGAN/blob/master/models/standard.py'''
12
+
13
+ def l2normalize(v, eps=1e-12):
14
+ return v / (v.norm() + eps)
15
+
16
+
17
+ class SpectralNorm(nn.Module):
18
+ def __init__(self, module, name='weight', power_iterations=1):
19
+ super(SpectralNorm, self).__init__()
20
+ self.module = module
21
+ self.name = name
22
+ self.power_iterations = power_iterations
23
+ if not self._made_params():
24
+ self._make_params()
25
+
26
+ def _update_u_v(self):
27
+ u = getattr(self.module, self.name + "_u")
28
+ v = getattr(self.module, self.name + "_v")
29
+ w = getattr(self.module, self.name + "_bar")
30
+
31
+ height = w.data.shape[0]
32
+ for _ in range(self.power_iterations):
33
+ v.data = l2normalize(torch.mv(torch.t(w.view(height,-1).data), u.data))
34
+ u.data = l2normalize(torch.mv(w.view(height,-1).data, v.data))
35
+
36
+ # sigma = torch.dot(u.data, torch.mv(w.view(height,-1).data, v.data))
37
+ sigma = u.dot(w.view(height, -1).mv(v))
38
+ setattr(self.module, self.name, w / sigma.expand_as(w))
39
+
40
+ def _made_params(self):
41
+ try:
42
+ u = getattr(self.module, self.name + "_u")
43
+ v = getattr(self.module, self.name + "_v")
44
+ w = getattr(self.module, self.name + "_bar")
45
+ return True
46
+ except AttributeError:
47
+ return False
48
+
49
+
50
+ def _make_params(self):
51
+ w = getattr(self.module, self.name)
52
+ height = w.data.shape[0]
53
+ width = w.view(height, -1).data.shape[1]
54
+
55
+ u = Parameter(w.data.new(height).normal_(0, 1), requires_grad=False)
56
+ v = Parameter(w.data.new(width).normal_(0, 1), requires_grad=False)
57
+ u.data = l2normalize(u.data)
58
+ v.data = l2normalize(v.data)
59
+ w_bar = Parameter(w.data)
60
+
61
+ del self.module._parameters[self.name]
62
+
63
+ self.module.register_parameter(self.name + "_u", u)
64
+ self.module.register_parameter(self.name + "_v", v)
65
+ self.module.register_parameter(self.name + "_bar", w_bar)
66
+
67
+
68
+ def forward(self, *args):
69
+ self._update_u_v()
70
+ return self.module.forward(*args)
71
+
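SpectralNorm wraps a module and rescales its weight by an estimate of its largest singular value, refreshed with one power-iteration step per forward pass; a minimal usage sketch (recent PyTorch also ships an equivalent built-in, torch.nn.utils.spectral_norm):

import torch
import torch.nn as nn

conv = SpectralNorm(nn.Conv2d(64, 64, kernel_size=3, padding=1))
x = torch.randn(1, 64, 32, 32)
y = conv(x)   # the weight is divided by its estimated spectral norm before the convolution runs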
72
+ class Selayer(nn.Module):
73
+ def __init__(self, inplanes):
74
+ super(Selayer, self).__init__()
75
+ self.global_avgpool = nn.AdaptiveAvgPool2d(1)
76
+ self.conv1 = nn.Conv2d(inplanes, inplanes // 16, kernel_size=1, stride=1)
77
+ self.conv2 = nn.Conv2d(inplanes // 16, inplanes, kernel_size=1, stride=1)
78
+ self.relu = nn.ReLU(inplace=True)
79
+ self.sigmoid = nn.Sigmoid()
80
+
81
+ def forward(self, x):
82
+ out = self.global_avgpool(x)
83
+ out = self.conv1(out)
84
+ out = self.relu(out)
85
+ out = self.conv2(out)
86
+ out = self.sigmoid(out)
87
+
88
+ return x * out
89
+
90
+ class SelayerSpectr(nn.Module):
91
+ def __init__(self, inplanes):
92
+ super(SelayerSpectr, self).__init__()
93
+ self.global_avgpool = nn.AdaptiveAvgPool2d(1)
94
+ self.conv1 = SpectralNorm(nn.Conv2d(inplanes, inplanes // 16, kernel_size=1, stride=1))
95
+ self.conv2 = SpectralNorm(nn.Conv2d(inplanes // 16, inplanes, kernel_size=1, stride=1))
96
+ self.relu = nn.ReLU(inplace=True)
97
+ self.sigmoid = nn.Sigmoid()
98
+
99
+ def forward(self, x):
100
+ out = self.global_avgpool(x)
101
+ out = self.conv1(out)
102
+ out = self.relu(out)
103
+ out = self.conv2(out)
104
+ out = self.sigmoid(out)
105
+
106
+ return x * out
107
+
108
+ class ResNeXtBottleneck(nn.Module):
109
+ def __init__(self, in_channels=256, out_channels=256, stride=1, cardinality=32, dilate=1):
110
+ super(ResNeXtBottleneck, self).__init__()
111
+ D = out_channels // 2
112
+ self.out_channels = out_channels
113
+ self.conv_reduce = nn.Conv2d(in_channels, D, kernel_size=1, stride=1, padding=0, bias=False)
114
+ self.conv_conv = nn.Conv2d(D, D, kernel_size=2 + stride, stride=stride, padding=dilate, dilation=dilate,
115
+ groups=cardinality,
116
+ bias=False)
117
+ self.conv_expand = nn.Conv2d(D, out_channels, kernel_size=1, stride=1, padding=0, bias=False)
118
+ self.shortcut = nn.Sequential()
119
+ if stride != 1:
120
+ self.shortcut.add_module('shortcut',
121
+ nn.AvgPool2d(2, stride=2))
122
+
123
+ self.selayer = Selayer(out_channels)
124
+
125
+ def forward(self, x):
126
+ bottleneck = self.conv_reduce.forward(x)
127
+ bottleneck = F.leaky_relu(bottleneck, 0.2, True)
128
+ bottleneck = self.conv_conv.forward(bottleneck)
129
+ bottleneck = F.leaky_relu(bottleneck, 0.2, True)
130
+ bottleneck = self.conv_expand.forward(bottleneck)
131
+ bottleneck = self.selayer(bottleneck)
132
+
133
+ x = self.shortcut.forward(x)
134
+ return x + bottleneck
135
+
136
+ class SpectrResNeXtBottleneck(nn.Module):
137
+ def __init__(self, in_channels=256, out_channels=256, stride=1, cardinality=32, dilate=1):
138
+ super(SpectrResNeXtBottleneck, self).__init__()
139
+ D = out_channels // 2
140
+ self.out_channels = out_channels
141
+ self.conv_reduce = SpectralNorm(nn.Conv2d(in_channels, D, kernel_size=1, stride=1, padding=0, bias=False))
142
+ self.conv_conv = SpectralNorm(nn.Conv2d(D, D, kernel_size=2 + stride, stride=stride, padding=dilate, dilation=dilate,
143
+ groups=cardinality,
144
+ bias=False))
145
+ self.conv_expand = SpectralNorm(nn.Conv2d(D, out_channels, kernel_size=1, stride=1, padding=0, bias=False))
146
+ self.shortcut = nn.Sequential()
147
+ if stride != 1:
148
+ self.shortcut.add_module('shortcut',
149
+ nn.AvgPool2d(2, stride=2))
150
+
151
+ self.selayer = SelayerSpectr(out_channels)
152
+
153
+ def forward(self, x):
154
+ bottleneck = self.conv_reduce.forward(x)
155
+ bottleneck = F.leaky_relu(bottleneck, 0.2, True)
156
+ bottleneck = self.conv_conv.forward(bottleneck)
157
+ bottleneck = F.leaky_relu(bottleneck, 0.2, True)
158
+ bottleneck = self.conv_expand.forward(bottleneck)
159
+ bottleneck = self.selayer(bottleneck)
160
+
161
+ x = self.shortcut.forward(x)
162
+ return x + bottleneck
163
+
164
+ class FeatureConv(nn.Module):
165
+ def __init__(self, input_dim=512, output_dim=512):
166
+ super(FeatureConv, self).__init__()
167
+
168
+ no_bn = True
169
+
170
+ seq = []
171
+ seq.append(nn.Conv2d(input_dim, output_dim, kernel_size=3, stride=1, padding=1, bias=False))
172
+ if not no_bn: seq.append(nn.BatchNorm2d(output_dim))
173
+ seq.append(nn.ReLU(inplace=True))
174
+ seq.append(nn.Conv2d(output_dim, output_dim, kernel_size=3, stride=2, padding=1, bias=False))
175
+ if not no_bn: seq.append(nn.BatchNorm2d(output_dim))
176
+ seq.append(nn.ReLU(inplace=True))
177
+ seq.append(nn.Conv2d(output_dim, output_dim, kernel_size=3, stride=1, padding=1, bias=False))
178
+ seq.append(nn.ReLU(inplace=True))
179
+
180
+ self.network = nn.Sequential(*seq)
181
+
182
+ def forward(self, x):
183
+ return self.network(x)
184
+
185
+ class Generator(nn.Module):
186
+ def __init__(self, ngf=64):
187
+ super(Generator, self).__init__()
188
+
189
+ self.encoder = SEResNeXt_Origin(BottleneckX_Origin, [3, 4, 6, 3], num_classes= 370, input_channels=1)
190
+
191
+ self.to0 = self._make_encoder_block_first(5, 32)
192
+ self.to1 = self._make_encoder_block(32, 64)
193
+ self.to2 = self._make_encoder_block(64, 92)
194
+ self.to3 = self._make_encoder_block(92, 128)
195
+ self.to4 = self._make_encoder_block(128, 256)
196
+
197
+ self.deconv_for_decoder = nn.Sequential(
198
+ nn.ConvTranspose2d(256, 128, 3, stride=2, padding=1, output_padding=1), # output is 64 * 64
199
+ nn.LeakyReLU(0.2),
200
+ nn.ConvTranspose2d(128, 64, 3, stride=2, padding=1, output_padding=1), # output is 128 * 128
201
+ nn.LeakyReLU(0.2),
202
+ nn.ConvTranspose2d(64, 32, 3, stride=1, padding=1, output_padding=0), # output is 256 * 256
203
+ nn.LeakyReLU(0.2),
204
+ nn.ConvTranspose2d(32, 3, 3, stride=1, padding=1, output_padding=0), # output is 256 * 256
205
+ nn.Tanh(),
206
+ )
207
+
208
+ tunnel4 = nn.Sequential(*[ResNeXtBottleneck(512, 512, cardinality=32, dilate=1) for _ in range(20)])
209
+
210
+
211
+ self.tunnel4 = nn.Sequential(nn.Conv2d(1024 + 128, 512, kernel_size=3, stride=1, padding=1),
212
+ nn.LeakyReLU(0.2, True),
213
+ tunnel4,
214
+ nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1),
215
+ nn.PixelShuffle(2),
216
+ nn.LeakyReLU(0.2, True)
217
+ ) # 64
218
+
219
+ depth = 2
220
+ tunnel = [ResNeXtBottleneck(256, 256, cardinality=32, dilate=1) for _ in range(depth)]
221
+ tunnel += [ResNeXtBottleneck(256, 256, cardinality=32, dilate=2) for _ in range(depth)]
222
+ tunnel += [ResNeXtBottleneck(256, 256, cardinality=32, dilate=4) for _ in range(depth)]
223
+ tunnel += [ResNeXtBottleneck(256, 256, cardinality=32, dilate=2),
224
+ ResNeXtBottleneck(256, 256, cardinality=32, dilate=1)]
225
+ tunnel3 = nn.Sequential(*tunnel)
226
+
227
+ self.tunnel3 = nn.Sequential(nn.Conv2d(512 + 256, 256, kernel_size=3, stride=1, padding=1),
228
+ nn.LeakyReLU(0.2, True),
229
+ tunnel3,
230
+ nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
231
+ nn.PixelShuffle(2),
232
+ nn.LeakyReLU(0.2, True)
233
+ ) # 128
234
+
235
+ tunnel = [ResNeXtBottleneck(128, 128, cardinality=32, dilate=1) for _ in range(depth)]
236
+ tunnel += [ResNeXtBottleneck(128, 128, cardinality=32, dilate=2) for _ in range(depth)]
237
+ tunnel += [ResNeXtBottleneck(128, 128, cardinality=32, dilate=4) for _ in range(depth)]
238
+ tunnel += [ResNeXtBottleneck(128, 128, cardinality=32, dilate=2),
239
+ ResNeXtBottleneck(128, 128, cardinality=32, dilate=1)]
240
+ tunnel2 = nn.Sequential(*tunnel)
241
+
242
+ self.tunnel2 = nn.Sequential(nn.Conv2d(128 + 256 + 64, 128, kernel_size=3, stride=1, padding=1),
243
+ nn.LeakyReLU(0.2, True),
244
+ tunnel2,
245
+ nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
246
+ nn.PixelShuffle(2),
247
+ nn.LeakyReLU(0.2, True)
248
+ )
249
+
250
+ tunnel = [ResNeXtBottleneck(64, 64, cardinality=16, dilate=1)]
251
+ tunnel += [ResNeXtBottleneck(64, 64, cardinality=16, dilate=2)]
252
+ tunnel += [ResNeXtBottleneck(64, 64, cardinality=16, dilate=4)]
253
+ tunnel += [ResNeXtBottleneck(64, 64, cardinality=16, dilate=2),
254
+ ResNeXtBottleneck(64, 64, cardinality=16, dilate=1)]
255
+ tunnel1 = nn.Sequential(*tunnel)
256
+
257
+ self.tunnel1 = nn.Sequential(nn.Conv2d(64 + 32, 64, kernel_size=3, stride=1, padding=1),
258
+ nn.LeakyReLU(0.2, True),
259
+ tunnel1,
260
+ nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
261
+ nn.PixelShuffle(2),
262
+ nn.LeakyReLU(0.2, True)
263
+ )
264
+
265
+ self.exit = nn.Sequential(nn.Conv2d(64 + 32, 32, kernel_size=3, stride=1, padding=1),
266
+ nn.LeakyReLU(0.2, True),
267
+ nn.Conv2d(32, 3, kernel_size= 1, stride = 1, padding = 0))
268
+
269
+
270
+ def _make_encoder_block(self, inplanes, planes):
271
+ return nn.Sequential(
272
+ nn.Conv2d(inplanes, planes, 3, 2, 1),
273
+ nn.LeakyReLU(0.2),
274
+ nn.Conv2d(planes, planes, 3, 1, 1),
275
+ nn.LeakyReLU(0.2),
276
+ )
277
+
278
+ def _make_encoder_block_first(self, inplanes, planes):
279
+ return nn.Sequential(
280
+ nn.Conv2d(inplanes, planes, 3, 1, 1),
281
+ nn.LeakyReLU(0.2),
282
+ nn.Conv2d(planes, planes, 3, 1, 1),
283
+ nn.LeakyReLU(0.2),
284
+ )
285
+
286
+ def forward(self, sketch):
287
+
288
+ x0 = self.to0(sketch)
289
+ aux_out = self.to1(x0)
290
+ aux_out = self.to2(aux_out)
291
+ aux_out = self.to3(aux_out)
292
+
293
+ x1, x2, x3, x4 = self.encoder(sketch[:, 0:1])
294
+
295
+ out = self.tunnel4(torch.cat([x4, aux_out], 1))
296
+
297
+
298
+
299
+ x = self.tunnel3(torch.cat([out, x3], 1))
300
+
301
+ x = self.tunnel2(torch.cat([x, x2, x1], 1))
302
+
303
+
304
+ x = torch.tanh(self.exit(torch.cat([x, x0], 1)))
305
+
306
+ decoder_output = self.deconv_for_decoder(out)
307
+
308
+ return x, decoder_output
309
+
310
+
311
+ class Colorizer(nn.Module):
312
+ def __init__(self):
313
+ super(Colorizer, self).__init__()
314
+
315
+ self.generator = Generator()
316
+
317
+ def forward(self, x, extractor_grad = False):
318
+ fake, guide = self.generator(x)
319
+ return fake, guide
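Generator.to0 expects 5 input channels while the SEResNeXt encoder only reads the first one, so the colorizer presumably receives a grayscale page plus 4 auxiliary hint channels; that split, and the 256x256 size (a multiple of 32, matching resize_pad), are assumptions used only to sketch the call:

import torch

net = Colorizer().eval()
x = torch.randn(1, 5, 256, 256)      # 1 grayscale channel + 4 auxiliary channels (assumed)
with torch.no_grad():
    fake, guide = net(x)
print(fake.shape, guide.shape)       # fake: (1, 3, 256, 256) colorized output; guide: auxiliary decoder output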
manga_translator/colorization/manga_colorization_v2_utils/utils/utils.py ADDED
@@ -0,0 +1,44 @@
1
+ import numpy as np
2
+ import cv2
3
+
4
+ def resize_pad(img, size = 256):
5
+
6
+ if len(img.shape) == 2:
7
+ img = np.expand_dims(img, 2)
8
+
9
+ if img.shape[2] == 1:
10
+ img = np.repeat(img, 3, 2)
11
+
12
+ if img.shape[2] == 4:
13
+ img = img[:, :, :3]
14
+
15
+ pad = None
16
+
17
+ if (img.shape[0] < img.shape[1]):
18
+ height = img.shape[0]
19
+ ratio = height / (size * 1.5)
20
+ width = int(np.ceil(img.shape[1] / ratio))
21
+ img = cv2.resize(img, (width, int(size * 1.5)), interpolation = cv2.INTER_AREA)
22
+
23
+
24
+ new_width = width + (32 - width % 32)
25
+
26
+ pad = (0, new_width - width)
27
+
28
+ img = np.pad(img, ((0, 0), (0, pad[1]), (0, 0)), 'maximum')
29
+ else:
30
+ width = img.shape[1]
31
+ ratio = width / size
32
+ height = int(np.ceil(img.shape[0] / ratio))
33
+ img = cv2.resize(img, (size, height), interpolation = cv2.INTER_AREA)
34
+
35
+ new_height = height + (32 - height % 32)
36
+
37
+ pad = (new_height - height, 0)
38
+
39
+ img = np.pad(img, ((0, pad[0]), (0, 0), (0, 0)), 'maximum')
40
+
41
+ if (img.dtype == 'float32'):
42
+ np.clip(img, 0, 1, out = img)
43
+
44
+ return img[:, :, :1], pad
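resize_pad returns the single-channel resized page together with the padding that was added, so the caller can crop the model output back afterwards; a hedged usage sketch (the 576 target size is illustrative, the default is 256):

import numpy as np

img = (np.random.rand(700, 500, 3) * 255).astype(np.uint8)   # stand-in for a manga page
resized, pad = resize_pad(img, size=576)
print(resized.shape, pad)   # height/width padded up to a multiple of 32; pad = (bottom_pad, right_pad)

# After inference, the padded rows/columns would be cropped again, e.g.:
# output = output[:output.shape[0] - pad[0], :output.shape[1] - pad[1]]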
manga_translator/detection/__init__.py ADDED
@@ -0,0 +1,37 @@
1
+ import numpy as np
2
+
3
+ from .default import DefaultDetector
4
+ from .dbnet_convnext import DBConvNextDetector
5
+ from .ctd import ComicTextDetector
6
+ from .craft import CRAFTDetector
7
+ from .none import NoneDetector
8
+ from .common import CommonDetector, OfflineDetector
9
+
10
+ DETECTORS = {
11
+ 'default': DefaultDetector,
12
+ 'dbconvnext': DBConvNextDetector,
13
+ 'ctd': ComicTextDetector,
14
+ 'craft': CRAFTDetector,
15
+ 'none': NoneDetector,
16
+ }
17
+ detector_cache = {}
18
+
19
+ def get_detector(key: str, *args, **kwargs) -> CommonDetector:
20
+ if key not in DETECTORS:
21
+ raise ValueError(f'Could not find detector for: "{key}". Choose from the following: %s' % ','.join(DETECTORS))
22
+ if not detector_cache.get(key):
23
+ detector = DETECTORS[key]
24
+ detector_cache[key] = detector(*args, **kwargs)
25
+ return detector_cache[key]
26
+
27
+ async def prepare(detector_key: str):
28
+ detector = get_detector(detector_key)
29
+ if isinstance(detector, OfflineDetector):
30
+ await detector.download()
31
+
32
+ async def dispatch(detector_key: str, image: np.ndarray, detect_size: int, text_threshold: float, box_threshold: float, unclip_ratio: float,
33
+ invert: bool, gamma_correct: bool, rotate: bool, auto_rotate: bool = False, device: str = 'cpu', verbose: bool = False):
34
+ detector = get_detector(detector_key)
35
+ if isinstance(detector, OfflineDetector):
36
+ await detector.load(device)
37
+ return await detector.detect(image, detect_size, text_threshold, box_threshold, unclip_ratio, invert, gamma_correct, rotate, auto_rotate, verbose)
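dispatch is a coroutine, so it needs an event loop; a minimal driver sketch (the threshold values and the input path are illustrative, not necessarily the project defaults):

import asyncio
import cv2

async def main():
    image = cv2.imread('page.png')          # hypothetical input path
    textlines, raw_mask, mask = await dispatch(
        'default', image,
        detect_size=1536, text_threshold=0.5, box_threshold=0.7,
        unclip_ratio=2.3, invert=False, gamma_correct=False,
        rotate=False, device='cpu')
    print(len(textlines), 'text regions detected')

asyncio.run(main())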
manga_translator/detection/common.py ADDED
@@ -0,0 +1,146 @@
1
+ from abc import abstractmethod
2
+ from typing import List, Tuple
3
+ from collections import Counter
4
+ import numpy as np
5
+ import cv2
6
+
7
+ from ..utils import InfererModule, ModelWrapper, Quadrilateral
8
+
9
+
10
+ class CommonDetector(InfererModule):
11
+
12
+ async def detect(self, image: np.ndarray, detect_size: int, text_threshold: float, box_threshold: float, unclip_ratio: float,
13
+ invert: bool, gamma_correct: bool, rotate: bool, auto_rotate: bool = False, verbose: bool = False):
14
+ '''
15
+ Returns textblock list and text mask.
16
+ '''
17
+
18
+ # Apply filters
19
+ img_h, img_w = image.shape[:2]
20
+ orig_image = image.copy()
21
+ minimum_image_size = 400
22
+ # Automatically add a border if the image is too small (instead of simply resizing, since small images are more likely to contain large fonts)
23
+ add_border = min(img_w, img_h) < minimum_image_size
24
+ if rotate:
25
+ self.logger.debug('Adding rotation')
26
+ image = self._add_rotation(image)
27
+ if add_border:
28
+ self.logger.debug('Adding border')
29
+ image = self._add_border(image, minimum_image_size)
30
+ if invert:
31
+ self.logger.debug('Adding inversion')
32
+ image = self._add_inversion(image)
33
+ if gamma_correct:
34
+ self.logger.debug('Adding gamma correction')
35
+ image = self._add_gamma_correction(image)
36
+ # if True:
37
+ # self.logger.debug('Adding histogram equalization')
38
+ # image = self._add_histogram_equalization(image)
39
+
40
+ # cv2.imwrite('histogram.png', image)
41
+ # cv2.waitKey(0)
42
+
43
+ # Run detection
44
+ textlines, raw_mask, mask = await self._detect(image, detect_size, text_threshold, box_threshold, unclip_ratio, verbose)
45
+ textlines = list(filter(lambda x: x.area > 1, textlines))
46
+
47
+ # Remove filters
48
+ if add_border:
49
+ textlines, raw_mask, mask = self._remove_border(image, img_w, img_h, textlines, raw_mask, mask)
50
+ if auto_rotate:
51
+ # Rotate if horizontal aspect ratios are prevalent to potentially improve detection
52
+ if len(textlines) > 0:
53
+ orientations = ['h' if txtln.aspect_ratio > 1 else 'v' for txtln in textlines]
54
+ majority_orientation = Counter(orientations).most_common(1)[0][0]
55
+ else:
56
+ majority_orientation = 'h'
57
+ if majority_orientation == 'h':
58
+ self.logger.info('Rerunning detection with 90° rotation')
59
+ return await self.detect(orig_image, detect_size, text_threshold, box_threshold, unclip_ratio, invert, gamma_correct,
60
+ rotate=(not rotate), auto_rotate=False, verbose=verbose)
61
+ if rotate:
62
+ textlines, raw_mask, mask = self._remove_rotation(textlines, raw_mask, mask, img_w, img_h)
63
+
64
+ return textlines, raw_mask, mask
65
+
66
+ @abstractmethod
67
+ async def _detect(self, image: np.ndarray, detect_size: int, text_threshold: float, box_threshold: float,
68
+ unclip_ratio: float, verbose: bool = False) -> Tuple[List[Quadrilateral], np.ndarray, np.ndarray]:
69
+ pass
70
+
71
+ def _add_border(self, image: np.ndarray, target_side_length: int):
72
+ old_h, old_w = image.shape[:2]
73
+ new_w = new_h = max(old_w, old_h, target_side_length)
74
+ new_image = np.zeros([new_h, new_w, 3]).astype(np.uint8)
75
+ # new_image[:] = np.array([255, 255, 255], np.uint8)
76
+ x, y = 0, 0
77
+ # x, y = (new_h - old_h) // 2, (new_w - old_w) // 2
78
+ new_image[y:y+old_h, x:x+old_w] = image
79
+ return new_image
80
+
81
+ def _remove_border(self, image: np.ndarray, old_w: int, old_h: int, textlines: List[Quadrilateral], raw_mask, mask):
82
+ new_h, new_w = image.shape[:2]
83
+ raw_mask = cv2.resize(raw_mask, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
84
+ raw_mask = raw_mask[:old_h, :old_w]
85
+ if mask is not None:
86
+ mask = cv2.resize(mask, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
87
+ mask = mask[:old_h, :old_w]
88
+
89
+ # Filter out regions within the border and clamp the points of the remaining regions
90
+ new_textlines = []
91
+ for txtln in textlines:
92
+ if txtln.xyxy[0] >= old_w and txtln.xyxy[1] >= old_h:
93
+ continue
94
+ points = txtln.pts
95
+ points[:,0] = np.clip(points[:,0], 0, old_w)
96
+ points[:,1] = np.clip(points[:,1], 0, old_h)
97
+ new_txtln = Quadrilateral(points, txtln.text, txtln.prob)
98
+ new_textlines.append(new_txtln)
99
+ return new_textlines, raw_mask, mask
100
+
101
+ def _add_rotation(self, image: np.ndarray):
102
+ return np.rot90(image, k=-1)
103
+
104
+ def _remove_rotation(self, textlines, raw_mask, mask, img_w, img_h):
105
+ raw_mask = np.ascontiguousarray(np.rot90(raw_mask))
106
+ if mask is not None:
107
+ mask = np.ascontiguousarray(np.rot90(mask).astype(np.uint8))
108
+
109
+ for i, txtln in enumerate(textlines):
110
+ rotated_pts = txtln.pts[:,[1,0]]
111
+ rotated_pts[:,1] = -rotated_pts[:,1] + img_h
112
+ textlines[i] = Quadrilateral(rotated_pts, txtln.text, txtln.prob)
113
+ return textlines, raw_mask, mask
114
+
115
+ def _add_inversion(self, image: np.ndarray):
116
+ return cv2.bitwise_not(image)
117
+
118
+ def _add_gamma_correction(self, image: np.ndarray):
119
+ gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
120
+ mid = 0.5
121
+ mean = np.mean(gray)
122
+ gamma = np.log(mid * 255) / np.log(mean)
123
+ img_gamma = np.power(image, gamma).clip(0,255).astype(np.uint8)
124
+ return img_gamma
125
+
126
+ def _add_histogram_equalization(self, image: np.ndarray):
127
+ img_yuv = cv2.cvtColor(image, cv2.COLOR_BGR2YUV)
128
+
129
+ # equalize the histogram of the Y channel
130
+ img_yuv[:,:,0] = cv2.equalizeHist(img_yuv[:,:,0])
131
+
132
+ # convert the YUV image back to RGB format
133
+ img_output = cv2.cvtColor(img_yuv, cv2.COLOR_YUV2BGR)
134
+ return img_output
135
+
136
+
137
+ class OfflineDetector(CommonDetector, ModelWrapper):
138
+ _MODEL_SUB_DIR = 'detection'
139
+
140
+ async def _detect(self, *args, **kwargs):
141
+ return await self.infer(*args, **kwargs)
142
+
143
+ @abstractmethod
144
+ async def _infer(self, image: np.ndarray, detect_size: int, text_threshold: float, box_threshold: float,
145
+ unclip_ratio: float, verbose: bool = False):
146
+ pass
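The gamma in _add_gamma_correction is chosen so that the page's mean grey level maps to the midpoint 127.5 when the power curve is applied to the raw 0-255 intensities; a small worked example with made-up numbers:

import numpy as np

mean = 180.0                               # illustrative mean grey level of a bright page
gamma = np.log(0.5 * 255) / np.log(mean)   # ~0.93
print(mean ** gamma)                       # ~127.5: the mean intensity is pulled to mid-grey
print(220.0 ** gamma, 60.0 ** gamma)       # brighter and darker pixels follow the same power curve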
manga_translator/detection/craft.py ADDED
@@ -0,0 +1,200 @@
1
+ """
2
+ Copyright (c) 2019-present NAVER Corp.
3
+ MIT License
4
+ """
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+ import os
12
+ import shutil
13
+ import numpy as np
14
+ import torch
15
+ import cv2
16
+ import einops
17
+ from typing import List, Tuple
18
+
19
+ from .default_utils.DBNet_resnet34 import TextDetection as TextDetectionDefault
20
+ from .default_utils import imgproc, dbnet_utils, craft_utils
21
+ from .common import OfflineDetector
22
+ from ..utils import TextBlock, Quadrilateral, det_rearrange_forward
23
+ from shapely.geometry import Polygon, MultiPoint
24
+ from shapely import affinity
25
+
26
+ from .craft_utils.vgg16_bn import vgg16_bn, init_weights
27
+ from .craft_utils.refiner import RefineNet
28
+
29
+ class double_conv(nn.Module):
30
+ def __init__(self, in_ch, mid_ch, out_ch):
31
+ super(double_conv, self).__init__()
32
+ self.conv = nn.Sequential(
33
+ nn.Conv2d(in_ch + mid_ch, mid_ch, kernel_size=1),
34
+ nn.BatchNorm2d(mid_ch),
35
+ nn.ReLU(inplace=True),
36
+ nn.Conv2d(mid_ch, out_ch, kernel_size=3, padding=1),
37
+ nn.BatchNorm2d(out_ch),
38
+ nn.ReLU(inplace=True)
39
+ )
40
+
41
+ def forward(self, x):
42
+ x = self.conv(x)
43
+ return x
44
+
45
+
46
+ class CRAFT(nn.Module):
47
+ def __init__(self, pretrained=False, freeze=False):
48
+ super(CRAFT, self).__init__()
49
+
50
+ """ Base network """
51
+ self.basenet = vgg16_bn(pretrained, freeze)
52
+
53
+ """ U network """
54
+ self.upconv1 = double_conv(1024, 512, 256)
55
+ self.upconv2 = double_conv(512, 256, 128)
56
+ self.upconv3 = double_conv(256, 128, 64)
57
+ self.upconv4 = double_conv(128, 64, 32)
58
+
59
+ num_class = 2
60
+ self.conv_cls = nn.Sequential(
61
+ nn.Conv2d(32, 32, kernel_size=3, padding=1), nn.ReLU(inplace=True),
62
+ nn.Conv2d(32, 32, kernel_size=3, padding=1), nn.ReLU(inplace=True),
63
+ nn.Conv2d(32, 16, kernel_size=3, padding=1), nn.ReLU(inplace=True),
64
+ nn.Conv2d(16, 16, kernel_size=1), nn.ReLU(inplace=True),
65
+ nn.Conv2d(16, num_class, kernel_size=1),
66
+ )
67
+
68
+ init_weights(self.upconv1.modules())
69
+ init_weights(self.upconv2.modules())
70
+ init_weights(self.upconv3.modules())
71
+ init_weights(self.upconv4.modules())
72
+ init_weights(self.conv_cls.modules())
73
+
74
+ def forward(self, x):
75
+ """ Base network """
76
+ sources = self.basenet(x)
77
+
78
+ """ U network """
79
+ y = torch.cat([sources[0], sources[1]], dim=1)
80
+ y = self.upconv1(y)
81
+
82
+ y = F.interpolate(y, size=sources[2].size()[2:], mode='bilinear', align_corners=False)
83
+ y = torch.cat([y, sources[2]], dim=1)
84
+ y = self.upconv2(y)
85
+
86
+ y = F.interpolate(y, size=sources[3].size()[2:], mode='bilinear', align_corners=False)
87
+ y = torch.cat([y, sources[3]], dim=1)
88
+ y = self.upconv3(y)
89
+
90
+ y = F.interpolate(y, size=sources[4].size()[2:], mode='bilinear', align_corners=False)
91
+ y = torch.cat([y, sources[4]], dim=1)
92
+ feature = self.upconv4(y)
93
+
94
+ y = self.conv_cls(feature)
95
+
96
+ return y.permute(0,2,3,1), feature
97
+
98
+
99
+ from collections import OrderedDict
100
+ def copyStateDict(state_dict):
101
+ if list(state_dict.keys())[0].startswith("module"):
102
+ start_idx = 1
103
+ else:
104
+ start_idx = 0
105
+ new_state_dict = OrderedDict()
106
+ for k, v in state_dict.items():
107
+ name = ".".join(k.split(".")[start_idx:])
108
+ new_state_dict[name] = v
109
+ return new_state_dict
110
+
111
+ class CRAFTDetector(OfflineDetector):
112
+ _MODEL_MAPPING = {
113
+ 'refiner': {
114
+ 'url': 'https://github.com/zyddnys/manga-image-translator/releases/download/beta-0.3/craft_refiner_CTW1500.pth',
115
+ 'hash': 'f7000cd3e9c76f2231b62b32182212203f73c08dfaa12bb16ffb529948a01399',
116
+ 'file': 'craft_refiner_CTW1500.pth',
117
+ },
118
+ 'craft': {
119
+ 'url': 'https://github.com/zyddnys/manga-image-translator/releases/download/beta-0.3/craft_mlt_25k.pth',
120
+ 'hash': '4a5efbfb48b4081100544e75e1e2b57f8de3d84f213004b14b85fd4b3748db17',
121
+ 'file': 'craft_mlt_25k.pth',
122
+ }
123
+ }
124
+
125
+ def __init__(self, *args, **kwargs):
126
+ os.makedirs(self.model_dir, exist_ok=True)
127
+ if os.path.exists('craft_mlt_25k.pth'):
128
+ shutil.move('craft_mlt_25k.pth', self._get_file_path('craft_mlt_25k.pth'))
129
+ if os.path.exists('craft_refiner_CTW1500.pth'):
130
+ shutil.move('craft_refiner_CTW1500.pth', self._get_file_path('craft_refiner_CTW1500.pth'))
131
+ super().__init__(*args, **kwargs)
132
+
133
+ async def _load(self, device: str):
134
+ self.model = CRAFT()
135
+ self.model.load_state_dict(copyStateDict(torch.load(self._get_file_path('craft_mlt_25k.pth'), map_location='cpu')))
136
+ self.model.eval()
137
+ self.model_refiner = RefineNet()
138
+ self.model_refiner.load_state_dict(copyStateDict(torch.load(self._get_file_path('craft_refiner_CTW1500.pth'), map_location='cpu')))
139
+ self.model_refiner.eval()
140
+ self.device = device
141
+ if device == 'cuda' or device == 'mps':
142
+ self.model = self.model.to(self.device)
143
+ self.model_refiner = self.model_refiner.to(self.device)
144
+ global MODEL
145
+ MODEL = self.model
146
+
147
+ async def _unload(self):
148
+ del self.model
149
+
150
+ async def _infer(self, image: np.ndarray, detect_size: int, text_threshold: float, box_threshold: float,
151
+ unclip_ratio: float, verbose: bool = False):
152
+
153
+ img_resized, target_ratio, size_heatmap, pad_w, pad_h = imgproc.resize_aspect_ratio(image, detect_size, interpolation = cv2.INTER_CUBIC, mag_ratio = 1)
154
+ ratio_h = ratio_w = 1 / target_ratio
155
+
156
+ # preprocessing
157
+ x = imgproc.normalizeMeanVariance(img_resized)
158
+ x = torch.from_numpy(x).permute(2, 0, 1) # [h, w, c] to [c, h, w]
159
+ x = x.unsqueeze(0).to(self.device) # [c, h, w] to [b, c, h, w]
160
+
161
+ with torch.no_grad() :
162
+ y, feature = self.model(x)
163
+
164
+ # make score and link map
165
+ score_text = y[0,:,:,0].cpu().data.numpy()
166
+ score_link = y[0,:,:,1].cpu().data.numpy()
167
+
168
+ # refine link
169
+ y_refiner = self.model_refiner(y, feature)
170
+ score_link = y_refiner[0,:,:,0].cpu().data.numpy()
171
+
172
+ # Post-processing
173
+ boxes, polys = craft_utils.getDetBoxes(score_text, score_link, text_threshold, box_threshold, box_threshold, True)
174
+
175
+ # coordinate adjustment
176
+ boxes = craft_utils.adjustResultCoordinates(boxes, ratio_w, ratio_h)
177
+ polys = craft_utils.adjustResultCoordinates(polys, ratio_w, ratio_h)
178
+ for k in range(len(polys)):
179
+ if polys[k] is None: polys[k] = boxes[k]
180
+
181
+ mask = np.zeros(shape = (image.shape[0], image.shape[1]), dtype = np.uint8)
182
+
183
+ for poly in polys :
184
+ mask = cv2.fillPoly(mask, [poly.reshape((-1, 1, 2)).astype(np.int32)], color = 255)
185
+
186
+ polys_ret = []
187
+ for i in range(len(polys)) :
188
+ poly = MultiPoint(polys[i])
189
+ if poly.area > 10 :
190
+ rect = poly.minimum_rotated_rectangle
191
+ rect = affinity.scale(rect, xfact = 1.2, yfact = 1.2)
192
+ polys_ret.append(np.roll(np.asarray(list(rect.exterior.coords)[:4]), 2))
193
+
194
+ kern = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (9, 9))
195
+ mask = cv2.dilate(mask, kern)
196
+
197
+ textlines = [Quadrilateral(pts.astype(int), '', 1) for pts in polys_ret]
198
+ textlines = list(filter(lambda q: q.area > 16, textlines))
199
+
200
+ return textlines, mask, None
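The CRAFT post-processing above converts each detected polygon into a slightly enlarged minimum-area rectangle via shapely; the same step in isolation, with made-up coordinates:

import numpy as np
from shapely.geometry import MultiPoint
from shapely import affinity

poly = np.array([[10, 10], [90, 12], [88, 40], [12, 38]], dtype=np.float64)
rect = MultiPoint(poly).minimum_rotated_rectangle        # tightest rotated bounding box
rect = affinity.scale(rect, xfact=1.2, yfact=1.2)        # grow it 20% about its center
quad = np.asarray(list(rect.exterior.coords)[:4])        # 4 corner points for a Quadrilateral
print(quad)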
manga_translator/detection/craft_utils/refiner.py ADDED
@@ -0,0 +1,65 @@
1
+ """
2
+ Copyright (c) 2019-present NAVER Corp.
3
+ MIT License
4
+ """
5
+
6
+ # -*- coding: utf-8 -*-
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from torch.autograd import Variable
11
+ from .vgg16_bn import init_weights
12
+
13
+
14
+ class RefineNet(nn.Module):
15
+ def __init__(self):
16
+ super(RefineNet, self).__init__()
17
+
18
+ self.last_conv = nn.Sequential(
19
+ nn.Conv2d(34, 64, kernel_size=3, padding=1), nn.BatchNorm2d(64), nn.ReLU(inplace=True),
20
+ nn.Conv2d(64, 64, kernel_size=3, padding=1), nn.BatchNorm2d(64), nn.ReLU(inplace=True),
21
+ nn.Conv2d(64, 64, kernel_size=3, padding=1), nn.BatchNorm2d(64), nn.ReLU(inplace=True)
22
+ )
23
+
24
+ self.aspp1 = nn.Sequential(
25
+ nn.Conv2d(64, 128, kernel_size=3, dilation=6, padding=6), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
26
+ nn.Conv2d(128, 128, kernel_size=1), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
27
+ nn.Conv2d(128, 1, kernel_size=1)
28
+ )
29
+
30
+ self.aspp2 = nn.Sequential(
31
+ nn.Conv2d(64, 128, kernel_size=3, dilation=12, padding=12), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
32
+ nn.Conv2d(128, 128, kernel_size=1), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
33
+ nn.Conv2d(128, 1, kernel_size=1)
34
+ )
35
+
36
+ self.aspp3 = nn.Sequential(
37
+ nn.Conv2d(64, 128, kernel_size=3, dilation=18, padding=18), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
38
+ nn.Conv2d(128, 128, kernel_size=1), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
39
+ nn.Conv2d(128, 1, kernel_size=1)
40
+ )
41
+
42
+ self.aspp4 = nn.Sequential(
43
+ nn.Conv2d(64, 128, kernel_size=3, dilation=24, padding=24), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
44
+ nn.Conv2d(128, 128, kernel_size=1), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
45
+ nn.Conv2d(128, 1, kernel_size=1)
46
+ )
47
+
48
+ init_weights(self.last_conv.modules())
49
+ init_weights(self.aspp1.modules())
50
+ init_weights(self.aspp2.modules())
51
+ init_weights(self.aspp3.modules())
52
+ init_weights(self.aspp4.modules())
53
+
54
+ def forward(self, y, upconv4):
55
+ refine = torch.cat([y.permute(0,3,1,2), upconv4], dim=1)
56
+ refine = self.last_conv(refine)
57
+
58
+ aspp1 = self.aspp1(refine)
59
+ aspp2 = self.aspp2(refine)
60
+ aspp3 = self.aspp3(refine)
61
+ aspp4 = self.aspp4(refine)
62
+
63
+ #out = torch.add([aspp1, aspp2, aspp3, aspp4], dim=1)
64
+ out = aspp1 + aspp2 + aspp3 + aspp4
65
+ return out.permute(0, 2, 3, 1) # , refine.permute(0,2,3,1)
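RefineNet concatenates the 2-channel CRAFT score map (NHWC) with the 32-channel feature map (NCHW), which is where the 34 input channels of last_conv come from; a shape sketch with random tensors:

import torch

refiner = RefineNet().eval()
y = torch.randn(1, 320, 320, 2)         # CRAFT output: NHWC, channels = (text score, link score)
feature = torch.randn(1, 32, 320, 320)  # CRAFT upconv4 feature map, NCHW
with torch.no_grad():
    refined_link = refiner(y, feature)  # NHWC, single channel: the refined link score
print(refined_link.shape)               # torch.Size([1, 320, 320, 1])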
manga_translator/detection/craft_utils/vgg16_bn.py ADDED
@@ -0,0 +1,71 @@
1
+ from collections import namedtuple
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.init as init
6
+ from torchvision import models
7
+
8
+ def init_weights(modules):
9
+ for m in modules:
10
+ if isinstance(m, nn.Conv2d):
11
+ init.xavier_uniform_(m.weight.data)
12
+ if m.bias is not None:
13
+ m.bias.data.zero_()
14
+ elif isinstance(m, nn.BatchNorm2d):
15
+ m.weight.data.fill_(1)
16
+ m.bias.data.zero_()
17
+ elif isinstance(m, nn.Linear):
18
+ m.weight.data.normal_(0, 0.01)
19
+ m.bias.data.zero_()
20
+
21
+ class vgg16_bn(torch.nn.Module):
22
+ def __init__(self, pretrained=True, freeze=True):
23
+ super(vgg16_bn, self).__init__()
24
+ vgg_pretrained_features = models.vgg16_bn().features
25
+ self.slice1 = torch.nn.Sequential()
26
+ self.slice2 = torch.nn.Sequential()
27
+ self.slice3 = torch.nn.Sequential()
28
+ self.slice4 = torch.nn.Sequential()
29
+ self.slice5 = torch.nn.Sequential()
30
+ for x in range(12): # conv2_2
31
+ self.slice1.add_module(str(x), vgg_pretrained_features[x])
32
+ for x in range(12, 19): # conv3_3
33
+ self.slice2.add_module(str(x), vgg_pretrained_features[x])
34
+ for x in range(19, 29): # conv4_3
35
+ self.slice3.add_module(str(x), vgg_pretrained_features[x])
36
+ for x in range(29, 39): # conv5_3
37
+ self.slice4.add_module(str(x), vgg_pretrained_features[x])
38
+
39
+ # fc6, fc7 without atrous conv
40
+ self.slice5 = torch.nn.Sequential(
41
+ nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
42
+ nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6),
43
+ nn.Conv2d(1024, 1024, kernel_size=1)
44
+ )
45
+
46
+ if not pretrained:
47
+ init_weights(self.slice1.modules())
48
+ init_weights(self.slice2.modules())
49
+ init_weights(self.slice3.modules())
50
+ init_weights(self.slice4.modules())
51
+
52
+ init_weights(self.slice5.modules()) # no pretrained model for fc6 and fc7
53
+
54
+ if freeze:
55
+ for param in self.slice1.parameters(): # only first conv
56
+ param.requires_grad= False
57
+
58
+ def forward(self, X):
59
+ h = self.slice1(X)
60
+ h_relu2_2 = h
61
+ h = self.slice2(h)
62
+ h_relu3_2 = h
63
+ h = self.slice3(h)
64
+ h_relu4_3 = h
65
+ h = self.slice4(h)
66
+ h_relu5_3 = h
67
+ h = self.slice5(h)
68
+ h_fc7 = h
69
+ vgg_outputs = namedtuple("VggOutputs", ['fc7', 'relu5_3', 'relu4_3', 'relu3_2', 'relu2_2'])
70
+ out = vgg_outputs(h_fc7, h_relu5_3, h_relu4_3, h_relu3_2, h_relu2_2)
71
+ return out
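The wrapper exposes intermediate VGG activations as a namedtuple that the CRAFT decoder later concatenates; a quick check of the returned fields with random weights (the input size is arbitrary):

import torch

backbone = vgg16_bn(pretrained=False, freeze=False).eval()
with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 224, 224))
print(feats._fields)                 # ('fc7', 'relu5_3', 'relu4_3', 'relu3_2', 'relu2_2')
print([f.shape[1] for f in feats])   # channel counts per stage: [1024, 512, 512, 256, 128]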
manga_translator/detection/ctd.py ADDED
@@ -0,0 +1,186 @@
1
+ import os
2
+ import shutil
3
+ import numpy as np
4
+ import einops
5
+ from typing import Union, Tuple
6
+ import cv2
7
+ import torch
8
+
9
+ from .ctd_utils.basemodel import TextDetBase, TextDetBaseDNN
10
+ from .ctd_utils.utils.yolov5_utils import non_max_suppression
11
+ from .ctd_utils.utils.db_utils import SegDetectorRepresenter
12
+ from .ctd_utils.utils.imgproc_utils import letterbox
13
+ from .ctd_utils.textmask import REFINEMASK_INPAINT, refine_mask
14
+ from .common import OfflineDetector
15
+ from ..utils import Quadrilateral, det_rearrange_forward
16
+
17
+ def preprocess_img(img, input_size=(1024, 1024), device='cpu', bgr2rgb=True, half=False, to_tensor=True):
18
+ if bgr2rgb:
19
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
20
+ img_in, ratio, (dw, dh) = letterbox(img, new_shape=input_size, auto=False, stride=64)
21
+ if to_tensor:
22
+ img_in = img_in.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB
23
+ img_in = np.array([np.ascontiguousarray(img_in)]).astype(np.float32) / 255
24
+ if to_tensor:
25
+ img_in = torch.from_numpy(img_in).to(device)
26
+ if half:
27
+ img_in = img_in.half()
28
+ return img_in, ratio, int(dw), int(dh)
29
+
30
+ def postprocess_mask(img: Union[torch.Tensor, np.ndarray], thresh=None):
31
+ # img = img.permute(1, 2, 0)
32
+ if isinstance(img, torch.Tensor):
33
+ img = img.squeeze_()
34
+ if img.device != 'cpu':
35
+ img = img.detach().cpu()
36
+ img = img.numpy()
37
+ else:
38
+ img = img.squeeze()
39
+ if thresh is not None:
40
+ img = img > thresh
41
+ img = img * 255
42
+ # if isinstance(img, torch.Tensor):
43
+
44
+ return img.astype(np.uint8)
45
+
46
+ def postprocess_yolo(det, conf_thresh, nms_thresh, resize_ratio, sort_func=None):
47
+ det = non_max_suppression(det, conf_thresh, nms_thresh)[0]
48
+ # bbox = det[..., 0:4]
49
+ if det.device != 'cpu':
50
+ det = det.detach_().cpu().numpy()
51
+ det[..., [0, 2]] = det[..., [0, 2]] * resize_ratio[0]
52
+ det[..., [1, 3]] = det[..., [1, 3]] * resize_ratio[1]
53
+ if sort_func is not None:
54
+ det = sort_func(det)
55
+
56
+ blines = det[..., 0:4].astype(np.int32)
57
+ confs = np.round(det[..., 4], 3)
58
+ cls = det[..., 5].astype(np.int32)
59
+ return blines, cls, confs
60
+
61
+
62
+ class ComicTextDetector(OfflineDetector):
63
+ _MODEL_MAPPING = {
64
+ 'model-cuda': {
65
+ 'url': 'https://github.com/zyddnys/manga-image-translator/releases/download/beta-0.3/comictextdetector.pt',
66
+ 'hash': '1f90fa60aeeb1eb82e2ac1167a66bf139a8a61b8780acd351ead55268540cccb',
67
+ 'file': '.',
68
+ },
69
+ 'model-cpu': {
70
+ 'url': 'https://github.com/zyddnys/manga-image-translator/releases/download/beta-0.3/comictextdetector.pt.onnx',
71
+ 'hash': '1a86ace74961413cbd650002e7bb4dcec4980ffa21b2f19b86933372071d718f',
72
+ 'file': '.',
73
+ },
74
+ }
75
+
76
+ def __init__(self, *args, **kwargs):
77
+ os.makedirs(self.model_dir, exist_ok=True)
78
+ if os.path.exists('comictextdetector.pt'):
79
+ shutil.move('comictextdetector.pt', self._get_file_path('comictextdetector.pt'))
80
+ if os.path.exists('comictextdetector.pt.onnx'):
81
+ shutil.move('comictextdetector.pt.onnx', self._get_file_path('comictextdetector.pt.onnx'))
82
+ super().__init__(*args, **kwargs)
83
+
84
+ async def _load(self, device: str, input_size=1024, half=False, nms_thresh=0.35, conf_thresh=0.4):
85
+ self.device = device
86
+ if self.device == 'cuda' or self.device == 'mps':
87
+ self.model = TextDetBase(self._get_file_path('comictextdetector.pt'), device=self.device, act='leaky')
88
+ self.model.to(self.device)
89
+ self.backend = 'torch'
90
+ else:
91
+ model_path = self._get_file_path('comictextdetector.pt.onnx')
92
+ self.model = cv2.dnn.readNetFromONNX(model_path)
93
+ self.model = TextDetBaseDNN(input_size, model_path)
94
+ self.backend = 'opencv'
95
+
96
+ if isinstance(input_size, int):
97
+ input_size = (input_size, input_size)
98
+ self.input_size = input_size
99
+ self.half = half
100
+ self.conf_thresh = conf_thresh
101
+ self.nms_thresh = nms_thresh
102
+ self.seg_rep = SegDetectorRepresenter(thresh=0.3)
103
+
104
+ async def _unload(self):
105
+ del self.model
106
+
107
+ def det_batch_forward_ctd(self, batch: np.ndarray, device: str) -> Tuple[np.ndarray, np.ndarray]:
108
+ if isinstance(self.model, TextDetBase):
109
+ batch = einops.rearrange(batch.astype(np.float32) / 255., 'n h w c -> n c h w')
110
+ batch = torch.from_numpy(batch).to(device)
111
+ _, mask, lines = self.model(batch)
112
+ mask = mask.detach().cpu().numpy()
113
+ lines = lines.detach().cpu().numpy()
114
+ elif isinstance(self.model, TextDetBaseDNN):
115
+ mask_lst, line_lst = [], []
116
+ for b in batch:
117
+ _, mask, lines = self.model(b)
118
+ if mask.shape[1] == 2: # some versions of OpenCV return the outputs in reversed order
119
+ tmp = mask
120
+ mask = lines
121
+ lines = tmp
122
+ mask_lst.append(mask)
123
+ line_lst.append(lines)
124
+ lines, mask = np.concatenate(line_lst, 0), np.concatenate(mask_lst, 0)
125
+ else:
126
+ raise NotImplementedError
127
+ return lines, mask
128
+
129
+ @torch.no_grad()
130
+ async def _infer(self, image: np.ndarray, detect_size: int, text_threshold: float, box_threshold: float,
131
+ unclip_ratio: float, verbose: bool = False):
132
+
133
+ # keep_undetected_mask = False
134
+ # refine_mode = REFINEMASK_INPAINT
135
+
136
+ im_h, im_w = image.shape[:2]
137
+ lines_map, mask = det_rearrange_forward(image, self.det_batch_forward_ctd, self.input_size[0], 4, self.device, verbose)
138
+ # blks = []
139
+ # resize_ratio = [1, 1]
140
+ if lines_map is None:
141
+ img_in, ratio, dw, dh = preprocess_img(image, input_size=self.input_size, device=self.device, half=self.half, to_tensor=self.backend=='torch')
142
+ blks, mask, lines_map = self.model(img_in)
143
+
144
+ if self.backend == 'opencv':
145
+ if mask.shape[1] == 2: # some versions of OpenCV return the outputs in reversed order
146
+ tmp = mask
147
+ mask = lines_map
148
+ lines_map = tmp
149
+ mask = mask.squeeze()
150
+ # resize_ratio = (im_w / (self.input_size[0] - dw), im_h / (self.input_size[1] - dh))
151
+ # blks = postprocess_yolo(blks, self.conf_thresh, self.nms_thresh, resize_ratio)
152
+ mask = mask[..., :mask.shape[0]-dh, :mask.shape[1]-dw]
153
+ lines_map = lines_map[..., :lines_map.shape[2]-dh, :lines_map.shape[3]-dw]
154
+
155
+ mask = postprocess_mask(mask)
156
+ lines, scores = self.seg_rep(None, lines_map, height=im_h, width=im_w)
157
+ box_thresh = 0.6
158
+ idx = np.where(scores[0] > box_thresh)
159
+ lines, scores = lines[0][idx], scores[0][idx]
160
+
161
+ # map output to input img
162
+ mask = cv2.resize(mask, (im_w, im_h), interpolation=cv2.INTER_LINEAR)
163
+
164
+ # if lines.size == 0:
165
+ # lines = []
166
+ # else:
167
+ # lines = lines.astype(np.int32)
168
+
169
+ # YOLO was used to find bounding boxes into which the lines were ordered. This is now handled
170
+ # through the textline merger, which seems to work more reliably.
171
+ # The YOLO language detection seems unnecessary as it could never be as good as
172
+ # using the OCR extracted string directly.
173
+ # Doing it to increase the textline merge accuracy doesn't really help either,
174
+ # as the merge could be postponed until after the OCR finishes.
175
+
176
+ textlines = [Quadrilateral(pts.astype(int), '', score) for pts, score in zip(lines, scores)]
177
+ mask_refined = refine_mask(image, mask, textlines, refine_mode=None)
178
+
179
+ return textlines, mask_refined, None
180
+
181
+ # blk_list = group_output(blks, lines, im_w, im_h, mask)
182
+ # mask_refined = refine_mask(image, mask, blk_list, refine_mode=refine_mode)
183
+ # if keep_undetected_mask:
184
+ # mask_refined = refine_undetected_mask(image, mask, mask_refined, blk_list, refine_mode=refine_mode)
185
+
186
+ # return blk_list, mask, mask_refined
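preprocess_img letterboxes the page to the square detector input and reports the padding, which _infer later uses to crop the mask and lines map before resizing back to the original resolution; a sketch with a synthetic image:

import numpy as np

img = (np.random.rand(1000, 700, 3) * 255).astype(np.uint8)
img_in, ratio, dw, dh = preprocess_img(img, input_size=(1024, 1024), device='cpu', to_tensor=False)
print(img_in.shape, ratio, dw, dh)   # letterboxed HWC image, the resize ratio, and the padding in pixels
# A mask predicted at 1024x1024 would be cropped by dh/dw and then resized back to 1000x700.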
manga_translator/detection/ctd_utils/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .basemodel import TextDetBase, TextDetBaseDNN
2
+ from .utils.yolov5_utils import non_max_suppression
3
+ from .utils.db_utils import SegDetectorRepresenter
4
+ from .utils.imgproc_utils import letterbox
5
+ from .textmask import refine_mask, refine_undetected_mask, REFINEMASK_INPAINT, REFINEMASK_ANNOTATION
manga_translator/detection/ctd_utils/basemodel.py ADDED
@@ -0,0 +1,250 @@
1
+ import cv2
2
+ import copy
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from .utils.yolov5_utils import fuse_conv_and_bn
7
+ from .utils.weight_init import init_weights
8
+ from .yolov5.yolo import load_yolov5_ckpt
9
+ from .yolov5.common import C3, Conv
10
+
11
+ TEXTDET_MASK = 0
12
+ TEXTDET_DET = 1
13
+ TEXTDET_INFERENCE = 2
14
+
15
+ class double_conv_up_c3(nn.Module):
16
+ def __init__(self, in_ch, mid_ch, out_ch, act=True):
17
+ super(double_conv_up_c3, self).__init__()
18
+ self.conv = nn.Sequential(
19
+ C3(in_ch+mid_ch, mid_ch, act=act),
20
+ nn.ConvTranspose2d(mid_ch, out_ch, kernel_size=4, stride = 2, padding=1, bias=False),
21
+ nn.BatchNorm2d(out_ch),
22
+ nn.ReLU(inplace=True),
23
+ )
24
+
25
+ def forward(self, x):
26
+ return self.conv(x)
27
+
28
+ class double_conv_c3(nn.Module):
29
+ def __init__(self, in_ch, out_ch, stride=1, act=True):
30
+ super(double_conv_c3, self).__init__()
31
+ if stride > 1:
32
+ self.down = nn.AvgPool2d(2,stride=2) if stride > 1 else None
33
+ self.conv = C3(in_ch, out_ch, act=act)
34
+
35
+ def forward(self, x):
36
+ if self.down is not None:
37
+ x = self.down(x)
38
+ x = self.conv(x)
39
+ return x
40
+
41
+ class UnetHead(nn.Module):
42
+ def __init__(self, act=True) -> None:
43
+
44
+ super(UnetHead, self).__init__()
45
+ self.down_conv1 = double_conv_c3(512, 512, 2, act=act)
46
+ self.upconv0 = double_conv_up_c3(0, 512, 256, act=act)
47
+ self.upconv2 = double_conv_up_c3(256, 512, 256, act=act)
48
+ self.upconv3 = double_conv_up_c3(0, 512, 256, act=act)
49
+ self.upconv4 = double_conv_up_c3(128, 256, 128, act=act)
50
+ self.upconv5 = double_conv_up_c3(64, 128, 64, act=act)
51
+ self.upconv6 = nn.Sequential(
52
+ nn.ConvTranspose2d(64, 1, kernel_size=4, stride = 2, padding=1, bias=False),
53
+ nn.Sigmoid()
54
+ )
55
+
56
+ def forward(self, f160, f80, f40, f20, f3, forward_mode=TEXTDET_MASK):
57
+ # input: 640@3
58
+ d10 = self.down_conv1(f3) # 512@10
59
+ u20 = self.upconv0(d10) # 256@10
60
+ u40 = self.upconv2(torch.cat([f20, u20], dim = 1)) # 256@40
61
+
62
+ if forward_mode == TEXTDET_DET:
63
+ return f80, f40, u40
64
+ else:
65
+ u80 = self.upconv3(torch.cat([f40, u40], dim = 1)) # 256@80
66
+ u160 = self.upconv4(torch.cat([f80, u80], dim = 1)) # 128@160
67
+ u320 = self.upconv5(torch.cat([f160, u160], dim = 1)) # 64@320
68
+ mask = self.upconv6(u320)
69
+ if forward_mode == TEXTDET_MASK:
70
+ return mask
71
+ else:
72
+ return mask, [f80, f40, u40]
73
+
74
+ def init_weight(self, init_func):
75
+ self.apply(init_func)
76
+
77
+ class DBHead(nn.Module):
78
+ def __init__(self, in_channels, k = 50, shrink_with_sigmoid=True, act=True):
79
+ super().__init__()
80
+ self.k = k
81
+ self.shrink_with_sigmoid = shrink_with_sigmoid
82
+ self.upconv3 = double_conv_up_c3(0, 512, 256, act=act)
83
+ self.upconv4 = double_conv_up_c3(128, 256, 128, act=act)
84
+ self.conv = nn.Sequential(
85
+ nn.Conv2d(128, in_channels, 1),
86
+ nn.BatchNorm2d(in_channels),
87
+ nn.ReLU(inplace=True)
88
+ )
89
+ self.binarize = nn.Sequential(
90
+ nn.Conv2d(in_channels, in_channels // 4, 3, padding=1),
91
+ nn.BatchNorm2d(in_channels // 4),
92
+ nn.ReLU(inplace=True),
93
+ nn.ConvTranspose2d(in_channels // 4, in_channels // 4, 2, 2),
94
+ nn.BatchNorm2d(in_channels // 4),
95
+ nn.ReLU(inplace=True),
96
+ nn.ConvTranspose2d(in_channels // 4, 1, 2, 2)
97
+ )
98
+ self.thresh = self._init_thresh(in_channels)
99
+
100
+ def forward(self, f80, f40, u40, shrink_with_sigmoid=True, step_eval=False):
101
+ shrink_with_sigmoid = self.shrink_with_sigmoid
102
+ u80 = self.upconv3(torch.cat([f40, u40], dim = 1)) # 256@80
103
+ x = self.upconv4(torch.cat([f80, u80], dim = 1)) # 128@160
104
+ x = self.conv(x)
105
+ threshold_maps = self.thresh(x)
106
+ x = self.binarize(x)
107
+ shrink_maps = torch.sigmoid(x)
108
+
109
+ if self.training:
110
+ binary_maps = self.step_function(shrink_maps, threshold_maps)
111
+ if shrink_with_sigmoid:
112
+ return torch.cat((shrink_maps, threshold_maps, binary_maps), dim=1)
113
+ else:
114
+ return torch.cat((shrink_maps, threshold_maps, binary_maps, x), dim=1)
115
+ else:
116
+ if step_eval:
117
+ return self.step_function(shrink_maps, threshold_maps)
118
+ else:
119
+ return torch.cat((shrink_maps, threshold_maps), dim=1)
120
+
121
+ def init_weight(self, init_func):
122
+ self.apply(init_func)
123
+
124
+ def _init_thresh(self, inner_channels, serial=False, smooth=False, bias=False):
125
+ in_channels = inner_channels
126
+ if serial:
127
+ in_channels += 1
128
+ self.thresh = nn.Sequential(
129
+ nn.Conv2d(in_channels, inner_channels // 4, 3, padding=1, bias=bias),
130
+ nn.BatchNorm2d(inner_channels // 4),
131
+ nn.ReLU(inplace=True),
132
+ self._init_upsample(inner_channels // 4, inner_channels // 4, smooth=smooth, bias=bias),
133
+ nn.BatchNorm2d(inner_channels // 4),
134
+ nn.ReLU(inplace=True),
135
+ self._init_upsample(inner_channels // 4, 1, smooth=smooth, bias=bias),
136
+ nn.Sigmoid())
137
+ return self.thresh
138
+
139
+ def _init_upsample(self, in_channels, out_channels, smooth=False, bias=False):
140
+ if smooth:
141
+ inter_out_channels = out_channels
142
+ if out_channels == 1:
143
+ inter_out_channels = in_channels
144
+ module_list = [
145
+ nn.Upsample(scale_factor=2, mode='nearest'),
146
+ nn.Conv2d(in_channels, inter_out_channels, 3, 1, 1, bias=bias)]
147
+ if out_channels == 1:
148
+ module_list.append(nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=1, bias=True))
149
+ return nn.Sequential(module_list)
150
+ else:
151
+ return nn.ConvTranspose2d(in_channels, out_channels, 2, 2)
152
+
153
+ def step_function(self, x, y):
154
+ return torch.reciprocal(1 + torch.exp(-self.k * (x - y)))
155
+
156
+ class TextDetector(nn.Module):
157
+ def __init__(self, weights, map_location='cpu', forward_mode=TEXTDET_MASK, act=True):
158
+ super(TextDetector, self).__init__()
159
+
160
+ yolov5s_backbone = load_yolov5_ckpt(weights=weights, map_location=map_location)
161
+ yolov5s_backbone.eval()
162
+ out_indices = [1, 3, 5, 7, 9]
163
+ yolov5s_backbone.out_indices = out_indices
164
+ yolov5s_backbone.model = yolov5s_backbone.model[:max(out_indices)+1]
165
+ self.act = act
166
+ self.seg_net = UnetHead(act=act)
167
+ self.backbone = yolov5s_backbone
168
+ self.dbnet = None
169
+ self.forward_mode = forward_mode
170
+
171
+ def train_mask(self):
172
+ self.forward_mode = TEXTDET_MASK
173
+ self.backbone.eval()
174
+ self.seg_net.train()
175
+
176
+ def initialize_db(self, unet_weights):
177
+ self.dbnet = DBHead(64, act=self.act)
178
+ self.seg_net.load_state_dict(torch.load(unet_weights, map_location='cpu')['weights'])
179
+ self.dbnet.init_weight(init_weights)
180
+ self.dbnet.upconv3 = copy.deepcopy(self.seg_net.upconv3)
181
+ self.dbnet.upconv4 = copy.deepcopy(self.seg_net.upconv4)
182
+ del self.seg_net.upconv3
183
+ del self.seg_net.upconv4
184
+ del self.seg_net.upconv5
185
+ del self.seg_net.upconv6
186
+ # del self.seg_net.conv_mask
187
+
188
+ def train_db(self):
189
+ self.forward_mode = TEXTDET_DET
190
+ self.backbone.eval()
191
+ self.seg_net.eval()
192
+ self.dbnet.train()
193
+
194
+ def forward(self, x):
195
+ forward_mode = self.forward_mode
196
+ with torch.no_grad():
197
+ outs = self.backbone(x)
198
+ if forward_mode == TEXTDET_MASK:
199
+ return self.seg_net(*outs, forward_mode=forward_mode)
200
+ elif forward_mode == TEXTDET_DET:
201
+ with torch.no_grad():
202
+ outs = self.seg_net(*outs, forward_mode=forward_mode)
203
+ return self.dbnet(*outs)
204
+
205
+ def get_base_det_models(model_path, device='cpu', half=False, act='leaky'):
206
+ textdetector_dict = torch.load(model_path, map_location=device)
207
+ blk_det = load_yolov5_ckpt(textdetector_dict['blk_det'], map_location=device)
208
+ text_seg = UnetHead(act=act)
209
+ text_seg.load_state_dict(textdetector_dict['text_seg'])
210
+ text_det = DBHead(64, act=act)
211
+ text_det.load_state_dict(textdetector_dict['text_det'])
212
+ if half:
213
+ return blk_det.eval().half(), text_seg.eval().half(), text_det.eval().half()
214
+ return blk_det.eval().to(device), text_seg.eval().to(device), text_det.eval().to(device)
215
+
216
+ class TextDetBase(nn.Module):
217
+ def __init__(self, model_path, device='cpu', half=False, fuse=False, act='leaky'):
218
+ super(TextDetBase, self).__init__()
219
+ self.blk_det, self.text_seg, self.text_det = get_base_det_models(model_path, device, half, act=act)
220
+ if fuse:
221
+ self.fuse()
222
+
223
+ def fuse(self):
224
+ def _fuse(model):
225
+ for m in model.modules():
226
+ if isinstance(m, (Conv)) and hasattr(m, 'bn'):
227
+ m.conv = fuse_conv_and_bn(m.conv, m.bn) # update conv
228
+ delattr(m, 'bn') # remove batchnorm
229
+ m.forward = m.forward_fuse # update forward
230
+ return model
231
+ self.text_seg = _fuse(self.text_seg)
232
+ self.text_det = _fuse(self.text_det)
233
+
234
+ def forward(self, features):
235
+ blks, features = self.blk_det(features, detect=True)
236
+ mask, features = self.text_seg(*features, forward_mode=TEXTDET_INFERENCE)
237
+ lines = self.text_det(*features, step_eval=False)
238
+ return blks[0], mask, lines
239
+
240
+ class TextDetBaseDNN:
241
+ def __init__(self, input_size, model_path):
242
+ self.input_size = input_size
243
+ self.model = cv2.dnn.readNetFromONNX(model_path)
244
+ self.uoln = self.model.getUnconnectedOutLayersNames()
245
+
246
+ def __call__(self, im_in):
247
+ blob = cv2.dnn.blobFromImage(im_in, scalefactor=1 / 255.0, size=(self.input_size, self.input_size))
248
+ self.model.setInput(blob)
249
+ blks, mask, lines_map = self.model.forward(self.uoln)
250
+ return blks, mask, lines_map
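
For orientation, here is a minimal sketch of how the ONNX branch above (`TextDetBaseDNN`) could be driven. The model path, input size, and the use of `letterbox` from `ctd_utils/utils/imgproc_utils.py` (added later in this diff) are illustrative assumptions, not part of the commit.

```python
import cv2

# Hypothetical file names and size; substitute the actual exported ONNX model.
MODEL_PATH = 'models/comictextdetector.onnx'
INPUT_SIZE = 1024

detector = TextDetBaseDNN(INPUT_SIZE, MODEL_PATH)

# The DNN path expects a square BGR image of INPUT_SIZE x INPUT_SIZE,
# so the page is resized/padded first (see imgproc_utils.letterbox below).
img = cv2.imread('page.jpg')
img_in, ratio, (dw, dh) = letterbox(img, new_shape=INPUT_SIZE, auto=False)

blks, mask, lines_map = detector(img_in)
print(blks.shape, mask.shape, lines_map.shape)
```

In the real pipeline the raw `blks` output would still need to be decoded (e.g. with `non_max_suppression` from `yolov5_utils.py` below) before the boxes are usable.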
manga_translator/detection/ctd_utils/textmask.py ADDED
@@ -0,0 +1,174 @@
1
+ from typing import List
2
+ import cv2
3
+ import numpy as np
4
+
5
+ from .utils.imgproc_utils import union_area, enlarge_window
6
+ from ...utils import TextBlock, Quadrilateral
7
+
8
+ WHITE = (255, 255, 255)
9
+ BLACK = (0, 0, 0)
10
+ LANG_ENG = 0
11
+ LANG_JPN = 1
12
+
13
+ REFINEMASK_INPAINT = 0
14
+ REFINEMASK_ANNOTATION = 1
15
+
16
+ def get_topk_color(color_list, bins, k=3, color_var=10, bin_tol=0.001):
17
+ idx = np.argsort(bins * -1)
18
+ color_list, bins = color_list[idx], bins[idx]
19
+ top_colors = [color_list[0]]
20
+ bin_tol = np.sum(bins) * bin_tol
21
+ if len(color_list) > 1:
22
+ for color, bin in zip(color_list[1:], bins[1:]):
23
+ if np.abs(np.array(top_colors) - color).min() > color_var:
24
+ top_colors.append(color)
25
+ if len(top_colors) >= k or bin < bin_tol:
26
+ break
27
+ return top_colors
28
+
29
+ def minxor_thresh(threshed, mask, dilate=False):
30
+ neg_threshed = 255 - threshed
31
+ e_size = 1
32
+ if dilate:
33
+ element = cv2.getStructuringElement(cv2.MORPH_RECT, (2 * e_size + 1, 2 * e_size + 1),(e_size, e_size))
34
+ neg_threshed = cv2.dilate(neg_threshed, element, iterations=1)
35
+ threshed = cv2.dilate(threshed, element, iterations=1)
36
+ neg_xor_sum = cv2.bitwise_xor(neg_threshed, mask).sum()
37
+ xor_sum = cv2.bitwise_xor(threshed, mask).sum()
38
+ if neg_xor_sum < xor_sum:
39
+ return neg_threshed, neg_xor_sum
40
+ else:
41
+ return threshed, xor_sum
42
+
43
+ def get_otsuthresh_masklist(img, pred_mask, per_channel=False) -> List[np.ndarray]:
44
+ channels = [img[..., 0], img[..., 1], img[..., 2]]
45
+ mask_list = []
46
+ for c in channels:
47
+ _, threshed = cv2.threshold(c, 1, 255, cv2.THRESH_OTSU+cv2.THRESH_BINARY)
48
+ threshed, xor_sum = minxor_thresh(threshed, pred_mask, dilate=False)
49
+ mask_list.append([threshed, xor_sum])
50
+ mask_list.sort(key=lambda x: x[1])
51
+ if per_channel:
52
+ return mask_list
53
+ else:
54
+ return [mask_list[0]]
55
+
56
+ def get_topk_masklist(im_grey, pred_mask):
57
+ if len(im_grey.shape) == 3 and im_grey.shape[-1] == 3:
58
+ im_grey = cv2.cvtColor(im_grey, cv2.COLOR_BGR2GRAY)
59
+ msk = np.ascontiguousarray(pred_mask)
60
+ candidate_grey_px = im_grey[np.where(cv2.erode(msk, np.ones((3,3), np.uint8), iterations=1) > 127)]
61
+ bin, his = np.histogram(candidate_grey_px, bins=255)
62
+ topk_color = get_topk_color(his, bin, color_var=10, k=3)
63
+ color_range = 30
64
+ mask_list = list()
65
+ for ii, color in enumerate(topk_color):
66
+ c_top = min(color+color_range, 255)
67
+ c_bottom = c_top - 2 * color_range
68
+ threshed = cv2.inRange(im_grey, c_bottom, c_top)
69
+ threshed, xor_sum = minxor_thresh(threshed, msk)
70
+ mask_list.append([threshed, xor_sum])
71
+ return mask_list
72
+
73
+ def merge_mask_list(mask_list, pred_mask, blk: Quadrilateral = None, pred_thresh=30, text_window=None, filter_with_lines=False, refine_mode=REFINEMASK_INPAINT):
74
+ mask_list.sort(key=lambda x: x[1])
75
+ linemask = None
76
+ if blk is not None and filter_with_lines:
77
+ linemask = np.zeros_like(pred_mask)
78
+ lines = blk.pts.astype(np.int64)
79
+ for line in lines:
80
+ line[..., 0] -= text_window[0]
81
+ line[..., 1] -= text_window[1]
82
+ cv2.fillPoly(linemask, [line], 255)
83
+ linemask = cv2.dilate(linemask, np.ones((3, 3), np.uint8), iterations=3)
84
+
85
+ if pred_thresh > 0:
86
+ e_size = 1
87
+ element = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2 * e_size + 1, 2 * e_size + 1),(e_size, e_size))
88
+ pred_mask = cv2.erode(pred_mask, element, iterations=1)
89
+ _, pred_mask = cv2.threshold(pred_mask, 60, 255, cv2.THRESH_BINARY)
90
+ connectivity = 8
91
+ mask_merged = np.zeros_like(pred_mask)
92
+ for ii, (candidate_mask, xor_sum) in enumerate(mask_list):
93
+ num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(candidate_mask, connectivity, cv2.CV_16U)
94
+ for label_index, stat, centroid in zip(range(num_labels), stats, centroids):
95
+ if label_index != 0: # skip background label
96
+ x, y, w, h, area = stat
97
+ if w * h < 3:
98
+ continue
99
+ x1, y1, x2, y2 = x, y, x+w, y+h
100
+ label_local = labels[y1: y2, x1: x2]
101
+ label_coordinates = np.where(label_local==label_index)
102
+ tmp_merged = np.zeros_like(label_local, np.uint8)
103
+ tmp_merged[label_coordinates] = 255
104
+ tmp_merged = cv2.bitwise_or(mask_merged[y1: y2, x1: x2], tmp_merged)
105
+ xor_merged = cv2.bitwise_xor(tmp_merged, pred_mask[y1: y2, x1: x2]).sum()
106
+ xor_origin = cv2.bitwise_xor(mask_merged[y1: y2, x1: x2], pred_mask[y1: y2, x1: x2]).sum()
107
+ if xor_merged < xor_origin:
108
+ mask_merged[y1: y2, x1: x2] = tmp_merged
109
+
110
+ if refine_mode == REFINEMASK_INPAINT:
111
+ mask_merged = cv2.dilate(mask_merged, np.ones((5, 5), np.uint8), iterations=1)
112
+ # fill holes
113
+ num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(255-mask_merged, connectivity, cv2.CV_16U)
114
+ sorted_area = np.sort(stats[:, -1])
115
+ if len(sorted_area) > 1:
116
+ area_thresh = sorted_area[-2]
117
+ else:
118
+ area_thresh = sorted_area[-1]
119
+ for label_index, stat, centroid in zip(range(num_labels), stats, centroids):
120
+ x, y, w, h, area = stat
121
+ if area < area_thresh:
122
+ x1, y1, x2, y2 = x, y, x+w, y+h
123
+ label_local = labels[y1: y2, x1: x2]
124
+ label_coordinates = np.where(label_local==label_index)
125
+ tmp_merged = np.zeros_like(label_local, np.uint8)
126
+ tmp_merged[label_coordinates] = 255
127
+ tmp_merged = cv2.bitwise_or(mask_merged[y1: y2, x1: x2], tmp_merged)
128
+ xor_merged = cv2.bitwise_xor(tmp_merged, pred_mask[y1: y2, x1: x2]).sum()
129
+ xor_origin = cv2.bitwise_xor(mask_merged[y1: y2, x1: x2], pred_mask[y1: y2, x1: x2]).sum()
130
+ if xor_merged < xor_origin:
131
+ mask_merged[y1: y2, x1: x2] = tmp_merged
132
+ return mask_merged
133
+
134
+
135
+ def refine_undetected_mask(img: np.ndarray, mask_pred: np.ndarray, mask_refined: np.ndarray, blk_list: List[TextBlock], refine_mode=REFINEMASK_INPAINT):
136
+ mask_pred[np.where(mask_refined > 30)] = 0
137
+ _, pred_mask_t = cv2.threshold(mask_pred, 30, 255, cv2.THRESH_BINARY)
138
+ num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(pred_mask_t, 4, cv2.CV_16U)
139
+ valid_labels = np.where(stats[:, -1] > 50)[0]
140
+ seg_blk_list = []
141
+ if len(valid_labels) > 0:
142
+ for lab_index in valid_labels[1:]:
143
+ x, y, w, h, area = stats[lab_index]
144
+ bx1, by1 = x, y
145
+ bx2, by2 = x+w, y+h
146
+ bbox = [bx1, by1, bx2, by2]
147
+ bbox_score = -1
148
+ for blk in blk_list:
149
+ bbox_s = union_area(blk.xyxy, bbox)
150
+ if bbox_s > bbox_score:
151
+ bbox_score = bbox_s
152
+ if bbox_score / w / h < 0.5:
153
+ seg_blk_list.append(TextBlock(bbox))
154
+ if len(seg_blk_list) > 0:
155
+ mask_refined = cv2.bitwise_or(mask_refined, refine_mask(img, mask_pred, seg_blk_list, refine_mode=refine_mode))
156
+ return mask_refined
157
+
158
+ def refine_mask(img: np.ndarray, pred_mask: np.ndarray, blk_list: List[Quadrilateral], refine_mode: int = REFINEMASK_INPAINT) -> np.ndarray:
159
+ mask_refined = np.zeros_like(pred_mask)
160
+ for blk in blk_list:
161
+ bx1, by1, bx2, by2 = enlarge_window(blk.xyxy, img.shape[1], img.shape[0])
162
+ im = np.ascontiguousarray(img[by1: by2, bx1: bx2])
163
+ msk = np.ascontiguousarray(pred_mask[by1: by2, bx1: bx2])
164
+
165
+ mask_list = get_topk_masklist(im, msk)
166
+ mask_list += get_otsuthresh_masklist(im, msk, per_channel=False)
167
+ mask_merged = merge_mask_list(mask_list, msk, blk=blk, text_window=[bx1, by1, bx2, by2], refine_mode=refine_mode)
168
+ mask_refined[by1: by2, bx1: bx2] = cv2.bitwise_or(mask_refined[by1: by2, bx1: bx2], mask_merged)
169
+ # cv2.imshow('im', im)
170
+ # cv2.imshow('msk', msk)
171
+ # cv2.imshow('mask_refined', mask_refined[by1: by2, bx1: bx2])
172
+ # cv2.waitKey(0)
173
+
174
+ return mask_refined
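
A rough usage sketch for the mask-refinement helpers above. The file names and the `SimpleNamespace` stand-in are purely illustrative; in the actual pipeline `blk_list` holds the detector's `TextBlock`/`Quadrilateral` objects.

```python
import cv2
import numpy as np
from types import SimpleNamespace

img = cv2.imread('page.jpg')                               # BGR page (hypothetical path)
mask_pred = cv2.imread('mask.png', cv2.IMREAD_GRAYSCALE)   # raw mask from the detector

# refine_mask only touches .xyxy here (.pts is used when filter_with_lines=True),
# so a SimpleNamespace can stand in for one detected text block.
blk = SimpleNamespace(xyxy=[40, 60, 320, 180],
                      pts=np.array([[[40, 60], [320, 60], [320, 180], [40, 180]]]))

mask_refined = refine_mask(img, mask_pred, [blk], refine_mode=REFINEMASK_INPAINT)
mask_refined = refine_undetected_mask(img, mask_pred, mask_refined, [blk])
cv2.imwrite('mask_refined.png', mask_refined)
```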
manga_translator/detection/ctd_utils/utils/db_utils.py ADDED
@@ -0,0 +1,706 @@
1
+ import cv2
2
+ import numpy as np
3
+ import pyclipper
4
+ from shapely.geometry import Polygon
5
+ from collections import namedtuple
6
+ import warnings
7
+ import torch
8
+ warnings.filterwarnings('ignore')
9
+
10
+
11
+ def iou_rotate(box_a, box_b, method='union'):
12
+ rect_a = cv2.minAreaRect(box_a)
13
+ rect_b = cv2.minAreaRect(box_b)
14
+ r1 = cv2.rotatedRectangleIntersection(rect_a, rect_b)
15
+ if r1[0] == 0:
16
+ return 0
17
+ else:
18
+ inter_area = cv2.contourArea(r1[1])
19
+ area_a = cv2.contourArea(box_a)
20
+ area_b = cv2.contourArea(box_b)
21
+ union_area = area_a + area_b - inter_area
22
+ if union_area == 0 or inter_area == 0:
23
+ return 0
24
+ if method == 'union':
25
+ iou = inter_area / union_area
26
+ elif method == 'intersection':
27
+ iou = inter_area / min(area_a, area_b)
28
+ else:
29
+ raise NotImplementedError
30
+ return iou
31
+
32
+ class SegDetectorRepresenter():
33
+ def __init__(self, thresh=0.3, box_thresh=0.7, max_candidates=1000, unclip_ratio=1.5):
34
+ self.min_size = 3
35
+ self.thresh = thresh
36
+ self.box_thresh = box_thresh
37
+ self.max_candidates = max_candidates
38
+ self.unclip_ratio = unclip_ratio
39
+
40
+ def __call__(self, batch, pred, is_output_polygon=False, height=None, width=None):
41
+ '''
42
+ batch: (image, polygons, ignore_tags)
43
+ batch: a dict produced by dataloaders.
44
+ image: tensor of shape (N, C, H, W).
45
+ polygons: tensor of shape (N, K, 4, 2), the polygons of objective regions.
46
+ ignore_tags: tensor of shape (N, K), indicates whether a region is ignorable or not.
47
+ shape: the original shape of images.
48
+ filename: the original filenames of images.
49
+ pred:
50
+ binary: text region segmentation map, with shape (N, H, W)
51
+ thresh: [if exists] threshold prediction with shape (N, H, W)
52
+ thresh_binary: [if exists] binarized with threshold, (N, H, W)
53
+ '''
54
+ pred = pred[:, 0, :, :]
55
+ segmentation = self.binarize(pred)
56
+ boxes_batch = []
57
+ scores_batch = []
58
+ # print(pred.size())
59
+ batch_size = pred.size(0) if isinstance(pred, torch.Tensor) else pred.shape[0]
60
+
61
+ if height is None:
62
+ height = pred.shape[1]
63
+ if width is None:
64
+ width = pred.shape[2]
65
+
66
+ for batch_index in range(batch_size):
67
+ if is_output_polygon:
68
+ boxes, scores = self.polygons_from_bitmap(pred[batch_index], segmentation[batch_index], width, height)
69
+ else:
70
+ boxes, scores = self.boxes_from_bitmap(pred[batch_index], segmentation[batch_index], width, height)
71
+ boxes_batch.append(boxes)
72
+ scores_batch.append(scores)
73
+ return boxes_batch, scores_batch
74
+
75
+ def binarize(self, pred) -> np.ndarray:
76
+ return pred > self.thresh
77
+
78
+ def polygons_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
79
+ '''
80
+ _bitmap: single map with shape (H, W),
81
+ whose values are binarized as {0, 1}
82
+ '''
83
+
84
+ assert len(_bitmap.shape) == 2
85
+ bitmap = _bitmap.cpu().numpy() # The first channel
86
+ pred = pred.cpu().detach().numpy()
87
+ height, width = bitmap.shape
88
+ boxes = []
89
+ scores = []
90
+
91
+ contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
92
+
93
+ for contour in contours[:self.max_candidates]:
94
+ epsilon = 0.005 * cv2.arcLength(contour, True)
95
+ approx = cv2.approxPolyDP(contour, epsilon, True)
96
+ points = approx.reshape((-1, 2))
97
+ if points.shape[0] < 4:
98
+ continue
99
+ # _, sside = self.get_mini_boxes(contour)
100
+ # if sside < self.min_size:
101
+ # continue
102
+ score = self.box_score_fast(pred, contour.squeeze(1))
103
+ if self.box_thresh > score:
104
+ continue
105
+
106
+ if points.shape[0] > 2:
107
+ box = self.unclip(points, unclip_ratio=self.unclip_ratio)
108
+ if len(box) > 1:
109
+ continue
110
+ else:
111
+ continue
112
+ box = box.reshape(-1, 2)
113
+ _, sside = self.get_mini_boxes(box.reshape((-1, 1, 2)))
114
+ if sside < self.min_size + 2:
115
+ continue
116
+
117
+ if not isinstance(dest_width, int):
118
+ dest_width = dest_width.item()
119
+ dest_height = dest_height.item()
120
+
121
+ box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
122
+ box[:, 1] = np.clip(np.round(box[:, 1] / height * dest_height), 0, dest_height)
123
+ boxes.append(box)
124
+ scores.append(score)
125
+ return boxes, scores
126
+
127
+ def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
128
+ '''
129
+ _bitmap: single map with shape (H, W),
130
+ whose values are binarized as {0, 1}
131
+ '''
132
+
133
+ assert len(_bitmap.shape) == 2
134
+ if isinstance(pred, torch.Tensor):
135
+ bitmap = _bitmap.cpu().numpy() # The first channel
136
+ pred = pred.cpu().detach().numpy()
137
+ else:
138
+ bitmap = _bitmap
139
+ # cv2.imwrite('tmp.png', (bitmap*255).astype(np.uint8))
140
+ height, width = bitmap.shape
141
+ contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
142
+ num_contours = min(len(contours), self.max_candidates)
143
+ boxes = np.zeros((num_contours, 4, 2), dtype=np.int16)
144
+ scores = np.zeros((num_contours,), dtype=np.float32)
145
+
146
+ for index in range(num_contours):
147
+ contour = contours[index].squeeze(1)
148
+ points, sside = self.get_mini_boxes(contour)
149
+ # if sside < self.min_size:
150
+ # continue
151
+ if sside < 2:
152
+ continue
153
+ points = np.array(points)
154
+ score = self.box_score_fast(pred, contour)
155
+ # if self.box_thresh > score:
156
+ # continue
157
+
158
+ box = self.unclip(points, unclip_ratio=self.unclip_ratio).reshape(-1, 1, 2)
159
+ box, sside = self.get_mini_boxes(box)
160
+ # if sside < 5:
161
+ # continue
162
+ box = np.array(box)
163
+ if not isinstance(dest_width, int):
164
+ dest_width = dest_width.item()
165
+ dest_height = dest_height.item()
166
+
167
+ box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
168
+ box[:, 1] = np.clip(np.round(box[:, 1] / height * dest_height), 0, dest_height)
169
+ boxes[index, :, :] = box.astype(np.int16)
170
+ scores[index] = score
171
+ return boxes, scores
172
+
173
+ def unclip(self, box, unclip_ratio=1.5):
174
+ poly = Polygon(box)
175
+ distance = poly.area * unclip_ratio / poly.length
176
+ offset = pyclipper.PyclipperOffset()
177
+ offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
178
+ expanded = np.array(offset.Execute(distance))
179
+ return expanded
180
+
181
+ def get_mini_boxes(self, contour):
182
+ bounding_box = cv2.minAreaRect(contour)
183
+ points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])
184
+
185
+ index_1, index_2, index_3, index_4 = 0, 1, 2, 3
186
+ if points[1][1] > points[0][1]:
187
+ index_1 = 0
188
+ index_4 = 1
189
+ else:
190
+ index_1 = 1
191
+ index_4 = 0
192
+ if points[3][1] > points[2][1]:
193
+ index_2 = 2
194
+ index_3 = 3
195
+ else:
196
+ index_2 = 3
197
+ index_3 = 2
198
+
199
+ box = [points[index_1], points[index_2], points[index_3], points[index_4]]
200
+ return box, min(bounding_box[1])
201
+
202
+ def box_score_fast(self, bitmap, _box):
203
+ h, w = bitmap.shape[:2]
204
+ box = _box.copy()
205
+ xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int32), 0, w - 1)
206
+ xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int32), 0, w - 1)
207
+ ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int32), 0, h - 1)
208
+ ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int32), 0, h - 1)
209
+
210
+ mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
211
+ box[:, 0] = box[:, 0] - xmin
212
+ box[:, 1] = box[:, 1] - ymin
213
+ cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
214
+ if bitmap.dtype == np.float16:
215
+ bitmap = bitmap.astype(np.float32)
216
+ return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
217
+
218
+ class AverageMeter(object):
219
+ """Computes and stores the average and current value"""
220
+
221
+ def __init__(self):
222
+ self.reset()
223
+
224
+ def reset(self):
225
+ self.val = 0
226
+ self.avg = 0
227
+ self.sum = 0
228
+ self.count = 0
229
+
230
+ def update(self, val, n=1):
231
+ self.val = val
232
+ self.sum += val * n
233
+ self.count += n
234
+ self.avg = self.sum / self.count
235
+ return self
236
+
237
+
238
+ class DetectionIoUEvaluator(object):
239
+ def __init__(self, is_output_polygon=False, iou_constraint=0.5, area_precision_constraint=0.5):
240
+ self.is_output_polygon = is_output_polygon
241
+ self.iou_constraint = iou_constraint
242
+ self.area_precision_constraint = area_precision_constraint
243
+
244
+ def evaluate_image(self, gt, pred):
245
+
246
+ def get_union(pD, pG):
247
+ return Polygon(pD).union(Polygon(pG)).area
248
+
249
+ def get_intersection_over_union(pD, pG):
250
+ return get_intersection(pD, pG) / get_union(pD, pG)
251
+
252
+ def get_intersection(pD, pG):
253
+ return Polygon(pD).intersection(Polygon(pG)).area
254
+
255
+ def compute_ap(confList, matchList, numGtCare):
256
+ correct = 0
257
+ AP = 0
258
+ if len(confList) > 0:
259
+ confList = np.array(confList)
260
+ matchList = np.array(matchList)
261
+ sorted_ind = np.argsort(-confList)
262
+ confList = confList[sorted_ind]
263
+ matchList = matchList[sorted_ind]
264
+ for n in range(len(confList)):
265
+ match = matchList[n]
266
+ if match:
267
+ correct += 1
268
+ AP += float(correct) / (n + 1)
269
+
270
+ if numGtCare > 0:
271
+ AP /= numGtCare
272
+
273
+ return AP
274
+
275
+ perSampleMetrics = {}
276
+
277
+ matchedSum = 0
278
+
279
+ Rectangle = namedtuple('Rectangle', 'xmin ymin xmax ymax')
280
+
281
+ numGlobalCareGt = 0
282
+ numGlobalCareDet = 0
283
+
284
+ arrGlobalConfidences = []
285
+ arrGlobalMatches = []
286
+
287
+ recall = 0
288
+ precision = 0
289
+ hmean = 0
290
+
291
+ detMatched = 0
292
+
293
+ iouMat = np.empty([1, 1])
294
+
295
+ gtPols = []
296
+ detPols = []
297
+
298
+ gtPolPoints = []
299
+ detPolPoints = []
300
+
301
+ # Array of Ground Truth Polygons' keys marked as don't Care
302
+ gtDontCarePolsNum = []
303
+ # Array of Detected Polygons' matched with a don't Care GT
304
+ detDontCarePolsNum = []
305
+
306
+ pairs = []
307
+ detMatchedNums = []
308
+
309
+ arrSampleConfidences = []
310
+ arrSampleMatch = []
311
+
312
+ evaluationLog = ""
313
+
314
+ for n in range(len(gt)):
315
+ points = gt[n]['points']
316
+ # transcription = gt[n]['text']
317
+ dontCare = gt[n]['ignore']
318
+
319
+ if not Polygon(points).is_valid or not Polygon(points).is_simple:
320
+ continue
321
+
322
+ gtPol = points
323
+ gtPols.append(gtPol)
324
+ gtPolPoints.append(points)
325
+ if dontCare:
326
+ gtDontCarePolsNum.append(len(gtPols) - 1)
327
+
328
+ evaluationLog += "GT polygons: " + str(len(gtPols)) + (" (" + str(len(
329
+ gtDontCarePolsNum)) + " don't care)\n" if len(gtDontCarePolsNum) > 0 else "\n")
330
+
331
+ for n in range(len(pred)):
332
+ points = pred[n]['points']
333
+ if not Polygon(points).is_valid or not Polygon(points).is_simple:
334
+ continue
335
+
336
+ detPol = points
337
+ detPols.append(detPol)
338
+ detPolPoints.append(points)
339
+ if len(gtDontCarePolsNum) > 0:
340
+ for dontCarePol in gtDontCarePolsNum:
341
+ dontCarePol = gtPols[dontCarePol]
342
+ intersected_area = get_intersection(dontCarePol, detPol)
343
+ pdDimensions = Polygon(detPol).area
344
+ precision = 0 if pdDimensions == 0 else intersected_area / pdDimensions
345
+ if (precision > self.area_precision_constraint):
346
+ detDontCarePolsNum.append(len(detPols) - 1)
347
+ break
348
+
349
+ evaluationLog += "DET polygons: " + str(len(detPols)) + (" (" + str(len(
350
+ detDontCarePolsNum)) + " don't care)\n" if len(detDontCarePolsNum) > 0 else "\n")
351
+
352
+ if len(gtPols) > 0 and len(detPols) > 0:
353
+ # Calculate IoU and precision matrices
354
+ outputShape = [len(gtPols), len(detPols)]
355
+ iouMat = np.empty(outputShape)
356
+ gtRectMat = np.zeros(len(gtPols), np.int8)
357
+ detRectMat = np.zeros(len(detPols), np.int8)
358
+ if self.is_output_polygon:
359
+ for gtNum in range(len(gtPols)):
360
+ for detNum in range(len(detPols)):
361
+ pG = gtPols[gtNum]
362
+ pD = detPols[detNum]
363
+ iouMat[gtNum, detNum] = get_intersection_over_union(pD, pG)
364
+ else:
365
+ # gtPols = np.float32(gtPols)
366
+ # detPols = np.float32(detPols)
367
+ for gtNum in range(len(gtPols)):
368
+ for detNum in range(len(detPols)):
369
+ pG = np.float32(gtPols[gtNum])
370
+ pD = np.float32(detPols[detNum])
371
+ iouMat[gtNum, detNum] = iou_rotate(pD, pG)
372
+ for gtNum in range(len(gtPols)):
373
+ for detNum in range(len(detPols)):
374
+ if gtRectMat[gtNum] == 0 and detRectMat[
375
+ detNum] == 0 and gtNum not in gtDontCarePolsNum and detNum not in detDontCarePolsNum:
376
+ if iouMat[gtNum, detNum] > self.iou_constraint:
377
+ gtRectMat[gtNum] = 1
378
+ detRectMat[detNum] = 1
379
+ detMatched += 1
380
+ pairs.append({'gt': gtNum, 'det': detNum})
381
+ detMatchedNums.append(detNum)
382
+ evaluationLog += "Match GT #" + \
383
+ str(gtNum) + " with Det #" + str(detNum) + "\n"
384
+
385
+ numGtCare = (len(gtPols) - len(gtDontCarePolsNum))
386
+ numDetCare = (len(detPols) - len(detDontCarePolsNum))
387
+ if numGtCare == 0:
388
+ recall = float(1)
389
+ precision = float(0) if numDetCare > 0 else float(1)
390
+ else:
391
+ recall = float(detMatched) / numGtCare
392
+ precision = 0 if numDetCare == 0 else float(
393
+ detMatched) / numDetCare
394
+
395
+ hmean = 0 if (precision + recall) == 0 else 2.0 * \
396
+ precision * recall / (precision + recall)
397
+
398
+ matchedSum += detMatched
399
+ numGlobalCareGt += numGtCare
400
+ numGlobalCareDet += numDetCare
401
+
402
+ perSampleMetrics = {
403
+ 'precision': precision,
404
+ 'recall': recall,
405
+ 'hmean': hmean,
406
+ 'pairs': pairs,
407
+ 'iouMat': [] if len(detPols) > 100 else iouMat.tolist(),
408
+ 'gtPolPoints': gtPolPoints,
409
+ 'detPolPoints': detPolPoints,
410
+ 'gtCare': numGtCare,
411
+ 'detCare': numDetCare,
412
+ 'gtDontCare': gtDontCarePolsNum,
413
+ 'detDontCare': detDontCarePolsNum,
414
+ 'detMatched': detMatched,
415
+ 'evaluationLog': evaluationLog
416
+ }
417
+
418
+ return perSampleMetrics
419
+
420
+ def combine_results(self, results):
421
+ numGlobalCareGt = 0
422
+ numGlobalCareDet = 0
423
+ matchedSum = 0
424
+ for result in results:
425
+ numGlobalCareGt += result['gtCare']
426
+ numGlobalCareDet += result['detCare']
427
+ matchedSum += result['detMatched']
428
+
429
+ methodRecall = 0 if numGlobalCareGt == 0 else float(
430
+ matchedSum) / numGlobalCareGt
431
+ methodPrecision = 0 if numGlobalCareDet == 0 else float(
432
+ matchedSum) / numGlobalCareDet
433
+ methodHmean = 0 if methodRecall + methodPrecision == 0 else 2 * \
434
+ methodRecall * methodPrecision / (
435
+ methodRecall + methodPrecision)
436
+
437
+ methodMetrics = {'precision': methodPrecision,
438
+ 'recall': methodRecall, 'hmean': methodHmean}
439
+
440
+ return methodMetrics
441
+
442
+ class QuadMetric():
443
+ def __init__(self, is_output_polygon=False):
444
+ self.is_output_polygon = is_output_polygon
445
+ self.evaluator = DetectionIoUEvaluator(is_output_polygon=is_output_polygon)
446
+
447
+ def measure(self, batch, output, box_thresh=0.6):
448
+ '''
449
+ batch: (image, polygons, ignore_tags)
450
+ batch: a dict produced by dataloaders.
451
+ image: tensor of shape (N, C, H, W).
452
+ polygons: tensor of shape (N, K, 4, 2), the polygons of objective regions.
453
+ ignore_tags: tensor of shape (N, K), indicates whether a region is ignorable or not.
454
+ shape: the original shape of images.
455
+ filename: the original filenames of images.
456
+ output: (polygons, ...)
457
+ '''
458
+ results = []
459
+ gt_polyons_batch = batch['text_polys']
460
+ ignore_tags_batch = batch['ignore_tags']
461
+ pred_polygons_batch = np.array(output[0])
462
+ pred_scores_batch = np.array(output[1])
463
+ for polygons, pred_polygons, pred_scores, ignore_tags in zip(gt_polyons_batch, pred_polygons_batch, pred_scores_batch, ignore_tags_batch):
464
+ gt = [dict(points=np.int64(polygons[i]), ignore=ignore_tags[i]) for i in range(len(polygons))]
465
+ if self.is_output_polygon:
466
+ pred = [dict(points=pred_polygons[i]) for i in range(len(pred_polygons))]
467
+ else:
468
+ pred = []
469
+ # print(pred_polygons.shape)
470
+ for i in range(pred_polygons.shape[0]):
471
+ if pred_scores[i] >= box_thresh:
472
+ # print(pred_polygons[i,:,:].tolist())
473
+ pred.append(dict(points=pred_polygons[i, :, :].astype(np.int32)))
474
+ # pred = [dict(points=pred_polygons[i,:,:].tolist()) if pred_scores[i] >= box_thresh for i in range(pred_polygons.shape[0])]
475
+ results.append(self.evaluator.evaluate_image(gt, pred))
476
+ return results
477
+
478
+ def validate_measure(self, batch, output, box_thresh=0.6):
479
+ return self.measure(batch, output, box_thresh)
480
+
481
+ def evaluate_measure(self, batch, output):
482
+ return self.measure(batch, output), np.linspace(0, batch['image'].shape[0]).tolist()
483
+
484
+ def gather_measure(self, raw_metrics):
485
+ raw_metrics = [image_metrics
486
+ for batch_metrics in raw_metrics
487
+ for image_metrics in batch_metrics]
488
+
489
+ result = self.evaluator.combine_results(raw_metrics)
490
+
491
+ precision = AverageMeter()
492
+ recall = AverageMeter()
493
+ fmeasure = AverageMeter()
494
+
495
+ precision.update(result['precision'], n=len(raw_metrics))
496
+ recall.update(result['recall'], n=len(raw_metrics))
497
+ fmeasure_score = 2 * precision.val * recall.val / (precision.val + recall.val + 1e-8)
498
+ fmeasure.update(fmeasure_score)
499
+
500
+ return {
501
+ 'precision': precision,
502
+ 'recall': recall,
503
+ 'fmeasure': fmeasure
504
+ }
505
+
506
+ def shrink_polygon_py(polygon, shrink_ratio):
507
+ """
508
+ Shrink the polygon about its centroid by shrink_ratio; scaling by 1/shrink_ratio restores it.
509
+ """
510
+ cx = polygon[:, 0].mean()
511
+ cy = polygon[:, 1].mean()
512
+ polygon[:, 0] = cx + (polygon[:, 0] - cx) * shrink_ratio
513
+ polygon[:, 1] = cy + (polygon[:, 1] - cy) * shrink_ratio
514
+ return polygon
515
+
516
+
517
+ def shrink_polygon_pyclipper(polygon, shrink_ratio):
518
+ from shapely.geometry import Polygon
519
+ import pyclipper
520
+ polygon_shape = Polygon(polygon)
521
+ distance = polygon_shape.area * (1 - np.power(shrink_ratio, 2)) / polygon_shape.length
522
+ subject = [tuple(l) for l in polygon]
523
+ padding = pyclipper.PyclipperOffset()
524
+ padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
525
+ shrunk = padding.Execute(-distance)
526
+ if shrunk == []:
527
+ shrunk = np.array(shrunk)
528
+ else:
529
+ shrunk = np.array(shrunk[0]).reshape(-1, 2)
530
+ return shrunk
531
+
532
+ class MakeShrinkMap():
533
+ r'''
534
+ Making binary mask from detection data with ICDAR format.
535
+ Typically following the process of class `MakeICDARData`.
536
+ '''
537
+
538
+ def __init__(self, min_text_size=4, shrink_ratio=0.4, shrink_type='pyclipper'):
539
+ shrink_func_dict = {'py': shrink_polygon_py, 'pyclipper': shrink_polygon_pyclipper}
540
+ self.shrink_func = shrink_func_dict[shrink_type]
541
+ self.min_text_size = min_text_size
542
+ self.shrink_ratio = shrink_ratio
543
+
544
+ def __call__(self, data: dict) -> dict:
545
+ """
546
+ Build the shrink map (gt) and shrink mask for the text polygons in `data`.
547
+ :param data: {'imgs':,'text_polys':,'texts':,'ignore_tags':}
548
+ :return:
549
+ """
550
+ image = data['imgs']
551
+ text_polys = data['text_polys']
552
+ ignore_tags = data['ignore_tags']
553
+
554
+ h, w = image.shape[:2]
555
+ text_polys, ignore_tags = self.validate_polygons(text_polys, ignore_tags, h, w)
556
+ gt = np.zeros((h, w), dtype=np.float32)
557
+ mask = np.ones((h, w), dtype=np.float32)
558
+ for i in range(len(text_polys)):
559
+ polygon = text_polys[i]
560
+ height = max(polygon[:, 1]) - min(polygon[:, 1])
561
+ width = max(polygon[:, 0]) - min(polygon[:, 0])
562
+ if ignore_tags[i] or min(height, width) < self.min_text_size:
563
+ cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0)
564
+ ignore_tags[i] = True
565
+ else:
566
+ shrunk = self.shrink_func(polygon, self.shrink_ratio)
567
+ if shrunk.size == 0:
568
+ cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0)
569
+ ignore_tags[i] = True
570
+ continue
571
+ cv2.fillPoly(gt, [shrunk.astype(np.int32)], 1)
572
+
573
+ data['shrink_map'] = gt
574
+ data['shrink_mask'] = mask
575
+ return data
576
+
577
+ def validate_polygons(self, polygons, ignore_tags, h, w):
578
+ '''
579
+ polygons (numpy.array, required): of shape (num_instances, num_points, 2)
580
+ '''
581
+ if len(polygons) == 0:
582
+ return polygons, ignore_tags
583
+ assert len(polygons) == len(ignore_tags)
584
+ for polygon in polygons:
585
+ polygon[:, 0] = np.clip(polygon[:, 0], 0, w - 1)
586
+ polygon[:, 1] = np.clip(polygon[:, 1], 0, h - 1)
587
+
588
+ for i in range(len(polygons)):
589
+ area = self.polygon_area(polygons[i])
590
+ if abs(area) < 1:
591
+ ignore_tags[i] = True
592
+ if area > 0:
593
+ polygons[i] = polygons[i][::-1, :]
594
+ return polygons, ignore_tags
595
+
596
+ def polygon_area(self, polygon):
597
+ return cv2.contourArea(polygon)
598
+
599
+
600
+ class MakeBorderMap():
601
+ def __init__(self, shrink_ratio=0.4, thresh_min=0.3, thresh_max=0.7):
602
+ self.shrink_ratio = shrink_ratio
603
+ self.thresh_min = thresh_min
604
+ self.thresh_max = thresh_max
605
+
606
+ def __call__(self, data: dict) -> dict:
607
+ """
608
+ Build the threshold (border) map and mask for the text polygons in `data`.
609
+ :param data: {'imgs':,'text_polys':,'texts':,'ignore_tags':}
610
+ :return:
611
+ """
612
+ im = data['imgs']
613
+ text_polys = data['text_polys']
614
+ ignore_tags = data['ignore_tags']
615
+
616
+ canvas = np.zeros(im.shape[:2], dtype=np.float32)
617
+ mask = np.zeros(im.shape[:2], dtype=np.float32)
618
+
619
+ for i in range(len(text_polys)):
620
+ if ignore_tags[i]:
621
+ continue
622
+ self.draw_border_map(text_polys[i], canvas, mask=mask)
623
+ canvas = canvas * (self.thresh_max - self.thresh_min) + self.thresh_min
624
+
625
+ data['threshold_map'] = canvas
626
+ data['threshold_mask'] = mask
627
+ return data
628
+
629
+ def draw_border_map(self, polygon, canvas, mask):
630
+ polygon = np.array(polygon)
631
+ assert polygon.ndim == 2
632
+ assert polygon.shape[1] == 2
633
+
634
+ polygon_shape = Polygon(polygon)
635
+ if polygon_shape.area <= 0:
636
+ return
637
+ distance = polygon_shape.area * (1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
638
+ subject = [tuple(l) for l in polygon]
639
+ padding = pyclipper.PyclipperOffset()
640
+ padding.AddPath(subject, pyclipper.JT_ROUND,
641
+ pyclipper.ET_CLOSEDPOLYGON)
642
+
643
+ padded_polygon = np.array(padding.Execute(distance)[0])
644
+ cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
645
+
646
+ xmin = padded_polygon[:, 0].min()
647
+ xmax = padded_polygon[:, 0].max()
648
+ ymin = padded_polygon[:, 1].min()
649
+ ymax = padded_polygon[:, 1].max()
650
+ width = xmax - xmin + 1
651
+ height = ymax - ymin + 1
652
+
653
+ polygon[:, 0] = polygon[:, 0] - xmin
654
+ polygon[:, 1] = polygon[:, 1] - ymin
655
+
656
+ xs = np.broadcast_to(
657
+ np.linspace(0, width - 1, num=width).reshape(1, width), (height, width))
658
+ ys = np.broadcast_to(
659
+ np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width))
660
+
661
+ distance_map = np.zeros(
662
+ (polygon.shape[0], height, width), dtype=np.float32)
663
+ for i in range(polygon.shape[0]):
664
+ j = (i + 1) % polygon.shape[0]
665
+ absolute_distance = self.distance(xs, ys, polygon[i], polygon[j])
666
+ distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
667
+ distance_map = distance_map.min(axis=0)
668
+
669
+ xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
670
+ xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
671
+ ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
672
+ ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
673
+ canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
674
+ 1 - distance_map[
675
+ ymin_valid - ymin:ymax_valid - ymax + height,
676
+ xmin_valid - xmin:xmax_valid - xmax + width],
677
+ canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1])
678
+
679
+ def distance(self, xs, ys, point_1, point_2):
680
+ '''
681
+ compute the distance from point to a line
682
+ ys: coordinates in the first axis
683
+ xs: coordinates in the second axis
684
+ point_1, point_2: (x, y), the end of the line
685
+ '''
686
+ height, width = xs.shape[:2]
687
+ square_distance_1 = np.square(xs - point_1[0]) + np.square(ys - point_1[1])
688
+ square_distance_2 = np.square(xs - point_2[0]) + np.square(ys - point_2[1])
689
+ square_distance = np.square(point_1[0] - point_2[0]) + np.square(point_1[1] - point_2[1])
690
+
691
+ cosin = (square_distance - square_distance_1 - square_distance_2) / (2 * np.sqrt(square_distance_1 * square_distance_2))
692
+ square_sin = 1 - np.square(cosin)
693
+ square_sin = np.nan_to_num(square_sin)
694
+
695
+ result = np.sqrt(square_distance_1 * square_distance_2 * square_sin / square_distance)
696
+ result[cosin < 0] = np.sqrt(np.fmin(square_distance_1, square_distance_2))[cosin < 0]
697
+ return result
698
+
699
+ def extend_line(self, point_1, point_2, result):
700
+ ex_point_1 = (int(round(point_1[0] + (point_1[0] - point_2[0]) * (1 + self.shrink_ratio))),
701
+ int(round(point_1[1] + (point_1[1] - point_2[1]) * (1 + self.shrink_ratio))))
702
+ cv2.line(result, tuple(ex_point_1), tuple(point_1), 4096.0, 1, lineType=cv2.LINE_AA, shift=0)
703
+ ex_point_2 = (int(round(point_2[0] + (point_2[0] - point_1[0]) * (1 + self.shrink_ratio))),
704
+ int(round(point_2[1] + (point_2[1] - point_1[1]) * (1 + self.shrink_ratio))))
705
+ cv2.line(result, tuple(ex_point_2), tuple(point_2), 4096.0, 1, lineType=cv2.LINE_AA, shift=0)
706
+ return ex_point_1, ex_point_2
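
To make the DB post-processing above concrete, here is a small self-contained sketch that feeds `SegDetectorRepresenter` a synthetic probability map; the shapes and thresholds are arbitrary examples, not values used by this commit.

```python
import torch

# Fake DBNet output: one image, one channel, one bright rectangular text region.
prob = torch.zeros(1, 1, 320, 320)
prob[0, 0, 100:140, 60:260] = 0.9

representer = SegDetectorRepresenter(thresh=0.3, box_thresh=0.7, unclip_ratio=2.3)
boxes_batch, scores_batch = representer(batch=None, pred=prob, is_output_polygon=False)

# boxes_batch[0] is an (N, 4, 2) array of box corners in image coordinates,
# already expanded by unclip() and rescaled to the destination width/height.
print(boxes_batch[0].shape, scores_batch[0])
```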
manga_translator/detection/ctd_utils/utils/imgproc_utils.py ADDED
@@ -0,0 +1,180 @@
1
+ import numpy as np
2
+ import cv2
3
+ import random
4
+ from typing import List
5
+
6
+ def hex2bgr(hex):
7
+ gmask = 254 << 8
8
+ rmask = 254
9
+ b = hex >> 16
10
+ g = (hex & gmask) >> 8
11
+ r = hex & rmask
12
+ return np.stack([b, g, r]).transpose()
13
+
14
+ def union_area(bboxa, bboxb):
15
+ x1 = max(bboxa[0], bboxb[0])
16
+ y1 = max(bboxa[1], bboxb[1])
17
+ x2 = min(bboxa[2], bboxb[2])
18
+ y2 = min(bboxa[3], bboxb[3])
19
+ if y2 < y1 or x2 < x1:
20
+ return -1
21
+ return (y2 - y1) * (x2 - x1)
22
+
23
+ def get_yololabel_strings(clslist, labellist):
24
+ content = ''
25
+ for cls, xywh in zip(clslist, labellist):
26
+ content += str(int(cls)) + ' ' + ' '.join([str(e) for e in xywh]) + '\n'
27
+ if len(content) != 0:
28
+ content = content[:-1]
29
+ return content
30
+
31
+ # 4 points bbox to 8 points polygon
32
+ def xywh2xyxypoly(xywh, to_int=True):
33
+ xyxypoly = np.tile(xywh[:, [0, 1]], 4)
34
+ xyxypoly[:, [2, 4]] += xywh[:, [2]]
35
+ xyxypoly[:, [5, 7]] += xywh[:, [3]]
36
+ if to_int:
37
+ xyxypoly = xyxypoly.astype(np.int64)
38
+ return xyxypoly
39
+
40
+ def xyxy2yolo(xyxy, w: int, h: int):
41
+ if xyxy == [] or xyxy == np.array([]) or len(xyxy) == 0:
42
+ return None
43
+ if isinstance(xyxy, list):
44
+ xyxy = np.array(xyxy)
45
+ if len(xyxy.shape) == 1:
46
+ xyxy = np.array([xyxy])
47
+ yolo = np.copy(xyxy).astype(np.float64)
48
+ yolo[:, [0, 2]] = yolo[:, [0, 2]] / w
49
+ yolo[:, [1, 3]] = yolo[:, [1, 3]] / h
50
+ yolo[:, [2, 3]] -= yolo[:, [0, 1]]
51
+ yolo[:, [0, 1]] += yolo[:, [2, 3]] / 2
52
+ return yolo
53
+
54
+ def yolo_xywh2xyxy(xywh: np.array, w: int, h: int, to_int=True):
55
+ if xywh is None:
56
+ return None
57
+ if len(xywh) == 0:
58
+ return None
59
+ if len(xywh.shape) == 1:
60
+ xywh = np.array([xywh])
61
+ xywh[:, [0, 2]] *= w
62
+ xywh[:, [1, 3]] *= h
63
+ xywh[:, [0, 1]] -= xywh[:, [2, 3]] / 2
64
+ xywh[:, [2, 3]] += xywh[:, [0, 1]]
65
+ if to_int:
66
+ xywh = xywh.astype(np.int64)
67
+ return xywh
68
+
69
+ def letterbox(im, new_shape=(640, 640), color=(0, 0, 0), auto=False, scaleFill=False, scaleup=True, stride=128):
70
+ # Resize and pad image while meeting stride-multiple constraints
71
+ shape = im.shape[:2] # current shape [height, width]
72
+ if not isinstance(new_shape, tuple):
73
+ new_shape = (new_shape, new_shape)
74
+
75
+ # Scale ratio (new / old)
76
+ r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
77
+ if not scaleup: # only scale down, do not scale up (for better val mAP)
78
+ r = min(r, 1.0)
79
+
80
+ # Compute padding
81
+ ratio = r, r # width, height ratios
82
+ new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
83
+ dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
84
+ if auto: # minimum rectangle
85
+ dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding
86
+ elif scaleFill: # stretch
87
+ dw, dh = 0.0, 0.0
88
+ new_unpad = (new_shape[1], new_shape[0])
89
+ ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios
90
+
91
+ # dw /= 2 # divide padding into 2 sides
92
+ # dh /= 2
93
+ dh, dw = int(dh), int(dw)
94
+
95
+ if shape[::-1] != new_unpad: # resize
96
+ im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
97
+ top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
98
+ left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
99
+ im = cv2.copyMakeBorder(im, 0, dh, 0, dw, cv2.BORDER_CONSTANT, value=color) # add border
100
+ return im, ratio, (dw, dh)
101
+
102
+ def resize_keepasp(im, new_shape=640, scaleup=True, interpolation=cv2.INTER_LINEAR, stride=None):
103
+ shape = im.shape[:2] # current shape [height, width]
104
+
105
+ if new_shape is not None:
106
+ if not isinstance(new_shape, tuple):
107
+ new_shape = (new_shape, new_shape)
108
+ else:
109
+ new_shape = shape
110
+
111
+ # Scale ratio (new / old)
112
+ r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
113
+ if not scaleup: # only scale down, do not scale up (for better val mAP)
114
+ r = min(r, 1.0)
115
+
116
+ new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
117
+
118
+ if stride is not None:
119
+ h, w = new_unpad
120
+ if new_shape[0] % stride != 0:
121
+ new_h = (stride - (new_shape[0] % stride)) + h
122
+ else:
123
+ new_h = h
124
+ if w % stride != 0:
125
+ new_w = (stride - (w % stride)) + w
126
+ else:
127
+ new_w = w
128
+ new_unpad = (new_h, new_w)
129
+
130
+ if shape[::-1] != new_unpad: # resize
131
+ im = cv2.resize(im, new_unpad, interpolation=interpolation)
132
+ return im
133
+
134
+ def enlarge_window(rect, im_w, im_h, ratio=2.5, aspect_ratio=1.0) -> List:
135
+ assert ratio > 1.0
136
+
137
+ x1, y1, x2, y2 = rect
138
+ w = x2 - x1
139
+ h = y2 - y1
140
+
141
+ # https://numpy.org/doc/stable/reference/generated/numpy.roots.html
142
+ coeff = [aspect_ratio, w+h*aspect_ratio, (1-ratio)*w*h]
143
+ roots = np.roots(coeff)
144
+ roots.sort()
145
+ delta = int(round(roots[-1] / 2 ))
146
+ delta_w = int(delta * aspect_ratio)
147
+ delta_w = min(x1, im_w - x2, delta_w)
148
+ delta = min(y1, im_h - y2, delta)
149
+ rect = np.array([x1-delta_w, y1-delta, x2+delta_w, y2+delta], dtype=np.int64)
150
+ return rect.tolist()
151
+
152
+ def draw_connected_labels(num_labels, labels, stats, centroids, names="draw_connected_labels", skip_background=True):
153
+ labdraw = np.zeros((labels.shape[0], labels.shape[1], 3), dtype=np.uint8)
154
+ max_ind = 0
155
+ if isinstance(num_labels, int):
156
+ num_labels = range(num_labels)
157
+
158
+ # for ind, lab in enumerate((range(num_labels))):
159
+ for lab in num_labels:
160
+ if skip_background and lab == 0:
161
+ continue
162
+ randcolor = (random.randint(0,255), random.randint(0,255), random.randint(0,255))
163
+ labdraw[np.where(labels==lab)] = randcolor
164
+ maxr, minr = 0.5, 0.001
165
+ maxw, maxh = stats[max_ind][2] * maxr, stats[max_ind][3] * maxr
166
+ minarea = labdraw.shape[0] * labdraw.shape[1] * minr
167
+
168
+ stat = stats[lab]
169
+ bboxarea = stat[2] * stat[3]
170
+ if stat[2] < maxw and stat[3] < maxh and bboxarea > minarea:
171
+ pix = np.zeros((labels.shape[0], labels.shape[1]), dtype=np.uint8)
172
+ pix[np.where(labels==lab)] = 255
173
+
174
+ rect = cv2.minAreaRect(cv2.findNonZero(pix))
175
+ box = np.int0(cv2.boxPoints(rect))
176
+ labdraw = cv2.drawContours(labdraw, [box], 0, randcolor, 2)
177
+ labdraw = cv2.circle(labdraw, (int(centroids[lab][0]),int(centroids[lab][1])), radius=5, color=(random.randint(0,255), random.randint(0,255), random.randint(0,255)), thickness=-1)
178
+
179
+ cv2.imshow(names, labdraw)
180
+ return labdraw
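
A quick sketch of the two geometry helpers used most often above, `letterbox` and `enlarge_window`; the sizes below are arbitrary examples.

```python
import numpy as np

# Resize while keeping aspect ratio, then pad bottom/right up to the target square.
img = np.zeros((700, 500, 3), dtype=np.uint8)
padded, ratio, (dw, dh) = letterbox(img, new_shape=1024, auto=False)
print(padded.shape, ratio, (dw, dh))   # (1024, 1024, 3), scale factors, padding

# Grow an xyxy window to roughly `ratio` times its area, clamped to the image bounds.
print(enlarge_window([100, 100, 200, 150], im_w=500, im_h=700, ratio=2.5))
```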
manga_translator/detection/ctd_utils/utils/io_utils.py ADDED
@@ -0,0 +1,54 @@
1
+ import os
2
+ import os.path as osp
3
+ import glob
4
+ from pathlib import Path
5
+ import cv2
6
+ import numpy as np
7
+ import json
8
+
9
+ IMG_EXT = ['.bmp', '.jpg', '.png', '.jpeg']
10
+
11
+ NP_BOOL_TYPES = (np.bool_, np.bool8)
12
+ NP_FLOAT_TYPES = (np.float_, np.float16, np.float32, np.float64)
13
+ NP_INT_TYPES = (np.int_, np.int8, np.int16, np.int32, np.int64, np.uint, np.uint8, np.uint16, np.uint32, np.uint64)
14
+
15
+ # https://stackoverflow.com/questions/26646362/numpy-array-is-not-json-serializable
16
+ class NumpyEncoder(json.JSONEncoder):
17
+ def default(self, obj):
18
+ if isinstance(obj, np.ndarray):
19
+ return obj.tolist()
20
+ elif isinstance(obj, np.ScalarType):
21
+ if isinstance(obj, NP_BOOL_TYPES):
22
+ return bool(obj)
23
+ elif isinstance(obj, NP_FLOAT_TYPES):
24
+ return float(obj)
25
+ elif isinstance(obj, NP_INT_TYPES):
26
+ return int(obj)
27
+ return json.JSONEncoder.default(self, obj)
28
+
29
+ def find_all_imgs(img_dir, abs_path=False):
30
+ imglist = list()
31
+ for filep in glob.glob(osp.join(img_dir, "*")):
32
+ filename = osp.basename(filep)
33
+ file_suffix = Path(filename).suffix
34
+ if file_suffix.lower() not in IMG_EXT:
35
+ continue
36
+ if abs_path:
37
+ imglist.append(filep)
38
+ else:
39
+ imglist.append(filename)
40
+ return imglist
41
+
42
+ def imread(imgpath, read_type=cv2.IMREAD_COLOR):
43
+ # img = cv2.imread(imgpath, read_type)
44
+ # if img is None:
45
+ img = cv2.imdecode(np.fromfile(imgpath, dtype=np.uint8), read_type)
46
+ return img
47
+
48
+ def imwrite(img_path, img, ext='.png'):
49
+ suffix = Path(img_path).suffix
50
+ if suffix != '':
51
+ img_path = img_path.replace(suffix, ext)
52
+ else:
53
+ img_path += ext
54
+ cv2.imencode(ext, img)[1].tofile(img_path)
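
For reference, a short sketch of `NumpyEncoder`, which is what allows results containing numpy types to be dumped straight to JSON; the payload values are made up.

```python
import json
import numpy as np

payload = {
    'bbox': np.array([10, 20, 110, 60], dtype=np.int32),  # ndarray -> list
    'score': np.float32(0.87),                            # numpy float -> float
    'ignore': np.bool_(False),                            # numpy bool -> bool
}
print(json.dumps(payload, cls=NumpyEncoder))
# {"bbox": [10, 20, 110, 60], "score": 0.8700000047683716, "ignore": false}
```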
manga_translator/detection/ctd_utils/utils/weight_init.py ADDED
@@ -0,0 +1,103 @@
1
+ import torch.nn as nn
2
+ import torch
3
+
4
+ def constant_init(module, val, bias=0):
5
+ nn.init.constant_(module.weight, val)
6
+ if hasattr(module, 'bias') and module.bias is not None:
7
+ nn.init.constant_(module.bias, bias)
8
+
9
+ def xavier_init(module, gain=1, bias=0, distribution='normal'):
10
+ assert distribution in ['uniform', 'normal']
11
+ if distribution == 'uniform':
12
+ nn.init.xavier_uniform_(module.weight, gain=gain)
13
+ else:
14
+ nn.init.xavier_normal_(module.weight, gain=gain)
15
+ if hasattr(module, 'bias') and module.bias is not None:
16
+ nn.init.constant_(module.bias, bias)
17
+
18
+
19
+ def normal_init(module, mean=0, std=1, bias=0):
20
+ nn.init.normal_(module.weight, mean, std)
21
+ if hasattr(module, 'bias') and module.bias is not None:
22
+ nn.init.constant_(module.bias, bias)
23
+
24
+
25
+ def uniform_init(module, a=0, b=1, bias=0):
26
+ nn.init.uniform_(module.weight, a, b)
27
+ if hasattr(module, 'bias') and module.bias is not None:
28
+ nn.init.constant_(module.bias, bias)
29
+
30
+
31
+ def kaiming_init(module,
32
+ a=0,
33
+ is_rnn=False,
34
+ mode='fan_in',
35
+ nonlinearity='leaky_relu',
36
+ bias=0,
37
+ distribution='normal'):
38
+ assert distribution in ['uniform', 'normal']
39
+ if distribution == 'uniform':
40
+ if is_rnn:
41
+ for name, param in module.named_parameters():
42
+ if 'bias' in name:
43
+ nn.init.constant_(param, bias)
44
+ elif 'weight' in name:
45
+ nn.init.kaiming_uniform_(param,
46
+ a=a,
47
+ mode=mode,
48
+ nonlinearity=nonlinearity)
49
+ else:
50
+ nn.init.kaiming_uniform_(module.weight,
51
+ a=a,
52
+ mode=mode,
53
+ nonlinearity=nonlinearity)
54
+
55
+ else:
56
+ if is_rnn:
57
+ for name, param in module.named_parameters():
58
+ if 'bias' in name:
59
+ nn.init.constant_(param, bias)
60
+ elif 'weight' in name:
61
+ nn.init.kaiming_normal_(param,
62
+ a=a,
63
+ mode=mode,
64
+ nonlinearity=nonlinearity)
65
+ else:
66
+ nn.init.kaiming_normal_(module.weight,
67
+ a=a,
68
+ mode=mode,
69
+ nonlinearity=nonlinearity)
70
+
71
+ if not is_rnn and hasattr(module, 'bias') and module.bias is not None:
72
+ nn.init.constant_(module.bias, bias)
73
+
74
+
75
+ def bilinear_kernel(in_channels, out_channels, kernel_size):
76
+ factor = (kernel_size + 1) // 2
77
+ if kernel_size % 2 == 1:
78
+ center = factor - 1
79
+ else:
80
+ center = factor - 0.5
81
+ og = (torch.arange(kernel_size).reshape(-1, 1),
82
+ torch.arange(kernel_size).reshape(1, -1))
83
+ filt = (1 - torch.abs(og[0] - center) / factor) * \
84
+ (1 - torch.abs(og[1] - center) / factor)
85
+ weight = torch.zeros((in_channels, out_channels,
86
+ kernel_size, kernel_size))
87
+ weight[range(in_channels), range(out_channels), :, :] = filt
88
+ return weight
89
+
90
+
91
+ def init_weights(m):
92
+ # for m in modules:
93
+
94
+ if isinstance(m, nn.Conv2d):
95
+ kaiming_init(m)
96
+ elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
97
+ constant_init(m, 1)
98
+ elif isinstance(m, nn.Linear):
99
+ xavier_init(m)
100
+ elif isinstance(m, (nn.LSTM, nn.LSTMCell)):
101
+ kaiming_init(m, is_rnn=True)
102
+ # elif isinstance(m, nn.ConvTranspose2d):
103
+ # m.weight.data.copy_(bilinear_kernel(m.in_channels, m.out_channels, 4));
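
`init_weights` is written to be passed to `nn.Module.apply`, which is exactly what `DBHead.init_weight` does further up in this commit. A tiny sketch of the per-type dispatch (the layer mix is only there to exercise each branch, not a meaningful network):

```python
import torch.nn as nn

layers = nn.ModuleList([
    nn.Conv2d(3, 16, 3, padding=1),   # -> kaiming_init
    nn.BatchNorm2d(16),               # -> constant_init (weight=1, bias=0)
    nn.Linear(16, 8),                 # -> xavier_init
    nn.LSTM(8, 8),                    # -> kaiming_init(is_rnn=True)
])
layers.apply(init_weights)
print(float(layers[1].weight.mean()))   # 1.0 after constant init
```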
manga_translator/detection/ctd_utils/utils/yolov5_utils.py ADDED
@@ -0,0 +1,243 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import cv2
6
+ import numpy as np
7
+ import time
8
+ import torchvision
9
+
10
+ def scale_img(img, ratio=1.0, same_shape=False, gs=32): # img(16,3,256,416)
11
+ # scales img(bs,3,y,x) by ratio constrained to gs-multiple
12
+ if ratio == 1.0:
13
+ return img
14
+ else:
15
+ h, w = img.shape[2:]
16
+ s = (int(h * ratio), int(w * ratio)) # new size
17
+ img = F.interpolate(img, size=s, mode='bilinear', align_corners=False) # resize
18
+ if not same_shape: # pad/crop img
19
+ h, w = (math.ceil(x * ratio / gs) * gs for x in (h, w))
20
+ return F.pad(img, [0, w - s[1], 0, h - s[0]], value=0.447) # value = imagenet mean
21
+
22
+ def fuse_conv_and_bn(conv, bn):
23
+ # Fuse convolution and batchnorm layers https://tehnokv.com/posts/fusing-batchnorm-and-conv/
24
+ fusedconv = nn.Conv2d(conv.in_channels,
25
+ conv.out_channels,
26
+ kernel_size=conv.kernel_size,
27
+ stride=conv.stride,
28
+ padding=conv.padding,
29
+ groups=conv.groups,
30
+ bias=True).requires_grad_(False).to(conv.weight.device)
31
+
32
+ # prepare filters
33
+ w_conv = conv.weight.clone().view(conv.out_channels, -1)
34
+ w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
35
+ fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape))
36
+
37
+ # prepare spatial bias
38
+ b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias
39
+ b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
40
+ fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
41
+
42
+ return fusedconv
43
+
44
+ def check_anchor_order(m):
45
+ # Check anchor order against stride order for YOLOv5 Detect() module m, and correct if necessary
46
+ a = m.anchors.prod(-1).view(-1) # anchor area
47
+ da = a[-1] - a[0] # delta a
48
+ ds = m.stride[-1] - m.stride[0] # delta s
49
+ if da.sign() != ds.sign(): # same order
50
+ m.anchors[:] = m.anchors.flip(0)
51
+
52
+ def initialize_weights(model):
53
+ for m in model.modules():
54
+ t = type(m)
55
+ if t is nn.Conv2d:
56
+ pass # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
57
+ elif t is nn.BatchNorm2d:
58
+ m.eps = 1e-3
59
+ m.momentum = 0.03
60
+ elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU]:
61
+ m.inplace = True
62
+
63
+ def make_divisible(x, divisor):
64
+ # Returns nearest x divisible by divisor
65
+ if isinstance(divisor, torch.Tensor):
66
+ divisor = int(divisor.max()) # to int
67
+ return math.ceil(x / divisor) * divisor
68
+
69
+ def intersect_dicts(da, db, exclude=()):
70
+ # Dictionary intersection of matching keys and shapes, omitting 'exclude' keys, using da values
71
+ return {k: v for k, v in da.items() if k in db and not any(x in k for x in exclude) and v.shape == db[k].shape}
72
+
73
+ def check_version(current='0.0.0', minimum='0.0.0', name='version ', pinned=False, hard=False):
74
+ # Check version vs. required version
75
+ from packaging import version
76
+ current, minimum = (version.parse(x) for x in (current, minimum))
77
+ result = (current == minimum) if pinned else (current >= minimum) # bool
78
+ if hard: # assert min requirements met
79
+ assert result, f'{name}{minimum} required by YOLOv5, but {name}{current} is currently installed'
80
+ else:
81
+ return result
82
+
83
+ class Colors:
84
+ # Ultralytics color palette https://ultralytics.com/
85
+ def __init__(self):
86
+ # hex = matplotlib.colors.TABLEAU_COLORS.values()
87
+ hex = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A', '92CC17', '3DDB86', '1A9334', '00D4BB',
88
+ '2C99A8', '00C2FF', '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF', 'FF95C8', 'FF37C7')
89
+ self.palette = [self.hex2rgb('#' + c) for c in hex]
90
+ self.n = len(self.palette)
91
+
92
+ def __call__(self, i, bgr=False):
93
+ c = self.palette[int(i) % self.n]
94
+ return (c[2], c[1], c[0]) if bgr else c
95
+
96
+ @staticmethod
97
+ def hex2rgb(h): # rgb order (PIL)
98
+ return tuple(int(h[1 + i:1 + i + 2], 16) for i in (0, 2, 4))
99
+
100
+ def box_iou(box1, box2):
101
+ # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
102
+ """
103
+ Return intersection-over-union (Jaccard index) of boxes.
104
+ Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
105
+ Arguments:
106
+ box1 (Tensor[N, 4])
107
+ box2 (Tensor[M, 4])
108
+ Returns:
109
+ iou (Tensor[N, M]): the NxM matrix containing the pairwise
110
+ IoU values for every element in boxes1 and boxes2
111
+ """
112
+
113
+ def box_area(box):
114
+ # box = 4xn
115
+ return (box[2] - box[0]) * (box[3] - box[1])
116
+
117
+ area1 = box_area(box1.T)
118
+ area2 = box_area(box2.T)
119
+
120
+ # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
121
+ inter = (torch.min(box1[:, None, 2:], box2[:, 2:]) - torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2)
122
+ return inter / (area1[:, None] + area2 - inter) # iou = inter / (area1 + area2 - inter)
123
+
124
+ def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False,
125
+ labels=(), max_det=300):
126
+ """Runs Non-Maximum Suppression (NMS) on inference results
127
+
128
+ Returns:
129
+ list of detections, on (n,6) tensor per image [xyxy, conf, cls]
130
+ """
131
+
132
+ if isinstance(prediction, np.ndarray):
133
+ prediction = torch.from_numpy(prediction)
134
+
135
+ nc = prediction.shape[2] - 5 # number of classes
136
+ xc = prediction[..., 4] > conf_thres # candidates
137
+
138
+ # Checks
139
+ assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
140
+ assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
141
+
142
+ # Settings
143
+ min_wh, max_wh = 2, 4096 # (pixels) minimum and maximum box width and height
144
+ max_nms = 30000 # maximum number of boxes into torchvision.ops.nms()
145
+ time_limit = 10.0 # seconds to quit after
146
+ redundant = True # require redundant detections
147
+ multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img)
148
+ merge = False # use merge-NMS
149
+
150
+ t = time.time()
151
+ output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0]
152
+ for xi, x in enumerate(prediction): # image index, image inference
153
+ # Apply constraints
154
+ # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height
155
+ x = x[xc[xi]] # confidence
156
+
157
+ # Cat apriori labels if autolabelling
158
+ if labels and len(labels[xi]):
159
+ l = labels[xi]
160
+ v = torch.zeros((len(l), nc + 5), device=x.device)
161
+ v[:, :4] = l[:, 1:5] # box
162
+ v[:, 4] = 1.0 # conf
163
+ v[range(len(l)), l[:, 0].long() + 5] = 1.0 # cls
164
+ x = torch.cat((x, v), 0)
165
+
166
+ # If none remain process next image
167
+ if not x.shape[0]:
168
+ continue
169
+
170
+ # Compute conf
171
+ x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf
172
+
173
+ # Box (center x, center y, width, height) to (x1, y1, x2, y2)
174
+ box = xywh2xyxy(x[:, :4])
175
+
176
+ # Detections matrix nx6 (xyxy, conf, cls)
177
+ if multi_label:
178
+ i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
179
+ x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
180
+ else: # best class only
181
+ conf, j = x[:, 5:].max(1, keepdim=True)
182
+ x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
183
+
184
+ # Filter by class
185
+ if classes is not None:
186
+ x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
187
+
188
+ # Apply finite constraint
189
+ # if not torch.isfinite(x).all():
190
+ # x = x[torch.isfinite(x).all(1)]
191
+
192
+ # Check shape
193
+ n = x.shape[0] # number of boxes
194
+ if not n: # no boxes
195
+ continue
196
+ elif n > max_nms: # excess boxes
197
+ x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence
198
+
199
+ # Batched NMS
200
+ c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
201
+ boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
202
+ i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
203
+ if i.shape[0] > max_det: # limit detections
204
+ i = i[:max_det]
205
+ if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
206
+ # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
207
+ iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
208
+ weights = iou * scores[None] # box weights
209
+ x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
210
+ if redundant:
211
+ i = i[iou.sum(1) > 1] # require redundancy
212
+
213
+ output[xi] = x[i]
214
+ if (time.time() - t) > time_limit:
215
+ print(f'WARNING: NMS time limit {time_limit}s exceeded')
216
+ break # time limit exceeded
217
+
218
+ return output
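+ # Usage sketch (illustrative; assumes a YOLOv5-style prediction of shape (batch, boxes, 5 + nc)):
+ #   dets = non_max_suppression(preds, conf_thres=0.25, iou_thres=0.45, max_det=300)
+ #   xyxy, conf, cls = dets[0][:, :4], dets[0][:, 4], dets[0][:, 5]  # detections for image 0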
219
+
220
+ def xywh2xyxy(x):
221
+ # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
222
+ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
223
+ y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x
224
+ y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y
225
+ y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x
226
+ y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y
227
+ return y
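+ # Illustrative example: xywh2xyxy(torch.tensor([[10., 10., 4., 6.]])) -> [[8., 7., 12., 13.]],
+ # i.e. centre/size boxes converted to top-left / bottom-right corners.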
228
+
229
+ DEFAULT_LANG_LIST = ['eng', 'ja']
230
+ def draw_bbox(pred, img, lang_list=None):
231
+ if lang_list is None:
232
+ lang_list = DEFAULT_LANG_LIST
233
+ lw = max(round(sum(img.shape) / 2 * 0.003), 2) # line width
234
+ pred = pred.astype(np.int32)
235
+ colors = Colors()
236
+ img = np.copy(img)
237
+ for ii, obj in enumerate(pred):
238
+ p1, p2 = (obj[0], obj[1]), (obj[2], obj[3])
239
+ label = lang_list[obj[-1]] + str(ii+1)
240
+ cv2.rectangle(img, p1, p2, colors(obj[-1], bgr=True), lw, lineType=cv2.LINE_AA)
241
+ t_w, t_h = cv2.getTextSize(label, 0, fontScale=lw / 3, thickness=lw)[0]
242
+ cv2.putText(img, label, (p1[0], p1[1] + t_h + 2), 0, lw / 3, colors(obj[-1], bgr=True), max(lw-1, 1), cv2.LINE_AA)
243
+ return img
manga_translator/detection/ctd_utils/yolov5/common.py ADDED
@@ -0,0 +1,289 @@
1
+ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2
+ """
3
+ Common modules
4
+ """
5
+
6
+ import json
7
+ import math
8
+ import platform
9
+ import warnings
10
+ from collections import OrderedDict, namedtuple
11
+ from copy import copy
12
+ from pathlib import Path
13
+
14
+ import cv2
15
+ import numpy as np
16
+ import requests
17
+ import torch
18
+ import torch.nn as nn
19
+ from PIL import Image
20
+ from torch.cuda import amp
21
+
22
+ from ..utils.yolov5_utils import make_divisible, initialize_weights, check_anchor_order, check_version, fuse_conv_and_bn
23
+
24
+ def autopad(k, p=None): # kernel, padding
25
+ # Pad to 'same'
26
+ if p is None:
27
+ p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad
28
+ return p
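+ # Illustrative sketch: autopad(3) == 1 and autopad(5) == 2, i.e. 'same' padding for odd
+ # kernels, so a stride-1 Conv keeps the spatial resolution.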
29
+
30
+ class Conv(nn.Module):
31
+ # Standard convolution
32
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
33
+ super().__init__()
34
+ self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
35
+ self.bn = nn.BatchNorm2d(c2)
36
+ if isinstance(act, bool):
37
+ self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())
38
+ elif isinstance(act, str):
39
+ if act == 'leaky':
40
+ self.act = nn.LeakyReLU(0.1, inplace=True)
41
+ elif act == 'relu':
42
+ self.act = nn.ReLU(inplace=True)
43
+ else:
44
+ self.act = nn.Identity()  # unknown activation string: fall back to identity so forward() stays callable
45
+ def forward(self, x):
46
+ return self.act(self.bn(self.conv(x)))
47
+
48
+ def forward_fuse(self, x):
49
+ return self.act(self.conv(x))
50
+
51
+
52
+ class DWConv(Conv):
53
+ # Depth-wise convolution class
54
+ def __init__(self, c1, c2, k=1, s=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
55
+ super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), act=act)
56
+
57
+
58
+ class TransformerLayer(nn.Module):
59
+ # Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)
60
+ def __init__(self, c, num_heads):
61
+ super().__init__()
62
+ self.q = nn.Linear(c, c, bias=False)
63
+ self.k = nn.Linear(c, c, bias=False)
64
+ self.v = nn.Linear(c, c, bias=False)
65
+ self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
66
+ self.fc1 = nn.Linear(c, c, bias=False)
67
+ self.fc2 = nn.Linear(c, c, bias=False)
68
+
69
+ def forward(self, x):
70
+ x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x
71
+ x = self.fc2(self.fc1(x)) + x
72
+ return x
73
+
74
+
75
+ class TransformerBlock(nn.Module):
76
+ # Vision Transformer https://arxiv.org/abs/2010.11929
77
+ def __init__(self, c1, c2, num_heads, num_layers):
78
+ super().__init__()
79
+ self.conv = None
80
+ if c1 != c2:
81
+ self.conv = Conv(c1, c2)
82
+ self.linear = nn.Linear(c2, c2) # learnable position embedding
83
+ self.tr = nn.Sequential(*(TransformerLayer(c2, num_heads) for _ in range(num_layers)))
84
+ self.c2 = c2
85
+
86
+ def forward(self, x):
87
+ if self.conv is not None:
88
+ x = self.conv(x)
89
+ b, _, w, h = x.shape
90
+ p = x.flatten(2).permute(2, 0, 1)
91
+ return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, w, h)
92
+
93
+
94
+ class Bottleneck(nn.Module):
95
+ # Standard bottleneck
96
+ def __init__(self, c1, c2, shortcut=True, g=1, e=0.5, act=True): # ch_in, ch_out, shortcut, groups, expansion
97
+ super().__init__()
98
+ c_ = int(c2 * e) # hidden channels
99
+ self.cv1 = Conv(c1, c_, 1, 1, act=act)
100
+ self.cv2 = Conv(c_, c2, 3, 1, g=g, act=act)
101
+ self.add = shortcut and c1 == c2
102
+
103
+ def forward(self, x):
104
+ return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
105
+
106
+
107
+ class BottleneckCSP(nn.Module):
108
+ # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks
109
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
110
+ super().__init__()
111
+ c_ = int(c2 * e) # hidden channels
112
+ self.cv1 = Conv(c1, c_, 1, 1)
113
+ self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
114
+ self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
115
+ self.cv4 = Conv(2 * c_, c2, 1, 1)
116
+ self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3)
117
+ self.act = nn.SiLU()
118
+ self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
119
+
120
+ def forward(self, x):
121
+ y1 = self.cv3(self.m(self.cv1(x)))
122
+ y2 = self.cv2(x)
123
+ return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1))))
124
+
125
+
126
+ class C3(nn.Module):
127
+ # CSP Bottleneck with 3 convolutions
128
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, act=True): # ch_in, ch_out, number, shortcut, groups, expansion
129
+ super().__init__()
130
+ c_ = int(c2 * e) # hidden channels
131
+ self.cv1 = Conv(c1, c_, 1, 1, act=act)
132
+ self.cv2 = Conv(c1, c_, 1, 1, act=act)
133
+ self.cv3 = Conv(2 * c_, c2, 1, act=act) # act=FReLU(c2)
134
+ self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0, act=act) for _ in range(n)))
135
+ # self.m = nn.Sequential(*[CrossConv(c_, c_, 3, 1, g, 1.0, shortcut) for _ in range(n)])
136
+
137
+ def forward(self, x):
138
+ return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))
139
+
140
+
141
+ class C3TR(C3):
142
+ # C3 module with TransformerBlock()
143
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
144
+ super().__init__(c1, c2, n, shortcut, g, e)
145
+ c_ = int(c2 * e)
146
+ self.m = TransformerBlock(c_, c_, 4, n)
147
+
148
+
149
+ class C3SPP(C3):
150
+ # C3 module with SPP()
151
+ def __init__(self, c1, c2, k=(5, 9, 13), n=1, shortcut=True, g=1, e=0.5):
152
+ super().__init__(c1, c2, n, shortcut, g, e)
153
+ c_ = int(c2 * e)
154
+ self.m = SPP(c_, c_, k)
155
+
156
+
157
+ class C3Ghost(C3):
158
+ # C3 module with GhostBottleneck()
159
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
160
+ super().__init__(c1, c2, n, shortcut, g, e)
161
+ c_ = int(c2 * e) # hidden channels
162
+ self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n)))
163
+
164
+
165
+ class SPP(nn.Module):
166
+ # Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729
167
+ def __init__(self, c1, c2, k=(5, 9, 13)):
168
+ super().__init__()
169
+ c_ = c1 // 2 # hidden channels
170
+ self.cv1 = Conv(c1, c_, 1, 1)
171
+ self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
172
+ self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])
173
+
174
+ def forward(self, x):
175
+ x = self.cv1(x)
176
+ with warnings.catch_warnings():
177
+ warnings.simplefilter('ignore') # suppress torch 1.9.0 max_pool2d() warning
178
+ return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
179
+
180
+
181
+ class SPPF(nn.Module):
182
+ # Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher
183
+ def __init__(self, c1, c2, k=5): # equivalent to SPP(k=(5, 9, 13))
184
+ super().__init__()
185
+ c_ = c1 // 2 # hidden channels
186
+ self.cv1 = Conv(c1, c_, 1, 1)
187
+ self.cv2 = Conv(c_ * 4, c2, 1, 1)
188
+ self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
189
+
190
+ def forward(self, x):
191
+ x = self.cv1(x)
192
+ with warnings.catch_warnings():
193
+ warnings.simplefilter('ignore') # suppress torch 1.9.0 max_pool2d() warning
194
+ y1 = self.m(x)
195
+ y2 = self.m(y1)
196
+ return self.cv2(torch.cat([x, y1, y2, self.m(y2)], 1))
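+ # Note: chaining the same k=5 max-pool three times covers the same receptive fields as
+ # SPP with k=(5, 9, 13) while reusing intermediate results, hence the "Fast" variant.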
197
+
198
+
199
+ class Focus(nn.Module):
200
+ # Focus wh information into c-space
201
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
202
+ super().__init__()
203
+ self.conv = Conv(c1 * 4, c2, k, s, p, g, act)
204
+ # self.contract = Contract(gain=2)
205
+
206
+ def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2)
207
+ return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1))
208
+ # return self.conv(self.contract(x))
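+ # Note: the slicing above is a space-to-depth rearrangement (equivalent, up to channel
+ # ordering, to pixel-unshuffle with factor 2), trading 2x spatial size for 4x channels.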
209
+
210
+
211
+ class GhostConv(nn.Module):
212
+ # Ghost Convolution https://github.com/huawei-noah/ghostnet
213
+ def __init__(self, c1, c2, k=1, s=1, g=1, act=True): # ch_in, ch_out, kernel, stride, groups
214
+ super().__init__()
215
+ c_ = c2 // 2 # hidden channels
216
+ self.cv1 = Conv(c1, c_, k, s, None, g, act)
217
+ self.cv2 = Conv(c_, c_, 5, 1, None, c_, act)
218
+
219
+ def forward(self, x):
220
+ y = self.cv1(x)
221
+ return torch.cat([y, self.cv2(y)], 1)
222
+
223
+
224
+ class GhostBottleneck(nn.Module):
225
+ # Ghost Bottleneck https://github.com/huawei-noah/ghostnet
226
+ def __init__(self, c1, c2, k=3, s=1): # ch_in, ch_out, kernel, stride
227
+ super().__init__()
228
+ c_ = c2 // 2
229
+ self.conv = nn.Sequential(GhostConv(c1, c_, 1, 1), # pw
230
+ DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw
231
+ GhostConv(c_, c2, 1, 1, act=False)) # pw-linear
232
+ self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False),
233
+ Conv(c1, c2, 1, 1, act=False)) if s == 2 else nn.Identity()
234
+
235
+ def forward(self, x):
236
+ return self.conv(x) + self.shortcut(x)
237
+
238
+
239
+ class Contract(nn.Module):
240
+ # Contract width-height into channels, i.e. x(1,64,80,80) to x(1,256,40,40)
241
+ def __init__(self, gain=2):
242
+ super().__init__()
243
+ self.gain = gain
244
+
245
+ def forward(self, x):
246
+ b, c, h, w = x.size() # assert (h / s == 0) and (W / s == 0), 'Indivisible gain'
247
+ s = self.gain
248
+ x = x.view(b, c, h // s, s, w // s, s) # x(1,64,40,2,40,2)
249
+ x = x.permute(0, 3, 5, 1, 2, 4).contiguous() # x(1,2,2,64,40,40)
250
+ return x.view(b, c * s * s, h // s, w // s) # x(1,256,40,40)
251
+
252
+
253
+ class Expand(nn.Module):
254
+ # Expand channels into width-height, i.e. x(1,64,80,80) to x(1,16,160,160)
255
+ def __init__(self, gain=2):
256
+ super().__init__()
257
+ self.gain = gain
258
+
259
+ def forward(self, x):
260
+ b, c, h, w = x.size() # assert C / s ** 2 == 0, 'Indivisible gain'
261
+ s = self.gain
262
+ x = x.view(b, s, s, c // s ** 2, h, w) # x(1,2,2,16,80,80)
263
+ x = x.permute(0, 3, 4, 1, 5, 2).contiguous() # x(1,16,80,2,80,2)
264
+ return x.view(b, c // s ** 2, h * s, w * s) # x(1,16,160,160)
265
+
266
+
267
+ class Concat(nn.Module):
268
+ # Concatenate a list of tensors along dimension
269
+ def __init__(self, dimension=1):
270
+ super().__init__()
271
+ self.d = dimension
272
+
273
+ def forward(self, x):
274
+ return torch.cat(x, self.d)
275
+
276
+
277
+ class Classify(nn.Module):
278
+ # Classification head, i.e. x(b,c1,20,20) to x(b,c2)
279
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1): # ch_in, ch_out, kernel, stride, padding, groups
280
+ super().__init__()
281
+ self.aap = nn.AdaptiveAvgPool2d(1) # to x(b,c1,1,1)
282
+ self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g) # to x(b,c2,1,1)
283
+ self.flat = nn.Flatten()
284
+
285
+ def forward(self, x):
286
+ z = torch.cat([self.aap(y) for y in (x if isinstance(x, list) else [x])], 1) # cat if list
287
+ return self.flat(self.conv(z)) # flatten to x(b,c2)
288
+
289
+
manga_translator/detection/ctd_utils/yolov5/yolo.py ADDED
@@ -0,0 +1,311 @@
1
+ from operator import mod
2
+ from cv2 import imshow
3
+ # from utils.yolov5_utils import scale_img
4
+ from copy import deepcopy
5
+ from .common import *
6
+
7
+ class Detect(nn.Module):
8
+ stride = None # strides computed during build
9
+ onnx_dynamic = False # ONNX export parameter
10
+
11
+ def __init__(self, nc=80, anchors=(), ch=(), inplace=True): # detection layer
12
+ super().__init__()
13
+ self.nc = nc # number of classes
14
+ self.no = nc + 5 # number of outputs per anchor
15
+ self.nl = len(anchors) # number of detection layers
16
+ self.na = len(anchors[0]) // 2 # number of anchors
17
+ self.grid = [torch.zeros(1)] * self.nl # init grid
18
+ self.anchor_grid = [torch.zeros(1)] * self.nl # init anchor grid
19
+ self.register_buffer('anchors', torch.tensor(anchors).float().view(self.nl, -1, 2)) # shape(nl,na,2)
20
+ self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch) # output conv
21
+ self.inplace = inplace # use in-place ops (e.g. slice assignment)
22
+
23
+ def forward(self, x):
24
+ z = [] # inference output
25
+ for i in range(self.nl):
26
+ x[i] = self.m[i](x[i]) # conv
27
+ bs, _, ny, nx = x[i].shape # x(bs,255,20,20) to x(bs,3,20,20,85)
28
+ x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
29
+
30
+ if not self.training: # inference
31
+ if self.onnx_dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
32
+ self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)
33
+
34
+ y = x[i].sigmoid()
35
+ if self.inplace:
36
+ y[..., 0:2] = (y[..., 0:2] * 2 - 0.5 + self.grid[i]) * self.stride[i] # xy
37
+ y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh
38
+ else: # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
39
+ xy = (y[..., 0:2] * 2 - 0.5 + self.grid[i]) * self.stride[i] # xy
40
+ wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh
41
+ y = torch.cat((xy, wh, y[..., 4:]), -1)
42
+ z.append(y.view(bs, -1, self.no))
43
+
44
+ return x if self.training else (torch.cat(z, 1), x)
45
+
46
+ def _make_grid(self, nx=20, ny=20, i=0):
47
+ d = self.anchors[i].device
48
+ if check_version(torch.__version__, '1.10.0'): # torch>=1.10.0 adds the indexing argument; pass 'ij' to keep behaviour and silence the meshgrid warning
49
+ yv, xv = torch.meshgrid([torch.arange(ny, device=d), torch.arange(nx, device=d)], indexing='ij')
50
+ else:
51
+ yv, xv = torch.meshgrid([torch.arange(ny, device=d), torch.arange(nx, device=d)])
52
+ grid = torch.stack((xv, yv), 2).expand((1, self.na, ny, nx, 2)).float()
53
+ anchor_grid = (self.anchors[i].clone() * self.stride[i]) \
54
+ .view((1, self.na, 1, 1, 2)).expand((1, self.na, ny, nx, 2)).float()
55
+ return grid, anchor_grid
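+ # Note: grid holds per-cell (x, y) offsets and anchor_grid holds per-cell anchor sizes in
+ # pixels (anchors * stride); both are broadcast against the sigmoid outputs in forward().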
56
+
57
+ class Model(nn.Module):
58
+ def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, anchors=None): # model, input channels, number of classes
59
+ super().__init__()
60
+ self.out_indices = None
61
+ if isinstance(cfg, dict):
62
+ self.yaml = cfg # model dict
63
+ else: # is *.yaml
64
+ import yaml # for torch hub
65
+ self.yaml_file = Path(cfg).name
66
+ with open(cfg, encoding='ascii', errors='ignore') as f:
67
+ self.yaml = yaml.safe_load(f) # model dict
68
+
69
+ # Define model
70
+ ch = self.yaml['ch'] = self.yaml.get('ch', ch) # input channels
71
+ if nc and nc != self.yaml['nc']:
72
+ # LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
73
+ self.yaml['nc'] = nc # override yaml value
74
+ if anchors:
75
+ # LOGGER.info(f'Overriding model.yaml anchors with anchors={anchors}')
76
+ self.yaml['anchors'] = round(anchors) # override yaml value
77
+ self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch]) # model, savelist
78
+ self.names = [str(i) for i in range(self.yaml['nc'])] # default names
79
+ self.inplace = self.yaml.get('inplace', True)
80
+
81
+ # Build strides, anchors
82
+ m = self.model[-1] # Detect()
83
+ # with torch.no_grad():
84
+ if isinstance(m, Detect):
85
+ s = 256 # 2x min stride
86
+ m.inplace = self.inplace
87
+ m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s))]) # forward
88
+ m.anchors /= m.stride.view(-1, 1, 1)
89
+ check_anchor_order(m)
90
+ self.stride = m.stride
91
+ self._initialize_biases() # only run once
92
+
93
+ # Init weights, biases
94
+ initialize_weights(self)
95
+
96
+ def forward(self, x, augment=False, profile=False, visualize=False, detect=False):
97
+ # if augment:
98
+ # return self._forward_augment(x) # augmented inference, None
99
+ return self._forward_once(x, profile, visualize, detect=detect) # single-scale inference, train
100
+
101
+ # def _forward_augment(self, x):
102
+ # img_size = x.shape[-2:] # height, width
103
+ # s = [1, 0.83, 0.67] # scales
104
+ # f = [None, 3, None] # flips (2-ud, 3-lr)
105
+ # y = [] # outputs
106
+ # for si, fi in zip(s, f):
107
+ # xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))
108
+ # yi = self._forward_once(xi)[0] # forward
109
+ # # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1]) # save
110
+ # yi = self._descale_pred(yi, fi, si, img_size)
111
+ # y.append(yi)
112
+ # y = self._clip_augmented(y) # clip augmented tails
113
+ # return torch.cat(y, 1), None # augmented inference, train
114
+
115
+ def _forward_once(self, x, profile=False, visualize=False, detect=False):
116
+ y, dt = [], [] # outputs
117
+ z = []
118
+ for ii, m in enumerate(self.model):
119
+ if m.f != -1: # if not from previous layer
120
+ x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f] # from earlier layers
121
+ if profile:
122
+ self._profile_one_layer(m, x, dt)
123
+ x = m(x) # run
124
+ y.append(x if m.i in self.save else None) # save output
125
+ if self.out_indices is not None:
126
+ if m.i in self.out_indices:
127
+ z.append(x)
128
+ if self.out_indices is not None:
129
+ if detect:
130
+ return x, z
131
+ else:
132
+ return z
133
+ else:
134
+ return x
135
+
136
+ def _descale_pred(self, p, flips, scale, img_size):
137
+ # de-scale predictions following augmented inference (inverse operation)
138
+ if self.inplace:
139
+ p[..., :4] /= scale # de-scale
140
+ if flips == 2:
141
+ p[..., 1] = img_size[0] - p[..., 1] # de-flip ud
142
+ elif flips == 3:
143
+ p[..., 0] = img_size[1] - p[..., 0] # de-flip lr
144
+ else:
145
+ x, y, wh = p[..., 0:1] / scale, p[..., 1:2] / scale, p[..., 2:4] / scale # de-scale
146
+ if flips == 2:
147
+ y = img_size[0] - y # de-flip ud
148
+ elif flips == 3:
149
+ x = img_size[1] - x # de-flip lr
150
+ p = torch.cat((x, y, wh, p[..., 4:]), -1)
151
+ return p
152
+
153
+ def _clip_augmented(self, y):
154
+ # Clip YOLOv5 augmented inference tails
155
+ nl = self.model[-1].nl # number of detection layers (P3-P5)
156
+ g = sum(4 ** x for x in range(nl)) # grid points
157
+ e = 1 # exclude layer count
158
+ i = (y[0].shape[1] // g) * sum(4 ** x for x in range(e)) # indices
159
+ y[0] = y[0][:, :-i] # large
160
+ i = (y[-1].shape[1] // g) * sum(4 ** (nl - 1 - x) for x in range(e)) # indices
161
+ y[-1] = y[-1][:, i:] # small
162
+ return y
163
+
164
+ def _profile_one_layer(self, m, x, dt):
165
+ c = isinstance(m, Detect) # is final layer, copy input as inplace fix
166
+ for _ in range(10):
167
+ m(x.copy() if c else x)
168
+
169
+
170
+ def _initialize_biases(self, cf=None): # initialize biases into Detect(), cf is class frequency
171
+ # https://arxiv.org/abs/1708.02002 section 3.3
172
+ # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
173
+ m = self.model[-1] # Detect() module
174
+ for mi, s in zip(m.m, m.stride): # from
175
+ b = mi.bias.view(m.na, -1) # conv.bias(255) to (3,85)
176
+ b.data[:, 4] += math.log(8 / (640 / s) ** 2) # obj (8 objects per 640 image)
177
+ b.data[:, 5:] += math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # cls
178
+ mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
179
+
180
+ def _print_biases(self):
181
+ m = self.model[-1] # Detect() module
182
+ for mi in m.m: # from
183
+ b = mi.bias.detach().view(m.na, -1).T # conv.bias(255) to (3,85)
184
+
185
+ def fuse(self): # fuse model Conv2d() + BatchNorm2d() layers
186
+ for m in self.model.modules():
187
+ if isinstance(m, (Conv, DWConv)) and hasattr(m, 'bn'):
188
+ m.conv = fuse_conv_and_bn(m.conv, m.bn) # update conv
189
+ delattr(m, 'bn') # remove batchnorm
190
+ m.forward = m.forward_fuse # update forward
191
+ # self.info()
192
+ return self
193
+
194
+ # def info(self, verbose=False, img_size=640): # print model information
195
+ # model_info(self, verbose, img_size)
196
+
197
+ def _apply(self, fn):
198
+ # Apply to(), cpu(), cuda(), half() to model tensors that are not parameters or registered buffers
199
+ self = super()._apply(fn)
200
+ m = self.model[-1] # Detect()
201
+ if isinstance(m, Detect):
202
+ m.stride = fn(m.stride)
203
+ m.grid = list(map(fn, m.grid))
204
+ if isinstance(m.anchor_grid, list):
205
+ m.anchor_grid = list(map(fn, m.anchor_grid))
206
+ return self
207
+
208
+ def parse_model(d, ch): # model_dict, input_channels(3)
209
+ # LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10} {'module':<40}{'arguments':<30}")
210
+ anchors, nc, gd, gw = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple']
211
+ na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors # number of anchors
212
+ no = na * (nc + 5) # number of outputs = anchors * (classes + 5)
213
+
214
+ layers, save, c2 = [], [], ch[-1] # layers, savelist, ch out
215
+ for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']): # from, number, module, args
216
+ m = eval(m) if isinstance(m, str) else m # eval strings
217
+ for j, a in enumerate(args):
218
+ try:
219
+ args[j] = eval(a) if isinstance(a, str) else a # eval strings
220
+ except NameError:
221
+ pass
222
+
223
+ n = n_ = max(round(n * gd), 1) if n > 1 else n # depth gain
224
+ if m in [Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, Focus,
225
+ BottleneckCSP, C3, C3TR, C3SPP, C3Ghost]:
226
+ c1, c2 = ch[f], args[0]
227
+ if c2 != no: # if not output
228
+ c2 = make_divisible(c2 * gw, 8)
229
+
230
+ args = [c1, c2, *args[1:]]
231
+ if m in [BottleneckCSP, C3, C3TR, C3Ghost]:
232
+ args.insert(2, n) # number of repeats
233
+ n = 1
234
+ elif m is nn.BatchNorm2d:
235
+ args = [ch[f]]
236
+ elif m is Concat:
237
+ c2 = sum(ch[x] for x in f)
238
+ elif m is Detect:
239
+ args.append([ch[x] for x in f])
240
+ if isinstance(args[1], int): # number of anchors
241
+ args[1] = [list(range(args[1] * 2))] * len(f)
242
+ elif m is Contract:
243
+ c2 = ch[f] * args[0] ** 2
244
+ elif m is Expand:
245
+ c2 = ch[f] // args[0] ** 2
246
+ else:
247
+ c2 = ch[f]
248
+
249
+ m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args) # module
250
+ t = str(m)[8:-2].replace('__main__.', '') # module type
251
+ np = sum(x.numel() for x in m_.parameters()) # number params
252
+ m_.i, m_.f, m_.type, m_.np = i, f, t, np # attach index, 'from' index, type, number params
253
+ # LOGGER.info(f'{i:>3}{str(f):>18}{n_:>3}{np:10.0f} {t:<40}{str(args):<30}') # print
254
+ save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1) # append to savelist
255
+ layers.append(m_)
256
+ if i == 0:
257
+ ch = []
258
+ ch.append(c2)
259
+ return nn.Sequential(*layers), sorted(save)
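+ # Note: parse_model builds the backbone + head from the YAML spec; `save` lists the layer
+ # indices whose outputs later layers (including Detect) read through their 'from' fields.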
260
+
261
+ def load_yolov5(weights, map_location='cuda', fuse=True, inplace=True, out_indices=[1, 3, 5, 7, 9]):
262
+ if isinstance(weights, str):
263
+ ckpt = torch.load(weights, map_location=map_location) # load
264
+ else:
265
+ ckpt = weights
266
+
267
+ if fuse:
268
+ model = ckpt['model'].float().fuse().eval() # FP32 model
269
+ else:
270
+ model = ckpt['model'].float().eval() # without layer fuse
271
+
272
+ # Compatibility updates
273
+ for m in model.modules():
274
+ if type(m) in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU, Detect, Model]:
275
+ m.inplace = inplace # pytorch 1.7.0 compatibility
276
+ if type(m) is Detect:
277
+ if not isinstance(m.anchor_grid, list): # new Detect Layer compatibility
278
+ delattr(m, 'anchor_grid')
279
+ setattr(m, 'anchor_grid', [torch.zeros(1)] * m.nl)
280
+ elif type(m) is Conv:
281
+ m._non_persistent_buffers_set = set() # pytorch 1.6.0 compatibility
282
+ model.out_indices = out_indices
283
+ return model
284
+
285
+ @torch.no_grad()
286
+ def load_yolov5_ckpt(weights, map_location='cpu', fuse=True, inplace=True, out_indices=[1, 3, 5, 7, 9]):
287
+ if isinstance(weights, str):
288
+ ckpt = torch.load(weights, map_location=map_location) # load
289
+ else:
290
+ ckpt = weights
291
+
292
+ model = Model(ckpt['cfg'])
293
+ model.load_state_dict(ckpt['weights'], strict=True)
294
+
295
+ if fuse:
296
+ model = model.float().fuse().eval() # FP32 model
297
+ else:
298
+ model = model.float().eval() # without layer fuse
299
+
300
+ # Compatibility updates
301
+ for m in model.modules():
302
+ if type(m) in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU, Detect, Model]:
303
+ m.inplace = inplace # pytorch 1.7.0 compatibility
304
+ if type(m) is Detect:
305
+ if not isinstance(m.anchor_grid, list): # new Detect Layer compatibility
306
+ delattr(m, 'anchor_grid')
307
+ setattr(m, 'anchor_grid', [torch.zeros(1)] * m.nl)
308
+ elif type(m) is Conv:
309
+ m._non_persistent_buffers_set = set() # pytorch 1.6.0 compatibility
310
+ model.out_indices = out_indices
311
+ return model
manga_translator/detection/dbnet_convnext.py ADDED
@@ -0,0 +1,596 @@
1
+
2
+ from functools import partial
3
+ import shutil
4
+ from typing import Callable, Optional, Tuple, Union
5
+ import cv2
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import torch.nn.init as init
11
+
12
+ from torchvision.models import resnet34
13
+
14
+ import einops
15
+ import math
16
+
17
+ from timm.layers import trunc_normal_, AvgPool2dSame, DropPath, Mlp, GlobalResponseNormMlp, \
18
+ LayerNorm2d, LayerNorm, create_conv2d, get_act_layer, make_divisible, to_ntuple
19
+
20
+ class Downsample(nn.Module):
21
+
22
+ def __init__(self, in_chs, out_chs, stride=1, dilation=1):
23
+ super().__init__()
24
+ avg_stride = stride if dilation == 1 else 1
25
+ if stride > 1 or dilation > 1:
26
+ avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d
27
+ self.pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False)
28
+ else:
29
+ self.pool = nn.Identity()
30
+
31
+ if in_chs != out_chs:
32
+ self.conv = create_conv2d(in_chs, out_chs, 1, stride=1)
33
+ else:
34
+ self.conv = nn.Identity()
35
+
36
+ def forward(self, x):
37
+ x = self.pool(x)
38
+ x = self.conv(x)
39
+ return x
40
+
41
+
42
+ class ConvNeXtBlock(nn.Module):
43
+ """ ConvNeXt Block
44
+ There are two equivalent implementations:
45
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
46
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
47
+
48
+ Unlike the official impl, this one allows choice of 1 or 2, 1x1 conv can be faster with appropriate
49
+ choice of LayerNorm impl, however as model size increases the tradeoffs appear to change and nn.Linear
50
+ is a better choice. This was observed with PyTorch 1.10 on 3090 GPU, it could change over time & w/ different HW.
51
+ """
52
+
53
+ def __init__(
54
+ self,
55
+ in_chs: int,
56
+ out_chs: Optional[int] = None,
57
+ kernel_size: int = 7,
58
+ stride: int = 1,
59
+ dilation: Union[int, Tuple[int, int]] = (1, 1),
60
+ mlp_ratio: float = 4,
61
+ conv_mlp: bool = False,
62
+ conv_bias: bool = True,
63
+ use_grn: bool = False,
64
+ ls_init_value: Optional[float] = 1e-6,
65
+ act_layer: Union[str, Callable] = 'gelu',
66
+ norm_layer: Optional[Callable] = None,
67
+ drop_path: float = 0.,
68
+ ):
69
+ """
70
+
71
+ Args:
72
+ in_chs: Block input channels.
73
+ out_chs: Block output channels (same as in_chs if None).
74
+ kernel_size: Depthwise convolution kernel size.
75
+ stride: Stride of depthwise convolution.
76
+ dilation: Tuple specifying input and output dilation of block.
77
+ mlp_ratio: MLP expansion ratio.
78
+ conv_mlp: Use 1x1 convolutions for MLP and a NCHW compatible norm layer if True.
79
+ conv_bias: Apply bias for all convolution (linear) layers.
80
+ use_grn: Use GlobalResponseNorm in MLP (from ConvNeXt-V2)
81
+ ls_init_value: Layer-scale init values, layer-scale applied if not None.
82
+ act_layer: Activation layer.
83
+ norm_layer: Normalization layer (defaults to LN if not specified).
84
+ drop_path: Stochastic depth probability.
85
+ """
86
+ super().__init__()
87
+ out_chs = out_chs or in_chs
88
+ dilation = to_ntuple(2)(dilation)
89
+ act_layer = get_act_layer(act_layer)
90
+ if not norm_layer:
91
+ norm_layer = LayerNorm2d if conv_mlp else LayerNorm
92
+ mlp_layer = partial(GlobalResponseNormMlp if use_grn else Mlp, use_conv=conv_mlp)
93
+ self.use_conv_mlp = conv_mlp
94
+ self.conv_dw = create_conv2d(
95
+ in_chs,
96
+ out_chs,
97
+ kernel_size=kernel_size,
98
+ stride=stride,
99
+ dilation=dilation[0],
100
+ depthwise=True if out_chs >= in_chs else False,
101
+ bias=conv_bias,
102
+ )
103
+ self.norm = norm_layer(out_chs)
104
+ self.mlp = mlp_layer(out_chs, int(mlp_ratio * out_chs), act_layer=act_layer)
105
+ self.gamma = nn.Parameter(ls_init_value * torch.ones(out_chs)) if ls_init_value is not None else None
106
+ if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
107
+ self.shortcut = Downsample(in_chs, out_chs, stride=stride, dilation=dilation[0])
108
+ else:
109
+ self.shortcut = nn.Identity()
110
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
111
+
112
+ def forward(self, x):
113
+ shortcut = x
114
+ x = self.conv_dw(x)
115
+ if self.use_conv_mlp:
116
+ x = self.norm(x)
117
+ x = self.mlp(x)
118
+ else:
119
+ x = x.permute(0, 2, 3, 1)
120
+ x = self.norm(x)
121
+ x = self.mlp(x)
122
+ x = x.permute(0, 3, 1, 2)
123
+ if self.gamma is not None:
124
+ x = x.mul(self.gamma.reshape(1, -1, 1, 1))
125
+
126
+ x = self.drop_path(x) + self.shortcut(shortcut)
127
+ return x
128
+
129
+
130
+ class ConvNeXtStage(nn.Module):
131
+
132
+ def __init__(
133
+ self,
134
+ in_chs,
135
+ out_chs,
136
+ kernel_size=7,
137
+ stride=2,
138
+ depth=2,
139
+ dilation=(1, 1),
140
+ drop_path_rates=None,
141
+ ls_init_value=1.0,
142
+ conv_mlp=False,
143
+ conv_bias=True,
144
+ use_grn=False,
145
+ act_layer='gelu',
146
+ norm_layer=None,
147
+ norm_layer_cl=None
148
+ ):
149
+ super().__init__()
150
+ self.grad_checkpointing = False
151
+
152
+ if in_chs != out_chs or stride > 1 or dilation[0] != dilation[1]:
153
+ ds_ks = 2 if stride > 1 or dilation[0] != dilation[1] else 1
154
+ pad = 'same' if dilation[1] > 1 else 0 # same padding needed if dilation used
155
+ self.downsample = nn.Sequential(
156
+ norm_layer(in_chs),
157
+ create_conv2d(
158
+ in_chs,
159
+ out_chs,
160
+ kernel_size=ds_ks,
161
+ stride=stride,
162
+ dilation=dilation[0],
163
+ padding=pad,
164
+ bias=conv_bias,
165
+ ),
166
+ )
167
+ in_chs = out_chs
168
+ else:
169
+ self.downsample = nn.Identity()
170
+
171
+ drop_path_rates = drop_path_rates or [0.] * depth
172
+ stage_blocks = []
173
+ for i in range(depth):
174
+ stage_blocks.append(ConvNeXtBlock(
175
+ in_chs=in_chs,
176
+ out_chs=out_chs,
177
+ kernel_size=kernel_size,
178
+ dilation=dilation[1],
179
+ drop_path=drop_path_rates[i],
180
+ ls_init_value=ls_init_value,
181
+ conv_mlp=conv_mlp,
182
+ conv_bias=conv_bias,
183
+ use_grn=use_grn,
184
+ act_layer=act_layer,
185
+ norm_layer=norm_layer if conv_mlp else norm_layer_cl,
186
+ ))
187
+ in_chs = out_chs
188
+ self.blocks = nn.Sequential(*stage_blocks)
189
+
190
+ def forward(self, x):
191
+ x = self.downsample(x)
192
+ x = self.blocks(x)
193
+ return x
194
+
195
+
196
+ class ConvNeXt(nn.Module):
197
+ r""" ConvNeXt
198
+ A PyTorch impl of : `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf
199
+ """
200
+
201
+ def __init__(
202
+ self,
203
+ in_chans: int = 3,
204
+ num_classes: int = 1000,
205
+ global_pool: str = 'avg',
206
+ output_stride: int = 32,
207
+ depths: Tuple[int, ...] = (3, 3, 9, 3),
208
+ dims: Tuple[int, ...] = (96, 192, 384, 768),
209
+ kernel_sizes: Union[int, Tuple[int, ...]] = 7,
210
+ ls_init_value: Optional[float] = 1e-6,
211
+ stem_type: str = 'patch',
212
+ patch_size: int = 4,
213
+ head_init_scale: float = 1.,
214
+ head_norm_first: bool = False,
215
+ head_hidden_size: Optional[int] = None,
216
+ conv_mlp: bool = False,
217
+ conv_bias: bool = True,
218
+ use_grn: bool = False,
219
+ act_layer: Union[str, Callable] = 'gelu',
220
+ norm_layer: Optional[Union[str, Callable]] = None,
221
+ norm_eps: Optional[float] = None,
222
+ drop_rate: float = 0.,
223
+ drop_path_rate: float = 0.,
224
+ ):
225
+ """
226
+ Args:
227
+ in_chans: Number of input image channels.
228
+ num_classes: Number of classes for classification head.
229
+ global_pool: Global pooling type.
230
+ output_stride: Output stride of network, one of (8, 16, 32).
231
+ depths: Number of blocks at each stage.
232
+ dims: Feature dimension at each stage.
233
+ kernel_sizes: Depthwise convolution kernel-sizes for each stage.
234
+ ls_init_value: Init value for Layer Scale, disabled if None.
235
+ stem_type: Type of stem.
236
+ patch_size: Stem patch size for patch stem.
237
+ head_init_scale: Init scaling value for classifier weights and biases.
238
+ head_norm_first: Apply normalization before global pool + head.
239
+ head_hidden_size: Size of MLP hidden layer in head if not None and head_norm_first == False.
240
+ conv_mlp: Use 1x1 conv in MLP, improves speed for small networks w/ chan last.
241
+ conv_bias: Use bias layers w/ all convolutions.
242
+ use_grn: Use Global Response Norm (ConvNeXt-V2) in MLP.
243
+ act_layer: Activation layer type.
244
+ norm_layer: Normalization layer type.
245
+ drop_rate: Head pre-classifier dropout rate.
246
+ drop_path_rate: Stochastic depth drop rate.
247
+ """
248
+ super().__init__()
249
+ assert output_stride in (8, 16, 32)
250
+ kernel_sizes = to_ntuple(4)(kernel_sizes)
251
+ if norm_layer is None:
252
+ norm_layer = LayerNorm2d
253
+ norm_layer_cl = norm_layer if conv_mlp else LayerNorm
254
+ if norm_eps is not None:
255
+ norm_layer = partial(norm_layer, eps=norm_eps)
256
+ norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
257
+ else:
258
+ assert conv_mlp,\
259
+ 'If a norm_layer is specified, conv MLP must be used so all norm layers expect rank-4, channels-first input'
260
+ norm_layer_cl = norm_layer
261
+ if norm_eps is not None:
262
+ norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
263
+
264
+ self.num_classes = num_classes
265
+ self.drop_rate = drop_rate
266
+ self.feature_info = []
267
+
268
+ assert stem_type in ('patch', 'overlap', 'overlap_tiered')
269
+ if stem_type == 'patch':
270
+ # NOTE: this stem is a minimal form of ViT PatchEmbed, as used in SwinTransformer w/ patch_size = 4
271
+ self.stem = nn.Sequential(
272
+ nn.Conv2d(in_chans, dims[0], kernel_size=patch_size, stride=patch_size, bias=conv_bias),
273
+ norm_layer(dims[0]),
274
+ )
275
+ stem_stride = patch_size
276
+ else:
277
+ mid_chs = make_divisible(dims[0] // 2) if 'tiered' in stem_type else dims[0]
278
+ self.stem = nn.Sequential(
279
+ nn.Conv2d(in_chans, mid_chs, kernel_size=3, stride=2, padding=1, bias=conv_bias),
280
+ nn.Conv2d(mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias),
281
+ norm_layer(dims[0]),
282
+ )
283
+ stem_stride = 4
284
+
285
+ self.stages = nn.Sequential()
286
+ dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
287
+ stages = []
288
+ prev_chs = dims[0]
289
+ curr_stride = stem_stride
290
+ dilation = 1
291
+ # 4 feature resolution stages, each consisting of multiple residual blocks
292
+ for i in range(4):
293
+ stride = 2 if curr_stride == 2 or i > 0 else 1
294
+ if curr_stride >= output_stride and stride > 1:
295
+ dilation *= stride
296
+ stride = 1
297
+ curr_stride *= stride
298
+ first_dilation = 1 if dilation in (1, 2) else 2
299
+ out_chs = dims[i]
300
+ stages.append(ConvNeXtStage(
301
+ prev_chs,
302
+ out_chs,
303
+ kernel_size=kernel_sizes[i],
304
+ stride=stride,
305
+ dilation=(first_dilation, dilation),
306
+ depth=depths[i],
307
+ drop_path_rates=dp_rates[i],
308
+ ls_init_value=ls_init_value,
309
+ conv_mlp=conv_mlp,
310
+ conv_bias=conv_bias,
311
+ use_grn=use_grn,
312
+ act_layer=act_layer,
313
+ norm_layer=norm_layer,
314
+ norm_layer_cl=norm_layer_cl,
315
+ ))
316
+ prev_chs = out_chs
317
+ # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2
318
+ self.feature_info += [dict(num_chs=prev_chs, reduction=curr_stride, module=f'stages.{i}')]
319
+ self.stages = nn.Sequential(*stages)
320
+ self.num_features = prev_chs
321
+
322
+ @torch.jit.ignore
323
+ def group_matcher(self, coarse=False):
324
+ return dict(
325
+ stem=r'^stem',
326
+ blocks=r'^stages\.(\d+)' if coarse else [
327
+ (r'^stages\.(\d+)\.downsample', (0,)), # blocks
328
+ (r'^stages\.(\d+)\.blocks\.(\d+)', None),
329
+ (r'^norm_pre', (99999,))
330
+ ]
331
+ )
332
+
333
+ @torch.jit.ignore
334
+ def set_grad_checkpointing(self, enable=True):
335
+ for s in self.stages:
336
+ s.grad_checkpointing = enable
337
+
338
+ @torch.jit.ignore
339
+ def get_classifier(self):
340
+ return self.head.fc
341
+
342
+ def forward_features(self, x):
343
+ x = self.stem(x)
344
+ x = self.stages(x)
345
+ return x
346
+
347
+ def _init_weights(module, name=None, head_init_scale=1.0):
348
+ if isinstance(module, nn.Conv2d):
349
+ trunc_normal_(module.weight, std=.02)
350
+ if module.bias is not None:
351
+ nn.init.zeros_(module.bias)
352
+ elif isinstance(module, nn.Linear):
353
+ trunc_normal_(module.weight, std=.02)
354
+ nn.init.zeros_(module.bias)
355
+ if name and 'head.' in name:
356
+ module.weight.data.mul_(head_init_scale)
357
+ module.bias.data.mul_(head_init_scale)
358
+
359
+ class UpconvSkip(nn.Module) :
360
+ def __init__(self, ch1, ch2, out_ch) -> None:
361
+ super().__init__()
362
+ self.conv = ConvNeXtBlock(
363
+ in_chs=ch1 + ch2,
364
+ out_chs=out_ch,
365
+ kernel_size=7,
366
+ dilation=1,
367
+ drop_path=0,
368
+ ls_init_value=1.0,
369
+ conv_mlp=False,
370
+ conv_bias=True,
371
+ use_grn=False,
372
+ act_layer='gelu',
373
+ norm_layer=LayerNorm,
374
+ )
375
+ self.upconv = nn.ConvTranspose2d(out_ch, out_ch, 2, 2, 0, 0)
376
+
377
+ def forward(self, x) :
378
+ x = self.conv(x)
379
+ x = self.upconv(x)
380
+ return x
381
+
382
+ class DBHead(nn.Module):
383
+ def __init__(self, in_channels, k = 50):
384
+ super().__init__()
385
+ self.k = k
386
+ self.binarize = nn.Sequential(
387
+ nn.Conv2d(in_channels, in_channels // 4, 3, padding=1),
388
+ #nn.BatchNorm2d(in_channels // 4),
389
+ nn.SiLU(inplace=True),
390
+ nn.ConvTranspose2d(in_channels // 4, in_channels // 4, 4, 2, 1),
391
+ #nn.BatchNorm2d(in_channels // 4),
392
+ nn.SiLU(inplace=True),
393
+ nn.ConvTranspose2d(in_channels // 4, 1, 4, 2, 1),
394
+ )
395
+ self.binarize.apply(self.weights_init)
396
+
397
+ self.thresh = self._init_thresh(in_channels)
398
+ self.thresh.apply(self.weights_init)
399
+
400
+ def forward(self, x):
401
+ shrink_maps = self.binarize(x)
402
+ threshold_maps = self.thresh(x)
403
+ if self.training:
404
+ binary_maps = self.step_function(shrink_maps.sigmoid(), threshold_maps)
405
+ y = torch.cat((shrink_maps, threshold_maps, binary_maps), dim=1)
406
+ else:
407
+ y = torch.cat((shrink_maps, threshold_maps), dim=1)
408
+ return y
409
+
410
+ def weights_init(self, m):
411
+ classname = m.__class__.__name__
412
+ if classname.find('Conv') != -1:
413
+ nn.init.kaiming_normal_(m.weight.data)
414
+ elif classname.find('BatchNorm') != -1:
415
+ m.weight.data.fill_(1.)
416
+ m.bias.data.fill_(1e-4)
417
+
418
+ def _init_thresh(self, inner_channels, serial=False, smooth=False, bias=False):
419
+ in_channels = inner_channels
420
+ if serial:
421
+ in_channels += 1
422
+ self.thresh = nn.Sequential(
423
+ nn.Conv2d(in_channels, inner_channels // 4, 3, padding=1, bias=bias),
424
+ #nn.GroupNorm(inner_channels // 4),
425
+ nn.SiLU(inplace=True),
426
+ self._init_upsample(inner_channels // 4, inner_channels // 4, smooth=smooth, bias=bias),
427
+ #nn.GroupNorm(inner_channels // 4),
428
+ nn.SiLU(inplace=True),
429
+ self._init_upsample(inner_channels // 4, 1, smooth=smooth, bias=bias),
430
+ nn.Sigmoid())
431
+ return self.thresh
432
+
433
+ def _init_upsample(self, in_channels, out_channels, smooth=False, bias=False):
434
+ if smooth:
435
+ inter_out_channels = out_channels
436
+ if out_channels == 1:
437
+ inter_out_channels = in_channels
438
+ module_list = [
439
+ nn.Upsample(scale_factor=2, mode='bilinear'),
440
+ nn.Conv2d(in_channels, inter_out_channels, 3, 1, 1, bias=bias)]
441
+ if out_channels == 1:
442
+ module_list.append(nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=1, bias=True))
443
+ return nn.Sequential(*module_list)  # unpack: nn.Sequential expects modules, not a list
444
+ else:
445
+ return nn.ConvTranspose2d(in_channels, out_channels, 4, 2, 1)
446
+
447
+ def step_function(self, x, y):
448
+ return torch.reciprocal(1 + torch.exp(-self.k * (x - y)))
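+ # Note: this is the differentiable binarization from the DBNet paper,
+ # B = 1 / (1 + exp(-k * (P - T))) with k = 50, applied to (shrink_maps, threshold_maps)
+ # only at training time to produce the binary_maps channel.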
449
+
450
+ class DBNetConvNext(nn.Module) :
451
+ def __init__(self) :
452
+ super(DBNetConvNext, self).__init__()
453
+ self.backbone = ConvNeXt(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024])
454
+
455
+ self.conv_mask = nn.Sequential(
456
+ nn.Conv2d(64, 64, kernel_size=3, padding=1), nn.SiLU(inplace=True),
457
+ nn.Conv2d(64, 32, kernel_size=3, padding=1), nn.SiLU(inplace=True),
458
+ nn.Conv2d(32, 1, kernel_size=1),
459
+ nn.Sigmoid()
460
+ )
461
+
462
+ self.down_conv1 = ConvNeXtStage(1024, 1024, depth = 2, norm_layer = LayerNorm2d)
463
+ self.down_conv2 = ConvNeXtStage(1024, 1024, depth = 2, norm_layer = LayerNorm2d)
464
+
465
+ self.upconv1 = UpconvSkip(0, 1024, 128)
466
+ self.upconv2 = UpconvSkip(128, 1024, 128)
467
+ self.upconv3 = UpconvSkip(128, 1024, 128)
468
+ self.upconv4 = UpconvSkip(128, 512, 128)
469
+ self.upconv5 = UpconvSkip(128, 256, 128)
470
+ self.upconv6 = UpconvSkip(128, 128, 64)
471
+
472
+ self.conv_db = DBHead(128)
473
+
474
+ def forward(self, x) :
475
+ # in 3@1536
476
+ x = self.backbone.stem(x) # 128@384
477
+ h4 = self.backbone.stages[0](x) # 128@384
478
+ h8 = self.backbone.stages[1](h4) # 256@192
479
+ h16 = self.backbone.stages[2](h8) # 512@96
480
+ h32 = self.backbone.stages[3](h16) # 1024@48
481
+ h64 = self.down_conv1(h32) # 1024@24
482
+ h128 = self.down_conv2(h64) # 1024@12
483
+
484
+ up128 = self.upconv1(h128)
485
+ up64 = self.upconv2(torch.cat([up128, h64], dim = 1))
486
+ up32 = self.upconv3(torch.cat([up64, h32], dim = 1))
487
+ up16 = self.upconv4(torch.cat([up32, h16], dim = 1))
488
+ up8 = self.upconv5(torch.cat([up16, h8], dim = 1))
489
+ up4 = self.upconv6(torch.cat([up8, h4], dim = 1))
490
+
491
+ return self.conv_db(up8), self.conv_mask(up4)
492
+
493
+ import os
494
+ from .default_utils import imgproc, dbnet_utils, craft_utils
495
+ from .common import OfflineDetector
496
+ from ..utils import TextBlock, Quadrilateral, det_rearrange_forward
497
+
498
+ MODEL = None
499
+ def det_batch_forward_default(batch: np.ndarray, device: str):
500
+ global MODEL
501
+ if isinstance(batch, list):
502
+ batch = np.array(batch)
503
+ batch = einops.rearrange(batch.astype(np.float32) / 127.5 - 1.0, 'n h w c -> n c h w')
504
+ batch = torch.from_numpy(batch).to(device)
505
+ with torch.no_grad():
506
+ db, mask = MODEL(batch)
507
+ db = db.sigmoid().cpu().numpy()
508
+ mask = mask.cpu().numpy()
509
+ return db, mask
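+ # Note: inputs are HWC uint8 images; the rearrange above scales them to [-1, 1]
+ # (x / 127.5 - 1) and converts to NCHW before running the model.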
510
+
511
+
512
+ class DBConvNextDetector(OfflineDetector):
513
+ _MODEL_MAPPING = {
514
+ 'model': {
515
+ 'url': '',
516
+ 'hash': '',
517
+ 'file': '.',
518
+ }
519
+ }
520
+
521
+ def __init__(self, *args, **kwargs):
522
+ os.makedirs(self.model_dir, exist_ok=True)
523
+ if os.path.exists('dbnet_convnext.ckpt'):
524
+ shutil.move('dbnet_convnext.ckpt', self._get_file_path('dbnet_convnext.ckpt'))
525
+ super().__init__(*args, **kwargs)
526
+
527
+ async def _load(self, device: str):
528
+ self.model = DBNetConvNext()
529
+ sd = torch.load(self._get_file_path('dbnet_convnext.ckpt'), map_location='cpu')
530
+ self.model.load_state_dict(sd['model'] if 'model' in sd else sd)
531
+ self.model.eval()
532
+ self.device = device
533
+ if device == 'cuda' or device == 'mps':
534
+ self.model = self.model.to(self.device)
535
+ global MODEL
536
+ MODEL = self.model
537
+
538
+ async def _unload(self):
539
+ del self.model
540
+
541
+ async def _infer(self, image: np.ndarray, detect_size: int, text_threshold: float, box_threshold: float,
542
+ unclip_ratio: float, verbose: bool = False):
543
+
544
+ # TODO: Move det_rearrange_forward to common.py and refactor
545
+ db, mask = det_rearrange_forward(image, det_batch_forward_default, detect_size, 4, device=self.device, verbose=verbose)
546
+
547
+ if db is None:
548
+ # rearrangement is not required, fallback to default forward
549
+ img_resized, target_ratio, _, pad_w, pad_h = imgproc.resize_aspect_ratio(cv2.bilateralFilter(image, 17, 80, 80), detect_size, cv2.INTER_LINEAR, mag_ratio = 1)
550
+ img_resized_h, img_resized_w = img_resized.shape[:2]
551
+ ratio_h = ratio_w = 1 / target_ratio
552
+ db, mask = det_batch_forward_default([img_resized], self.device)
553
+ else:
554
+ img_resized_h, img_resized_w = image.shape[:2]
555
+ ratio_w = ratio_h = 1
556
+ pad_h = pad_w = 0
557
+ self.logger.info(f'Detection resolution: {img_resized_w}x{img_resized_h}')
558
+
559
+ mask = mask[0, 0, :, :]
560
+ det = dbnet_utils.SegDetectorRepresenter(text_threshold, box_threshold, unclip_ratio=unclip_ratio)
561
+ # boxes, scores = det({'shape': [(img_resized.shape[0], img_resized.shape[1])]}, db)
562
+ boxes, scores = det({'shape':[(img_resized_h, img_resized_w)]}, db)
563
+ boxes, scores = boxes[0], scores[0]
564
+ if boxes.size == 0:
565
+ polys = []
566
+ else:
567
+ idx = boxes.reshape(boxes.shape[0], -1).sum(axis=1) > 0
568
+ polys, _ = boxes[idx], scores[idx]
569
+ polys = polys.astype(np.float64)
570
+ polys = craft_utils.adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net=1)
571
+ polys = polys.astype(np.int16)
572
+
573
+ textlines = [Quadrilateral(pts.astype(int), '', score) for pts, score in zip(polys, scores)]
574
+ textlines = list(filter(lambda q: q.area > 16, textlines))
575
+ mask_resized = cv2.resize(mask, (mask.shape[1] * 2, mask.shape[0] * 2), interpolation=cv2.INTER_LINEAR)
576
+ if pad_h > 0:
577
+ mask_resized = mask_resized[:-pad_h, :]
578
+ elif pad_w > 0:
579
+ mask_resized = mask_resized[:, :-pad_w]
580
+ raw_mask = np.clip(mask_resized * 255, 0, 255).astype(np.uint8)
581
+
582
+ # if verbose:
583
+ # img_bbox_raw = np.copy(image)
584
+ # for txtln in textlines:
585
+ # cv2.polylines(img_bbox_raw, [txtln.pts], True, color=(255, 0, 0), thickness=2)
586
+ # cv2.imwrite(f'result/bboxes_unfiltered.png', cv2.cvtColor(img_bbox_raw, cv2.COLOR_RGB2BGR))
587
+
588
+ return textlines, raw_mask, None
589
+
590
+
591
+ if __name__ == '__main__' :
592
+ net = DBNetConvNext().cuda()
593
+ img = torch.randn(2, 3, 1536, 1536).cuda()
594
+ ret1, ret2 = net.forward(img)
595
+ print(ret1.shape)
596
+ print(ret2.shape)
manga_translator/detection/default.py ADDED
@@ -0,0 +1,103 @@
1
+ import os
2
+ import shutil
3
+ import numpy as np
4
+ import torch
5
+ import cv2
6
+ import einops
7
+ from typing import List, Tuple
8
+
9
+ from .default_utils.DBNet_resnet34 import TextDetection as TextDetectionDefault
10
+ from .default_utils import imgproc, dbnet_utils, craft_utils
11
+ from .common import OfflineDetector
12
+ from ..utils import TextBlock, Quadrilateral, det_rearrange_forward
13
+
14
+ MODEL = None
15
+ def det_batch_forward_default(batch: np.ndarray, device: str):
16
+ global MODEL
17
+ if isinstance(batch, list):
18
+ batch = np.array(batch)
19
+ batch = einops.rearrange(batch.astype(np.float32) / 127.5 - 1.0, 'n h w c -> n c h w')
20
+ batch = torch.from_numpy(batch).to(device)
21
+ with torch.no_grad():
22
+ db, mask = MODEL(batch)
23
+ db = db.sigmoid().cpu().numpy()
24
+ mask = mask.cpu().numpy()
25
+ return db, mask
26
+
27
+ class DefaultDetector(OfflineDetector):
28
+ _MODEL_MAPPING = {
29
+ 'model': {
30
+ 'url': 'https://github.com/zyddnys/manga-image-translator/releases/download/beta-0.3/detect.ckpt',
31
+ 'hash': '69080aea78de0803092bc6b751ae283ca463011de5f07e1d20e6491b05571a30',
32
+ 'file': '.',
33
+ }
34
+ }
35
+
36
+ def __init__(self, *args, **kwargs):
37
+ os.makedirs(self.model_dir, exist_ok=True)
38
+ if os.path.exists('detect.ckpt'):
39
+ shutil.move('detect.ckpt', self._get_file_path('detect.ckpt'))
40
+ super().__init__(*args, **kwargs)
41
+
42
+ async def _load(self, device: str):
43
+ self.model = TextDetectionDefault()
44
+ sd = torch.load(self._get_file_path('detect.ckpt'), map_location='cpu')
45
+ self.model.load_state_dict(sd['model'] if 'model' in sd else sd)
46
+ self.model.eval()
47
+ self.device = device
48
+ if device == 'cuda' or device == 'mps':
49
+ self.model = self.model.to(self.device)
50
+ global MODEL
51
+ MODEL = self.model
52
+
53
+ async def _unload(self):
54
+ del self.model
55
+
56
+ async def _infer(self, image: np.ndarray, detect_size: int, text_threshold: float, box_threshold: float,
57
+ unclip_ratio: float, verbose: bool = False):
58
+
59
+ # TODO: Move det_rearrange_forward to common.py and refactor
60
+ db, mask = det_rearrange_forward(image, det_batch_forward_default, detect_size, 4, device=self.device, verbose=verbose)
61
+
62
+ if db is None:
63
+ # rearrangement not required; fall back to the default forward pass
64
+ img_resized, target_ratio, _, pad_w, pad_h = imgproc.resize_aspect_ratio(cv2.bilateralFilter(image, 17, 80, 80), detect_size, cv2.INTER_LINEAR, mag_ratio = 1)
65
+ img_resized_h, img_resized_w = img_resized.shape[:2]
66
+ ratio_h = ratio_w = 1 / target_ratio
67
+ db, mask = det_batch_forward_default([img_resized], self.device)
68
+ else:
69
+ img_resized_h, img_resized_w = image.shape[:2]
70
+ ratio_w = ratio_h = 1
71
+ pad_h = pad_w = 0
72
+ self.logger.info(f'Detection resolution: {img_resized_w}x{img_resized_h}')
73
+
74
+ mask = mask[0, 0, :, :]
75
+ det = dbnet_utils.SegDetectorRepresenter(text_threshold, box_threshold, unclip_ratio=unclip_ratio)
76
+ # boxes, scores = det({'shape': [(img_resized.shape[0], img_resized.shape[1])]}, db)
77
+ boxes, scores = det({'shape':[(img_resized_h, img_resized_w)]}, db)
78
+ boxes, scores = boxes[0], scores[0]
79
+ if boxes.size == 0:
80
+ polys = []
81
+ else:
82
+ idx = boxes.reshape(boxes.shape[0], -1).sum(axis=1) > 0
83
+ polys, _ = boxes[idx], scores[idx]
84
+ polys = polys.astype(np.float64)
85
+ polys = craft_utils.adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net=1)
86
+ polys = polys.astype(np.int16)
87
+
88
+ textlines = [Quadrilateral(pts.astype(int), '', score) for pts, score in zip(polys, scores)]
89
+ textlines = list(filter(lambda q: q.area > 16, textlines))
90
+ mask_resized = cv2.resize(mask, (mask.shape[1] * 2, mask.shape[0] * 2), interpolation=cv2.INTER_LINEAR)
91
+ if pad_h > 0:
92
+ mask_resized = mask_resized[:-pad_h, :]
93
+ elif pad_w > 0:
94
+ mask_resized = mask_resized[:, :-pad_w]
95
+ raw_mask = np.clip(mask_resized * 255, 0, 255).astype(np.uint8)
96
+
97
+ # if verbose:
98
+ # img_bbox_raw = np.copy(image)
99
+ # for txtln in textlines:
100
+ # cv2.polylines(img_bbox_raw, [txtln.pts], True, color=(255, 0, 0), thickness=2)
101
+ # cv2.imwrite(f'result/bboxes_unfiltered.png', cv2.cvtColor(img_bbox_raw, cv2.COLOR_RGB2BGR))
102
+
103
+ return textlines, raw_mask, None
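
Note on the batch forward above: det_batch_forward_default expects a batch of RGB images in HWC uint8 layout, scales pixel values from [0, 255] to [-1, 1], and permutes the batch to NCHW before the DBNet forward pass. Below is a minimal standalone sketch of just that normalization step; the helper name preprocess_batch and the dummy input are illustrative only and are not part of the repository.

    import numpy as np
    import einops
    import torch

    def preprocess_batch(images, device: str = 'cpu') -> torch.Tensor:
        # Mirrors the normalization in det_batch_forward_default:
        # uint8 [0, 255] -> float32 [-1, 1], then NHWC -> NCHW.
        batch = np.stack(images).astype(np.float32) / 127.5 - 1.0
        batch = einops.rearrange(batch, 'n h w c -> n c h w')
        return torch.from_numpy(batch).to(device)

    # Two dummy 1024x1024 RGB pages -> torch.Size([2, 3, 1024, 1024])
    dummy = [np.zeros((1024, 1024, 3), dtype=np.uint8) for _ in range(2)]
    print(preprocess_batch(dummy).shape)
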
manga_translator/detection/default_utils/CRAFT_resnet34.py ADDED
@@ -0,0 +1,153 @@
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.nn.init as init
+
+ from torchvision.models import resnet34
+
+ import einops
+ import math
+
+ class ImageMultiheadSelfAttention(nn.Module):
+     def __init__(self, planes):
+         super(ImageMultiheadSelfAttention, self).__init__()
+         self.attn = nn.MultiheadAttention(planes, 4)
+     def forward(self, x):
+         res = x
+         n, c, h, w = x.shape
+         x = einops.rearrange(x, 'n c h w -> (h w) n c')
+         x = self.attn(x, x, x)[0]
+         x = einops.rearrange(x, '(h w) n c -> n c h w', n = n, c = c, h = h, w = w)
+         return res + x
+
+ class double_conv(nn.Module):
+     def __init__(self, in_ch, mid_ch, out_ch, stride = 1, planes = 256):
+         super(double_conv, self).__init__()
+         self.planes = planes
+         # down = None
+         # if stride > 1:
+         #     down = nn.Sequential(
+         #         nn.AvgPool2d(2, 2),
+         #         nn.Conv2d(in_ch + mid_ch, self.planes * Bottleneck.expansion, kernel_size=1, stride=1, bias=False), nn.BatchNorm2d(self.planes * Bottleneck.expansion)
+         #     )
+         self.down = None
+         if stride > 1:
+             self.down = nn.AvgPool2d(2, stride=2)
+         self.conv = nn.Sequential(
+             nn.Conv2d(in_ch + mid_ch, mid_ch, kernel_size=3, padding=1, stride = 1, bias=False),
+             nn.BatchNorm2d(mid_ch),
+             nn.ReLU(inplace=True),
+             #Bottleneck(mid_ch, self.planes, stride, down, 2, 1, avd = True, norm_layer = nn.BatchNorm2d),
+             nn.Conv2d(mid_ch, out_ch, kernel_size=3, stride = 1, padding=1, bias=False),
+             nn.BatchNorm2d(out_ch),
+             nn.ReLU(inplace=True),
+         )
+
+     def forward(self, x):
+         if self.down is not None:
+             x = self.down(x)
+         x = self.conv(x)
+         return x
+
+ class CRAFT_net(nn.Module):
+     def __init__(self):
+         super(CRAFT_net, self).__init__()
+         self.backbone = resnet34()
+
+         self.conv_rs = nn.Sequential(
+             nn.Conv2d(64, 32, kernel_size=3, padding=1), nn.ReLU(inplace=True),
+             nn.Conv2d(32, 32, kernel_size=3, padding=1), nn.ReLU(inplace=True),
+             nn.Conv2d(32, 32, kernel_size=3, padding=1), nn.ReLU(inplace=True),
+             nn.Conv2d(32, 32, kernel_size=1), nn.ReLU(inplace=True),
+             nn.Conv2d(32, 1, kernel_size=1),
+             nn.Sigmoid()
+         )
+
+         self.conv_as = nn.Sequential(
+             nn.Conv2d(64, 32, kernel_size=3, padding=1), nn.ReLU(inplace=True),
+             nn.Conv2d(32, 32, kernel_size=3, padding=1), nn.ReLU(inplace=True),
+             nn.Conv2d(32, 32, kernel_size=3, padding=1), nn.ReLU(inplace=True),
+             nn.Conv2d(32, 32, kernel_size=1), nn.ReLU(inplace=True),
+             nn.Conv2d(32, 1, kernel_size=1),
+             nn.Sigmoid()
+         )
+
+         self.conv_mask = nn.Sequential(
+             nn.Conv2d(64, 64, kernel_size=3, padding=1), nn.ReLU(inplace=True),
+             nn.Conv2d(64, 64, kernel_size=3, padding=1), nn.ReLU(inplace=True),
+             nn.Conv2d(64, 32, kernel_size=3, padding=1), nn.ReLU(inplace=True),
+             nn.Conv2d(32, 1, kernel_size=1),
+             nn.Sigmoid()
+         )
+
+         self.down_conv1 = double_conv(0, 512, 512, 2)
+         self.down_conv2 = double_conv(0, 512, 512, 2)
+         self.down_conv3 = double_conv(0, 512, 512, 2)
+
+         self.upconv1 = double_conv(0, 512, 256)
+         self.upconv2 = double_conv(256, 512, 256)
+         self.upconv3 = double_conv(256, 512, 256)
+         self.upconv4 = double_conv(256, 512, 256, planes = 128)
+         self.upconv5 = double_conv(256, 256, 128, planes = 64)
+         self.upconv6 = double_conv(128, 128, 64, planes = 32)
+         self.upconv7 = double_conv(64, 64, 64, planes = 16)
+
+     def forward_train(self, x):
+         x = self.backbone.conv1(x)
+         x = self.backbone.bn1(x)
+         x = self.backbone.relu(x)
+         x = self.backbone.maxpool(x) # 64@384
+
+         h4 = self.backbone.layer1(x) # 64@384
+         h8 = self.backbone.layer2(h4) # 128@192
+         h16 = self.backbone.layer3(h8) # 256@96
+         h32 = self.backbone.layer4(h16) # 512@48
+         h64 = self.down_conv1(h32) # 512@24
+         h128 = self.down_conv2(h64) # 512@12
+         h256 = self.down_conv3(h128) # 512@6
+
+         up256 = F.interpolate(self.upconv1(h256), scale_factor = (2, 2), mode = 'bilinear', align_corners = False) # 512@12
+         up128 = F.interpolate(self.upconv2(torch.cat([up256, h128], dim = 1)), scale_factor = (2, 2), mode = 'bilinear', align_corners = False) #51264@24
+         up64 = F.interpolate(self.upconv3(torch.cat([up128, h64], dim = 1)), scale_factor = (2, 2), mode = 'bilinear', align_corners = False) # 256@48
+         up32 = F.interpolate(self.upconv4(torch.cat([up64, h32], dim = 1)), scale_factor = (2, 2), mode = 'bilinear', align_corners = False) # 256@96
+         up16 = F.interpolate(self.upconv5(torch.cat([up32, h16], dim = 1)), scale_factor = (2, 2), mode = 'bilinear', align_corners = False) # 128@192
+         up8 = F.interpolate(self.upconv6(torch.cat([up16, h8], dim = 1)), scale_factor = (2, 2), mode = 'bilinear', align_corners = False) # 64@384
+         up4 = F.interpolate(self.upconv7(torch.cat([up8, h4], dim = 1)), scale_factor = (2, 2), mode = 'bilinear', align_corners = False) # 64@768
+
+         ascore = self.conv_as(up4)
+         rscore = self.conv_rs(up4)
+
+         return torch.cat([rscore, ascore], dim = 1), self.conv_mask(up4)
+
+     def forward(self, x):
+         x = self.backbone.conv1(x)
+         x = self.backbone.bn1(x)
+         x = self.backbone.relu(x)
+         x = self.backbone.maxpool(x) # 64@384
+
+         h4 = self.backbone.layer1(x) # 64@384
+         h8 = self.backbone.layer2(h4) # 128@192
+         h16 = self.backbone.layer3(h8) # 256@96
+         h32 = self.backbone.layer4(h16) # 512@48
+         h64 = self.down_conv1(h32) # 512@24
+         h128 = self.down_conv2(h64) # 512@12
+         h256 = self.down_conv3(h128) # 512@6
+
+         up256 = F.interpolate(self.upconv1(h256), scale_factor = (2, 2), mode = 'bilinear', align_corners = False) # 512@12
+         up128 = F.interpolate(self.upconv2(torch.cat([up256, h128], dim = 1)), scale_factor = (2, 2), mode = 'bilinear', align_corners = False) #51264@24
+         up64 = F.interpolate(self.upconv3(torch.cat([up128, h64], dim = 1)), scale_factor = (2, 2), mode = 'bilinear', align_corners = False) # 256@48
+         up32 = F.interpolate(self.upconv4(torch.cat([up64, h32], dim = 1)), scale_factor = (2, 2), mode = 'bilinear', align_corners = False) # 256@96
+         up16 = F.interpolate(self.upconv5(torch.cat([up32, h16], dim = 1)), scale_factor = (2, 2), mode = 'bilinear', align_corners = False) # 128@192
+         up8 = F.interpolate(self.upconv6(torch.cat([up16, h8], dim = 1)), scale_factor = (2, 2), mode = 'bilinear', align_corners = False) # 64@384
+         up4 = F.interpolate(self.upconv7(torch.cat([up8, h4], dim = 1)), scale_factor = (2, 2), mode = 'bilinear', align_corners = False) # 64@768
+
+         ascore = self.conv_as(up4)
+         rscore = self.conv_rs(up4)
+
+         return torch.cat([rscore, ascore], dim = 1), self.conv_mask(up4)
+
+ if __name__ == '__main__':
+     net = CRAFT_net().cuda()
+     img = torch.randn(2, 3, 1536, 1536).cuda()
+     print(net.forward_train(img)[0].shape)
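
The smoke test at the end of CRAFT_resnet34.py requires a CUDA device and a 1536x1536 input. The decoder upsamples back to half the input resolution, so the concatenated region/affinity head yields an (N, 2, H/2, W/2) map and the mask head an (N, 1, H/2, W/2) map. A CPU-only variant of the same shape check is sketched below; it assumes the module path added in this commit is importable, and it keeps the input size divisible by 256 so every skip connection lines up with its encoder feature map.

    import torch
    from manga_translator.detection.default_utils.CRAFT_resnet34 import CRAFT_net

    net = CRAFT_net().eval()
    img = torch.randn(1, 3, 512, 512)  # divisible by 256; no GPU required
    with torch.no_grad():
        scores, mask = net(img)
    # expected: torch.Size([1, 2, 256, 256]) torch.Size([1, 1, 256, 256])
    print(scores.shape, mask.shape)
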