Upload folder using huggingface_hub
- LICENSE +202 -0
- README.md +73 -0
- config.json +48 -0
- configuration_clyp.py +199 -0
- image_processing_clyp.py +226 -0
- ja-imagenet-1k-classnames.txt +1000 -0
- ja-imagenet-1k-templates.txt +37 -0
- model.py +505 -0
- model.safetensors +3 -0
- model_rinna.py +400 -0
- modeling_clyp.py +160 -0
- preprocessor_config.json +8 -0
- special_tokens_map.json +1 -0
- spiece.model +3 -0
- tokenization_clyp.py +125 -0
- tokenizer_config.json +15 -0
LICENSE
ADDED
@@ -0,0 +1,202 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2024 LY Corporation

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md
CHANGED
@@ -1,3 +1,76 @@
---
language: ja
license: apache-2.0
tags:
- clip
- japanese-clip
pipeline_tag: feature-extraction
---

# clip-japanese-base

This is a Japanese [CLIP (Contrastive Language-Image Pre-training)](https://arxiv.org/abs/2103.00020) model developed by [LY Corporation](https://www.lycorp.co.jp/en/). The model was trained on ~1B web-collected image-text pairs and is applicable to various visual tasks, including zero-shot image classification and text-to-image or image-to-text retrieval.

## How to use
1. Install packages
```
pip install pillow requests sentencepiece transformers torch timm
```
2. Run
```python
import io
import requests
from PIL import Image
import torch
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer

HF_MODEL_PATH = 'line-corporation/clip-japanese-base'
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
processor = AutoImageProcessor.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
device = "cuda" if torch.cuda.is_available() else "cpu"

image = Image.open(io.BytesIO(requests.get('https://images.pexels.com/photos/2253275/pexels-photo-2253275.jpeg?auto=compress&cs=tinysrgb&dpr=3&h=750&w=1260').content))
image = processor(image, return_tensors="pt")
text = tokenizer(["犬", "猫", "象"])

with torch.no_grad():
    image_features = model.get_image_features(**image)
    text_features = model.get_text_features(**text)
    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)
# [[1., 0., 0.]]
```

## Model architecture

The model uses an [Eva02-B](https://huggingface.co/timm/eva02_base_patch16_clip_224.merged2b_s8b_b131k) Transformer architecture as the image encoder and a 12-layer BERT as the text encoder. The text encoder was initialized from [rinna/japanese-clip-vit-b-16](https://huggingface.co/rinna/japanese-clip-vit-b-16).

## Evaluation
### Dataset
- [STAIR Captions](http://captions.stair.center/) (v2014 val set of MSCOCO) for image-to-text (i2t) and text-to-image (t2i) retrieval. Performance is reported as R@1 averaged over i2t and t2i retrieval.
- [Recruit Datasets](https://huggingface.co/datasets/recruit-jp/japanese-image-classification-evaluation-dataset) for image classification.
- [ImageNet-1K](https://www.image-net.org/download.php) for image classification. We translated all classnames into Japanese. The classnames and templates can be found in `ja-imagenet-1k-classnames.txt` and `ja-imagenet-1k-templates.txt`.

### Result
| Model | Image Encoder Params | Text Encoder Params | STAIR Captions (R@1) | Recruit Datasets (acc@1) | ImageNet-1K (acc@1) |
|-------------------|----------------------|---------------------|----------------------|--------------------------|---------------------|
| [Ours](https://huggingface.co/line-corporation/clip-japanese-base) | 86M (Eva02-B) | 100M (BERT) | **0.30** | **0.89** | 0.58 |
| [Stable-ja-clip](https://huggingface.co/stabilityai/japanese-stable-clip-vit-l-16) | 307M (ViT-L) | 100M (BERT) | 0.24 | 0.77 | **0.68** |
| [Rinna-ja-clip](https://huggingface.co/rinna/japanese-clip-vit-b-16) | 86M (ViT-B) | 100M (BERT) | 0.13 | 0.54 | 0.56 |
| [Laion-clip](https://huggingface.co/laion/CLIP-ViT-H-14-frozen-xlm-roberta-large-laion5B-s13B-b90k) | 632M (ViT-H) | 561M (XLM-RoBERTa) | **0.30** | 0.83 | 0.58 |
| [Hakuhodo-ja-clip](https://huggingface.co/hakuhodo-tech/japanese-clip-vit-h-14-bert-wider) | 632M (ViT-H) | 100M (BERT) | 0.21 | 0.82 | 0.46 |

## Licenses

[The Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)

## Citation
```
@misc{clip-japanese-base,
    title = {CLIP Japanese Base},
    author = {Shuhei Yokoo, Shuntaro Okada, Peifei Zhu, Shuhei Nishimura and Naoki Takayama},
    url = {https://huggingface.co/line-corporation/clip-japanese-base},
}
```
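The card mentions retrieval but only shows classification-style scoring. The following is a minimal text-to-image retrieval sketch, not part of the upload: it reuses `model`, `processor`, and `tokenizer` from the snippet above, and the image list `images` and the Japanese query are illustrative assumptions.

```python
import torch

# Embed a set of candidate images and a single text query, then rank the
# images by cosine similarity (features are L2-normalized first).
with torch.no_grad():
    image_embs = model.get_image_features(**processor(images, return_tensors="pt"))
    text_embs = model.get_text_features(**tokenizer(["桜が咲いている公園"]))
    image_embs = image_embs / image_embs.norm(dim=-1, keepdim=True)
    text_embs = text_embs / text_embs.norm(dim=-1, keepdim=True)
    ranking = (text_embs @ image_embs.T).squeeze(0).argsort(descending=True)

print(ranking[:5])  # indices of the five best-matching images for the query
```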
config.json
ADDED
@@ -0,0 +1,48 @@
{
  "_name_or_path": "./lycorp/clyp-eva02-b-16",
  "architectures": [
    "CLYPModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_clyp.CLYPConfig",
    "AutoModel": "modeling_clyp.CLYPModel"
  },
  "itc_loss_config": null,
  "learn_temperature": true,
  "model_type": "clyp",
  "temperature_init": 0.07,
  "temperature_max": 1000.0,
  "temperature_min": 0.01,
  "text_encoder_config": {
    "backbone_config": {
      "model_name": "rinna/japanese-clip-vit-b-16"
    },
    "neck_config": {
      "bias": false,
      "in_channels": 768,
      "out_channels": 512
    },
    "pooler_config": {
      "input_type": "huggingface",
      "return_patch_features": false
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.39.1",
  "vision_encoder_config": {
    "backbone_config": {
      "extra_kwargs": {},
      "model_name": "eva02_base_patch16_clip_224.merged2b",
      "pretrained": true
    },
    "neck_config": {
      "bias": false,
      "in_channels": 768,
      "out_channels": 512
    },
    "pooler_config": {
      "input_type": "timm",
      "return_patch_features": false
    }
  }
}
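The `auto_map` block above is what lets the generic `Auto*` classes resolve the custom code shipped in this repository. A minimal sketch of loading this config (the repo id is taken from the README, and the printed values mirror the JSON above):

```python
from transformers import AutoConfig

# auto_map routes AutoConfig to configuration_clyp.CLYPConfig in the repo,
# so trust_remote_code=True is required.
config = AutoConfig.from_pretrained(
    "line-corporation/clip-japanese-base", trust_remote_code=True
)
print(type(config).__name__)  # CLYPConfig
print(config.model_type)      # "clyp"
print(config.vision_encoder_config.backbone_config.model_name)  # Eva02-B timm name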
configuration_clyp.py
ADDED
@@ -0,0 +1,199 @@
# coding=utf-8

# Copyright 2024 LY Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

from typing import Any, Literal, Optional

from transformers import PretrainedConfig


class CLYPConfig(PretrainedConfig):
    model_type = "clyp"

    def __init__(
        self,
        vision_encoder_config: Optional[dict] = None,
        text_encoder_config: Optional[dict] = None,
        itc_loss_config: Optional[dict] = None,
        learn_temperature: bool = True,
        temperature_init: float = 0.07,
        temperature_min: float = 0.01,
        temperature_max: float = 1000.0,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        vision_encoder_config = vision_encoder_config or {}
        text_encoder_config = text_encoder_config or {}
        self.vision_encoder_config = CLYPVisionEncoderConfig(**vision_encoder_config)
        self.text_encoder_config = CLYPTextEncoderConfig(**text_encoder_config)
        self.itc_loss_config = (
            CLYPLossConfig(**itc_loss_config) if itc_loss_config else None
        )
        self.learn_temperature = learn_temperature
        self.temperature_init = temperature_init
        self.temperature_min = temperature_min
        self.temperature_max = temperature_max

    def to_diff_dict(self) -> dict[str, Any]:
        serializable_config_dict = super().to_diff_dict()
        sub_serializable_config_dict = {
            "vision_encoder_config": _to_diff_dict(self.vision_encoder_config),
            "text_encoder_config": _to_diff_dict(self.text_encoder_config),
        }
        self.dict_torch_dtype_to_str(sub_serializable_config_dict)
        serializable_config_dict.update(sub_serializable_config_dict)
        return serializable_config_dict


class CLYPVisionEncoderConfig(PretrainedConfig):
    def __init__(
        self,
        backbone_config: Optional[dict] = None,
        pooler_config: Optional[dict] = None,
        neck_config: Optional[dict] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        backbone_config = backbone_config or {}
        pooler_config = pooler_config or {"input_type": "timm"}
        neck_config = neck_config or {}
        self.backbone_config = CLYPVisionBackboneConfig(**backbone_config)
        self.pooler_config = CLYPPoolerConfig(**pooler_config)
        self.neck_config = CLYPNeckConfig(**neck_config)

    def to_diff_dict(self) -> dict[str, Any]:
        serializable_config_dict = {
            "backbone_config": _to_diff_dict(self.backbone_config),
            "pooler_config": _to_diff_dict(self.pooler_config),
            "neck_config": _to_diff_dict(self.neck_config),
        }
        self.dict_torch_dtype_to_str(serializable_config_dict)
        return serializable_config_dict


class CLYPTextEncoderConfig(PretrainedConfig):
    def __init__(
        self,
        backbone_config: Optional[dict] = None,
        pooler_config: Optional[dict] = None,
        neck_config: Optional[dict] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        backbone_config = backbone_config or {}
        pooler_config = pooler_config or {"input_type": "huggingface"}
        neck_config = neck_config or {}
        self.backbone_config = CLYPTextBackboneConfig(**backbone_config)
        self.pooler_config = CLYPPoolerConfig(**pooler_config)
        self.neck_config = CLYPNeckConfig(**neck_config)

    def to_diff_dict(self) -> dict[str, Any]:
        serializable_config_dict = {
            "backbone_config": _to_diff_dict(self.backbone_config),
            "pooler_config": _to_diff_dict(self.pooler_config),
            "neck_config": _to_diff_dict(self.neck_config),
        }
        self.dict_torch_dtype_to_str(serializable_config_dict)
        return serializable_config_dict


class CLYPVisionBackboneConfig(PretrainedConfig):
    def __init__(
        self,
        model_name: str = "eva02_base_patch16_clip_224.merged2b",
        pretrained: bool = True,
        extra_kwargs: Optional[dict] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.model_name = model_name
        self.pretrained = pretrained
        self.extra_kwargs = extra_kwargs or {}


class CLYPTextBackboneConfig(PretrainedConfig):
    def __init__(
        self,
        model_name: str = "rinna/japanese-clip-vit-b-16",
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.model_name = model_name


class CLYPPoolerConfig(PretrainedConfig):
    def __init__(
        self,
        input_type: Literal["timm", "huggingface"] | None = None,
        return_patch_features: bool = False,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.input_type = input_type
        self.return_patch_features = return_patch_features


class CLYPNeckConfig(PretrainedConfig):
    def __init__(
        self,
        in_channels: int = 768,
        out_channels: int = 512,
        bias: bool = False,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.bias = bias


class CLYPLossConfig(PretrainedConfig):
    def __init__(
        self,
        learn_temperature: bool = True,
        init_temperature: float = 0.07,
        max_temperature: Optional[float] = None,
        min_temperature: Optional[float] = None,
        label_smoothing: float = 0.0,
        gather_with_grad: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.learn_temperature = learn_temperature
        self.init_temperature = init_temperature
        self.max_temperature = max_temperature
        self.min_temperature = min_temperature
        self.label_smoothing = label_smoothing
        self.gather_with_grad = gather_with_grad


def _to_diff_dict(c: PretrainedConfig) -> dict:
    """Function to override PretrainedConfig.to_diff_dict()

    NOTE
    ----
    In transformers==4.38.1,
    PretrainedConfig.__repr__ may not be able to show configs that have some sub-configs
    """
    d = c.to_diff_dict()
    if "transformers_version" in d:
        d.pop("transformers_version")
    return d


if __name__ == "__main__":
    conf = CLYPConfig.from_pretrained("config.json")
    print(conf)
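As a usage note, the nested dicts from `config.json` are converted into the sub-config classes above at construction time. A minimal sketch (the values mirror the shipped config; nothing here is required by the upload itself):

```python
from configuration_clyp import CLYPConfig

# Nested dicts become CLYPVisionEncoderConfig / CLYPTextEncoderConfig, which in
# turn build backbone/pooler/neck sub-configs, falling back to their defaults.
config = CLYPConfig(
    vision_encoder_config={
        "backbone_config": {"model_name": "eva02_base_patch16_clip_224.merged2b"},
        "pooler_config": {"input_type": "timm"},
        "neck_config": {"in_channels": 768, "out_channels": 512, "bias": False},
    },
    text_encoder_config={
        "backbone_config": {"model_name": "rinna/japanese-clip-vit-b-16"},
        "pooler_config": {"input_type": "huggingface"},
    },
)
print(config.vision_encoder_config.pooler_config.input_type)  # "timm"
print(config.temperature_init, config.temperature_min, config.temperature_max)
```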
image_processing_clyp.py
ADDED
@@ -0,0 +1,226 @@
# coding=utf-8

# Copyright 2024 LY Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

from typing import Literal, Optional

import torch
import torch.nn as nn
import torchvision.transforms as T
import torchvision.transforms.functional as F
from PIL import Image
from timm.data import (
    IMAGENET_INCEPTION_MEAN,
    IMAGENET_INCEPTION_STD,
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
)
from timm.data.transforms_factory import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_utils import ImageInput, make_list_of_images
from transformers.utils import TensorType

NormalizationType = Literal["imagenet", "imagenet_inception", "openai_clip"]


class CLYPImageProcessor(BaseImageProcessor):
    def __init__(
        self,
        image_size: int = 224,
        normalization_type: NormalizationType = "imagenet",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.image_size = image_size
        self.normalization_type: NormalizationType = normalization_type

    def preprocess(
        self,
        images: ImageInput | list[ImageInput],
        return_tensors: Optional[str | TensorType] = None,
        **kwargs,
    ) -> BatchFeature:
        images = make_list_of_images(images, expected_ndims=3)
        # TODO: Support train
        transforms = TestTransform(
            self.image_size, normalization_type=self.normalization_type
        )
        images = [transforms(image).numpy() for image in images]
        return BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)


class TrainTransform:
    def __init__(
        self,
        image_size: int,
        scale_range_min: float,
        scale_range_max: float,
        normalization_type: NormalizationType = "imagenet",
    ) -> None:
        """
        Args:
            image_size (int): output-image size.
            scale_range_min (float): minimum value of the scale to crop an input image.
            scale_range_max (float): maximum value of the scale to crop an input image.
            normalization_type (str): select mean and std for normalization (see get_mean_and_std).
        """
        scale = (scale_range_min, scale_range_max)
        mean_and_std = get_mean_and_std(normalization_type)

        self.transform = T.Compose(
            [
                T.RandomResizedCrop(
                    image_size, scale=scale, interpolation=T.InterpolationMode.BICUBIC
                ),
                _convert_to_rgb,
                T.ToTensor(),
                T.Normalize(**mean_and_std),
            ]
        )

    def __call__(self, img):
        return self.transform(img)


class TestTransform:
    def __init__(
        self, image_size: int, normalization_type: NormalizationType = "imagenet"
    ) -> None:
        """
        Args:
            image_size (int): output-image size.
            normalization_type (str): select mean and std for normalization (see get_mean_and_std).
        """
        mean_and_std = get_mean_and_std(normalization_type)

        self.transform = T.Compose(
            [
                ResizeMaxSize(image_size, fill=0),
                T.CenterCrop(image_size),
                _convert_to_rgb,
                T.ToTensor(),
                T.Normalize(**mean_and_std),
            ]
        )

    def __call__(self, img):
        return self.transform(img)


class SmallestMaxSize(T.Resize):
    """Resize the shorter side of an input image.

    The shorter side of an input image is resized to the max_size.
    Note that a large part of the input image is discarded when the aspect-ratio value of the input image is extremely small or large.
    """

    def __init__(self, max_size: int, **kwargs):
        super().__init__(max_size, **kwargs)

    @staticmethod
    def target_size(w: int, h: int, size: int) -> tuple[int, int]:
        if h < w:
            w, h = int(size * w / h), size
        else:
            w, h = size, int(size * h / w)
        return (h, w)

    def __call__(self, img):
        size = self.size
        assert isinstance(size, int)
        w, h = img.size
        target_size = self.target_size(w, h, size)
        return F.resize(img, list(target_size), self.interpolation)


class ResizeMaxSize(nn.Module):
    """Resize the longer side of an input image.

    The longer side of an input image is resized to the max_size.
    Note that a large part of the output image is padded when the aspect-ratio value of the input image is extremely small or large.
    Adapted from https://github.com/mlfoundations/open_clip/blob/main/src/open_clip/transform.py
    """

    def __init__(
        self,
        max_size: int,
        interpolation: T.InterpolationMode = T.InterpolationMode.BICUBIC,
        fn: str = "max",
        fill: int = 0,
    ):
        super().__init__()
        if not isinstance(max_size, int):
            raise TypeError(f"Size should be int. Got {type(max_size)}")
        self.max_size = max_size
        self.interpolation = interpolation
        # "max" resizes by the longer side, matching the open_clip reference implementation.
        self.fn = min if fn == "min" else max
        self.fill = fill

    def forward(self, img):
        if isinstance(img, torch.Tensor):
            height, width = img.shape[:2]
        else:
            width, height = img.size
        scale = self.max_size / float(max(height, width))
        if scale != 1.0:
            new_size = tuple(round(dim * scale) for dim in (height, width))
            img = F.resize(img, new_size, self.interpolation)  # type: ignore
            pad_h = self.max_size - new_size[0]
            pad_w = self.max_size - new_size[1]
            img = F.pad(
                img,
                padding=[
                    pad_w // 2,
                    pad_h // 2,
                    pad_w - pad_w // 2,
                    pad_h - pad_h // 2,
                ],
                fill=self.fill,
            )
        return img


def get_mean_and_std(normalization_type: NormalizationType) -> dict:
    """Return mean and std tensors for T.Normalize()
    NOTE:
        IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
        IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
        IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5)
        IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5)
        OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
        OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
    """
    if normalization_type == "imagenet":
        return {
            "mean": torch.tensor(IMAGENET_DEFAULT_MEAN),
            "std": torch.tensor(IMAGENET_DEFAULT_STD),
        }
    elif normalization_type == "imagenet_inception":
        return {
            "mean": torch.tensor(IMAGENET_INCEPTION_MEAN),
            "std": torch.tensor(IMAGENET_INCEPTION_STD),
        }
    elif normalization_type == "openai_clip":
        return {
            "mean": torch.tensor(OPENAI_CLIP_MEAN),
            "std": torch.tensor(OPENAI_CLIP_STD),
        }
    else:
        raise ValueError(normalization_type)


def _convert_to_rgb(image: Image.Image) -> Image.Image:
    return image.convert("RGB")
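A minimal sketch of the test-time pipeline defined above (`ResizeMaxSize` to 224 on the longer side, zero padding, center crop, ImageNet normalization); the dummy image and its size are illustrative assumptions, not part of the upload:

```python
from PIL import Image

from image_processing_clyp import CLYPImageProcessor

# A non-square dummy image: the longer side is resized to 224, the shorter
# side is zero-padded, and the center crop yields one 3x224x224 tensor.
img = Image.new("RGB", (640, 360), color=(120, 60, 30))
processor = CLYPImageProcessor(image_size=224, normalization_type="imagenet")
batch = processor(img, return_tensors="pt")
print(batch["pixel_values"].shape)  # (1, 3, 224, 224)
```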
ja-imagenet-1k-classnames.txt
ADDED
@@ -0,0 +1,1000 @@
1 |
+
テンチ
|
2 |
+
金魚
|
3 |
+
ホホジロザメ
|
4 |
+
イタチザメ
|
5 |
+
シュモクザメ
|
6 |
+
シビレエイ
|
7 |
+
アカエイ
|
8 |
+
雄鶏,おんどり
|
9 |
+
雌鶏,めんどり
|
10 |
+
ダチョウ
|
11 |
+
アトリ
|
12 |
+
ゴシキヒワ
|
13 |
+
メキシコマシコ
|
14 |
+
ユキヒメドリ
|
15 |
+
ルリノジコ
|
16 |
+
コマツグミ
|
17 |
+
目黒
|
18 |
+
カケス
|
19 |
+
カササギ
|
20 |
+
四十雀,シジュウカラ
|
21 |
+
カワガラス
|
22 |
+
トビ
|
23 |
+
ハクトウワシ,白頭鷲
|
24 |
+
ハゲワシ
|
25 |
+
カラフトフクロウ
|
26 |
+
ファイアサラマンダー
|
27 |
+
スベイモリ,オビイモリ
|
28 |
+
イモリ
|
29 |
+
スポテッドサラマンダー,キボシサンショウウオ
|
30 |
+
アホロートル
|
31 |
+
ウシガエル
|
32 |
+
アマガエル
|
33 |
+
オガエル
|
34 |
+
アカウミガメ
|
35 |
+
オサガメ
|
36 |
+
鼈,ドロガメ
|
37 |
+
スッポン
|
38 |
+
ハコガメ
|
39 |
+
バンドトカゲモドキ
|
40 |
+
イグアナ
|
41 |
+
グリーンアノール
|
42 |
+
ハシリトカゲ
|
43 |
+
アガマトカゲ
|
44 |
+
エリマキトカゲ
|
45 |
+
アシナシトカゲ
|
46 |
+
アメリカドクトカゲ
|
47 |
+
ミドリカナヘビ
|
48 |
+
カメレオン
|
49 |
+
コモドオオトカゲ
|
50 |
+
ナイルワニ
|
51 |
+
ミシシッピワニ
|
52 |
+
トリケラトプス
|
53 |
+
盲蛇,ミミズヘビ
|
54 |
+
リングネックスネーク
|
55 |
+
トウブシシバナヘビ
|
56 |
+
緑のヘビ
|
57 |
+
キングスネーク
|
58 |
+
ガータースネーク
|
59 |
+
ミズヘビ
|
60 |
+
ツルヘビ
|
61 |
+
夜行性のヘビ
|
62 |
+
ボアコンストリクター
|
63 |
+
アフリカニシキヘビ
|
64 |
+
インドコブラ
|
65 |
+
グリーンマンバ
|
66 |
+
ウミヘビ
|
67 |
+
サハラツノクサリヘビ
|
68 |
+
ダイヤガラガラヘビ
|
69 |
+
ヨコバイガラガラヘビ
|
70 |
+
三葉虫
|
71 |
+
ザトウムシ
|
72 |
+
サソリ
|
73 |
+
コガネグモ
|
74 |
+
納屋クモ
|
75 |
+
オニグモ
|
76 |
+
クロゴケグモ
|
77 |
+
タランチュラ
|
78 |
+
ドクグモ
|
79 |
+
ダニ
|
80 |
+
ムカデ
|
81 |
+
クロライチョウ
|
82 |
+
ライチョウ,雷鳥
|
83 |
+
エリマキライチョウ
|
84 |
+
茶色の斑紋のあるライチョウ
|
85 |
+
クジャク
|
86 |
+
ウズラ
|
87 |
+
ヤマウズラ
|
88 |
+
ヨウム
|
89 |
+
コンゴウインコ
|
90 |
+
キバタン
|
91 |
+
ヒインコ
|
92 |
+
バンケン
|
93 |
+
ハチクイ
|
94 |
+
サイチョウ
|
95 |
+
ハチドリ
|
96 |
+
キリハシ,錐嘴
|
97 |
+
オオハシ
|
98 |
+
アヒル
|
99 |
+
ウミアイサ
|
100 |
+
ガチョウ
|
101 |
+
コクチョウ,黒鳥
|
102 |
+
牙を持つ動物
|
103 |
+
ハリモグラ
|
104 |
+
カモノハシ
|
105 |
+
ワラビー
|
106 |
+
コアラ
|
107 |
+
ウォンバット
|
108 |
+
クラゲ
|
109 |
+
イソギンチャク
|
110 |
+
脳珊瑚
|
111 |
+
扁形動物
|
112 |
+
線虫
|
113 |
+
ホラガイ,巻き貝
|
114 |
+
カタツムリ
|
115 |
+
ナメクジ
|
116 |
+
ウミウシ
|
117 |
+
ヒザラガイ,多板綱
|
118 |
+
オウムガイ
|
119 |
+
アメリカイチョウガニ
|
120 |
+
イワガニ
|
121 |
+
シオマネキ
|
122 |
+
タラバガニ
|
123 |
+
アメリカンロブスター
|
124 |
+
伊勢エビ
|
125 |
+
ザリガニ
|
126 |
+
ヤドカリ
|
127 |
+
ワラジムシ,等脚類
|
128 |
+
コウノトリ
|
129 |
+
ナベコウ
|
130 |
+
ヘラサギ
|
131 |
+
フラミンゴ
|
132 |
+
ヒメアカクロサギ
|
133 |
+
ダイサギ
|
134 |
+
ヨシゴイ
|
135 |
+
ツル
|
136 |
+
ツルモドキ
|
137 |
+
バン,鷭
|
138 |
+
アメリカオオバン
|
139 |
+
ノガン
|
140 |
+
キョウジョシギ
|
141 |
+
ハマシギ
|
142 |
+
アカアシシギ
|
143 |
+
オオハシシギ
|
144 |
+
ミヤコドリ
|
145 |
+
ペリカン
|
146 |
+
キングペンギン
|
147 |
+
アホウドリ,アルバトロス
|
148 |
+
コククジラ
|
149 |
+
シャチ,鯱
|
150 |
+
ジュゴン
|
151 |
+
アシカ
|
152 |
+
チワワ
|
153 |
+
狆
|
154 |
+
マルチーズ
|
155 |
+
ペキニーズ
|
156 |
+
シーズー
|
157 |
+
キングチャールズスパニエル
|
158 |
+
パピヨン
|
159 |
+
トイテリア
|
160 |
+
ローデシアン・リッジバック
|
161 |
+
アフガンハウンド
|
162 |
+
バセットハウンド
|
163 |
+
ビーグル
|
164 |
+
ブラッドハウンド
|
165 |
+
ブルーティッククーンハウンド
|
166 |
+
ブラック・アンド・タン・クーンハウンド
|
167 |
+
ツリーイング・ウォーカー・クーンハウンド
|
168 |
+
イングリッシュ・フォックスハウンド
|
169 |
+
レッドボーン・クーンハウンド
|
170 |
+
ボルゾイ
|
171 |
+
アイリッシュウルフハウンド
|
172 |
+
イタリアン・グレーハウンド
|
173 |
+
ウィペット
|
174 |
+
イビサン・ハウンド
|
175 |
+
ノルウェージャン・エルクハウンド
|
176 |
+
オッターハウンド
|
177 |
+
サルーキ
|
178 |
+
スコティッシュ・ディアハウンド
|
179 |
+
ワイマラナー
|
180 |
+
スタッフォードシャーブルテリア
|
181 |
+
アメリカンスタッフォードシャーテリア
|
182 |
+
ベドリントンテリア
|
183 |
+
ボーダーテリア
|
184 |
+
ケリーブルーテリア
|
185 |
+
アイリッシュテリア
|
186 |
+
ノーフォークテリア
|
187 |
+
ノーリッチテリア
|
188 |
+
ヨークシャーテリア
|
189 |
+
ワイヤーフォックステリア
|
190 |
+
レークランドテリア
|
191 |
+
シーリーハムテリア
|
192 |
+
エアデールテリア
|
193 |
+
ケアーン・テリア
|
194 |
+
オーストラリアン・テリア
|
195 |
+
ダンディ・ディンモント・テリア
|
196 |
+
ボストンテリア
|
197 |
+
ミニチュア・シュナウザー
|
198 |
+
ジャイアント・シュナウザー
|
199 |
+
スタンダード・シュナウザー
|
200 |
+
スコッチテリア
|
201 |
+
チベタンテリア
|
202 |
+
オーストラリアン・シルキー・テリア
|
203 |
+
ソフトコーテッド・ウィートン・テリア
|
204 |
+
ウエスト・ハイランド・ホワイト・テリア
|
205 |
+
ラサ・アプソ
|
206 |
+
フラットコーテッド・レトリーバー
|
207 |
+
カーリーコーテッド・レトリーバー
|
208 |
+
ゴールデン・レトリバー
|
209 |
+
ラブラドール・レトリバー
|
210 |
+
チェサピーク・ベイ・レトリーバー
|
211 |
+
ジャーマン・ショートヘア・ポインタ
|
212 |
+
ビズラ
|
213 |
+
イングリッシュ・セッター
|
214 |
+
アイリッシュ・セッター
|
215 |
+
ゴードン・セッター
|
216 |
+
ブリタニー・スパニエル
|
217 |
+
クラムバー・スパニエル
|
218 |
+
イングリッシュ・スプリンガー・スパニエル
|
219 |
+
ウェルシュ・スプリンガー・スパニエル
|
220 |
+
コッカー・スパニエル
|
221 |
+
サセックス・スパニエル
|
222 |
+
アイリッシュ・ウォーター・スパニエル
|
223 |
+
クバース犬
|
224 |
+
スキッパーキー
|
225 |
+
ベルジアン・シェパード・ドッグ・グローネンダール
|
226 |
+
マリノア
|
227 |
+
ブリアール
|
228 |
+
オーストラリアン・ケルピー
|
229 |
+
コモンドール
|
230 |
+
オールドイングリッシュシープドッグ
|
231 |
+
シェットランド・シープドッグ
|
232 |
+
コリー
|
233 |
+
ボーダー・コリー
|
234 |
+
ブーヴィエ・デ・フランドル
|
235 |
+
ロットワイラー
|
236 |
+
ジャーマンシェパード
|
237 |
+
ドーベルマン
|
238 |
+
ミニチュア・ピンシャー
|
239 |
+
グレータースイス・マウンテンドッグ
|
240 |
+
バーニーズ・マウンテン・ドッグ
|
241 |
+
アッペンツェラー・キャトル・ドッグ
|
242 |
+
エントレブッハー・キャトル・ドッグ
|
243 |
+
ボクサー犬
|
244 |
+
ブルマスティフ
|
245 |
+
チベタンマスティフ
|
246 |
+
フレンチブルドッグ
|
247 |
+
グレートデン
|
248 |
+
セントバーナード
|
249 |
+
エスキモー犬
|
250 |
+
アラスカン・マラミュート
|
251 |
+
シベリアンハスキー
|
252 |
+
ダルメシアン
|
253 |
+
アーフェンピンシャー
|
254 |
+
バセンジー
|
255 |
+
パグ
|
256 |
+
レオンベルガー
|
257 |
+
ニューファンドランド犬
|
258 |
+
グレートピレニーズ
|
259 |
+
サモエド
|
260 |
+
ポメラニアン
|
261 |
+
チャウチャウ
|
262 |
+
キースホンド
|
263 |
+
ブラバンソングリフォン
|
264 |
+
ペンブローク
|
265 |
+
ウェルシュコーギーカーディガン
|
266 |
+
トイプードル
|
267 |
+
ミニチュアプードル
|
268 |
+
スタンダードプードル
|
269 |
+
メキシカン・ヘアレス・ドッグ
|
270 |
+
ハイイロオオカミ
|
271 |
+
白いオオカミ
|
272 |
+
レッドウルフ
|
273 |
+
コヨーテ
|
274 |
+
ディンゴ
|
275 |
+
ドール,豺
|
276 |
+
リカオン
|
277 |
+
ハイエナ
|
278 |
+
アカギツネ
|
279 |
+
キットギツネ
|
280 |
+
ホッキョクギツネ
|
281 |
+
ハイイロギツネ
|
282 |
+
トラネコ
|
283 |
+
ジャガーネコ
|
284 |
+
ペルシャ猫
|
285 |
+
シャム猫
|
286 |
+
エジプシャンマウ
|
287 |
+
ピューマ,クーガー
|
288 |
+
オオヤマネコ
|
289 |
+
ヒョウ
|
290 |
+
ユキヒョウ
|
291 |
+
ジャガー
|
292 |
+
ライオン
|
293 |
+
虎
|
294 |
+
チーター
|
295 |
+
ヒグマ
|
296 |
+
アメリカグマ
|
297 |
+
ホッキョクグマ
|
298 |
+
ナマケグマ
|
299 |
+
マングース
|
300 |
+
ミーアキャット
|
301 |
+
ハンミョウ
|
302 |
+
てんとう虫
|
303 |
+
オサムシ
|
304 |
+
カミキリムシ
|
305 |
+
ハムシ
|
306 |
+
スカラベ,フンコロガシ
|
307 |
+
カブトムシ
|
308 |
+
ゾウムシ
|
309 |
+
ハエ
|
310 |
+
蜂
|
311 |
+
蟻
|
312 |
+
バッタ
|
313 |
+
コオロギ
|
314 |
+
ナナフシ
|
315 |
+
ゴキブリ
|
316 |
+
カマキリ
|
317 |
+
蝉
|
318 |
+
ヨコバイ
|
319 |
+
クサカゲロウ
|
320 |
+
トンボ
|
321 |
+
イトトンボ
|
322 |
+
ヨーロッパアカタテハ
|
323 |
+
ジャノメチョウ
|
324 |
+
オオカバマダラ
|
325 |
+
モンシロチョウ
|
326 |
+
キチョウ,黄色の蝶
|
327 |
+
ゴイシシジミ,シジミチョウ
|
328 |
+
ヒトデ
|
329 |
+
ウニ,海胆,雲丹
|
330 |
+
ナマコ
|
331 |
+
ワタオウサギ
|
332 |
+
野ウサギ
|
333 |
+
アンゴラウサギ
|
334 |
+
ハムスター
|
335 |
+
ヤマアラシ
|
336 |
+
キツネリス
|
337 |
+
マーモット
|
338 |
+
ビーバー
|
339 |
+
モルモット
|
340 |
+
栗毛の馬
|
341 |
+
シマウマ
|
342 |
+
豚
|
343 |
+
イノシシ
|
344 |
+
イボイノシシ
|
345 |
+
カバ
|
346 |
+
雄牛
|
347 |
+
水牛
|
348 |
+
バイソン
|
349 |
+
牡羊,雄羊
|
350 |
+
ビッグホーン
|
351 |
+
アイベックス
|
352 |
+
ハーテビースト
|
353 |
+
インパラ
|
354 |
+
ガゼル
|
355 |
+
アラビアラクダ
|
356 |
+
ラマ
|
357 |
+
イタチ
|
358 |
+
ミンク
|
359 |
+
ヨーロッパケナガイタチ
|
360 |
+
クロアシイタチ
|
361 |
+
カワウソ
|
362 |
+
スカンク
|
363 |
+
アナグマ
|
364 |
+
アルマジロ
|
365 |
+
ミユビナマケモノ
|
366 |
+
オランウータン
|
367 |
+
ゴリラ
|
368 |
+
チンパンジー
|
369 |
+
テナガザル
|
370 |
+
フクロテナガザル
|
371 |
+
オナガザル
|
372 |
+
パタスモンキー
|
373 |
+
ヒヒ
|
374 |
+
マカク
|
375 |
+
ラングール,ヤセザル
|
376 |
+
コロブス
|
377 |
+
テングザル
|
378 |
+
マーモセット
|
379 |
+
オマキザル
|
380 |
+
ハウラ,ホエザル
|
381 |
+
ティティ
|
382 |
+
クモザル
|
383 |
+
リスザル
|
384 |
+
ワオキツネザル
|
385 |
+
インドリ
|
386 |
+
インドゾウ
|
387 |
+
アフリカゾウ
|
388 |
+
レッサーパンダ
|
389 |
+
ジャイアントパンダ
|
390 |
+
スヌーク
|
391 |
+
ウナギ
|
392 |
+
ギンザケ,銀鮭
|
393 |
+
ロックビューティーエンゼルフィッシュ
|
394 |
+
クマノミ
|
395 |
+
チョウザメ
|
396 |
+
ガーフィッシュ
|
397 |
+
ミノカサゴ
|
398 |
+
フグ
|
399 |
+
そろばん
|
400 |
+
アバヤ,アラブの民族衣装
|
401 |
+
アカデミックガウン,法服
|
402 |
+
アコーディオン
|
403 |
+
アコースティックギター
|
404 |
+
空母
|
405 |
+
旅客機
|
406 |
+
飛行船
|
407 |
+
祭壇
|
408 |
+
救急車
|
409 |
+
水陸両用車
|
410 |
+
アナログ時計
|
411 |
+
養蜂場
|
412 |
+
エプロン
|
413 |
+
ごみ箱
|
414 |
+
アサルトライフル
|
415 |
+
リュック,バックパック
|
416 |
+
パン屋,ベーカリー
|
417 |
+
平均台
|
418 |
+
バルーン,気球,風船
|
419 |
+
ボールペン
|
420 |
+
絆創膏
|
421 |
+
バンジョー
|
422 |
+
手すり
|
423 |
+
バーベル
|
424 |
+
理髪店のいす
|
425 |
+
理髪店
|
426 |
+
納屋
|
427 |
+
バロメーター,気圧計
|
428 |
+
樽
|
429 |
+
手押し車
|
430 |
+
野球ボール
|
431 |
+
バスケットボール
|
432 |
+
バシネット
|
433 |
+
バスーン,ファゴット
|
434 |
+
水泳帽
|
435 |
+
バスタオル
|
436 |
+
浴槽
|
437 |
+
ステーションワゴン
|
438 |
+
灯台
|
439 |
+
ビーカー
|
440 |
+
シャコー帽
|
441 |
+
ビール瓶
|
442 |
+
ビールグラス
|
443 |
+
鐘塔,鐘楼
|
444 |
+
よだれ掛け
|
445 |
+
タンデム自転車
|
446 |
+
ビキニ
|
447 |
+
バインダー
|
448 |
+
双眼鏡
|
449 |
+
巣箱,鳥小屋
|
450 |
+
ボートハウス
|
451 |
+
ボブスレー
|
452 |
+
ループタイ
|
453 |
+
ボンネット
|
454 |
+
本棚
|
455 |
+
書店
|
456 |
+
瓶の蓋
|
457 |
+
狩猟弓
|
458 |
+
蝶ネクタイ
|
459 |
+
真鍮記念プレート
|
460 |
+
ブラジャー
|
461 |
+
防波堤
|
462 |
+
鎧の胸当て
|
463 |
+
ほうき
|
464 |
+
バケツ
|
465 |
+
バックル
|
466 |
+
防弾チョッキ
|
467 |
+
新幹線
|
468 |
+
精肉店
|
469 |
+
タクシー
|
470 |
+
大釜
|
471 |
+
キャンドル
|
472 |
+
大砲
|
473 |
+
カヌー
|
474 |
+
缶切り
|
475 |
+
カーディガン
|
476 |
+
車のミラー
|
477 |
+
メリーゴーランド,回転木馬
|
478 |
+
工具セット
|
479 |
+
段ボール箱
|
480 |
+
車輪
|
481 |
+
ATM
|
482 |
+
カセットテープ
|
483 |
+
カセットプレーヤー
|
484 |
+
城
|
485 |
+
カタマラン
|
486 |
+
CDプレーヤー
|
487 |
+
チェロ
|
488 |
+
携帯電話
|
489 |
+
鎖
|
490 |
+
金網フェンス
|
491 |
+
鎖帷子,鎖かたびら
|
492 |
+
チェーンソー
|
493 |
+
チェスト,収納
|
494 |
+
西洋だんす,シフォニア
|
495 |
+
チャイム,ベル,鐘
|
496 |
+
食器棚
|
497 |
+
クリスマスストッキング
|
498 |
+
教会
|
499 |
+
映画館
|
500 |
+
チョッパー,肉包丁,クリーバー
|
501 |
+
崖の住居
|
502 |
+
マント
|
503 |
+
サボ,下駄
|
504 |
+
カクテルシェーカー
|
505 |
+
コーヒーマグ
|
506 |
+
コーヒーポット
|
507 |
+
コイル
|
508 |
+
組み合わせ錠,ダイヤル錠
|
509 |
+
コンピュータキーボード
|
510 |
+
菓子屋
|
511 |
+
コンテナ船
|
512 |
+
オープンカー,コンバーチブル
|
513 |
+
コルク抜き
|
514 |
+
コルネット
|
515 |
+
カウボーイブーツ
|
516 |
+
カウボーイハット
|
517 |
+
ゆりかご
|
518 |
+
クレーン
|
519 |
+
クラッシュヘルメット
|
520 |
+
木箱
|
521 |
+
ベビーベッド
|
522 |
+
スロークッカー
|
523 |
+
クロケットボール
|
524 |
+
松葉杖
|
525 |
+
キュイラス,胸当て
|
526 |
+
ダム
|
527 |
+
机
|
528 |
+
デスクトップコンピューター
|
529 |
+
ダイヤル電話
|
530 |
+
おむつ
|
531 |
+
デジタル時計
|
532 |
+
デジタル腕時計
|
533 |
+
ダイニングテーブル
|
534 |
+
布巾
|
535 |
+
食器洗い機
|
536 |
+
ディスクブレーキ
|
537 |
+
ドック,船着き場
|
538 |
+
犬ぞり
|
539 |
+
ドーム
|
540 |
+
玄関マット
|
541 |
+
掘削リグ
|
542 |
+
ドラム
|
543 |
+
ドラムスティック
|
544 |
+
ダンベル
|
545 |
+
ダッチオーブン
|
546 |
+
扇風機
|
547 |
+
エレキギター
|
548 |
+
電気機関車
|
549 |
+
娯楽施設
|
550 |
+
封筒
|
551 |
+
エスプレッソマシーン
|
552 |
+
フェースパウダー
|
553 |
+
フェザーボア
|
554 |
+
バインダー,書類キャビネット
|
555 |
+
消防艇
|
556 |
+
消防車
|
557 |
+
防火用スクリーン
|
558 |
+
旗竿
|
559 |
+
フルート
|
560 |
+
折りたたみ椅子
|
561 |
+
アメリカンフットボールのヘルメット
|
562 |
+
フォークリフト
|
563 |
+
噴水
|
564 |
+
万年筆
|
565 |
+
四柱ベッド
|
566 |
+
貨車
|
567 |
+
フレンチホルン
|
568 |
+
フライパン
|
569 |
+
毛皮のコート
|
570 |
+
ごみ収集車
|
571 |
+
ガスマスク
|
572 |
+
ガソリンポンプ
|
573 |
+
ゴブレット
|
574 |
+
ゴーカート
|
575 |
+
ゴルフボール
|
576 |
+
ゴルフカート
|
577 |
+
ゴンドラ
|
578 |
+
ゴング
|
579 |
+
ガウン
|
580 |
+
グランドピアノ
|
581 |
+
植木室,温室
|
582 |
+
ラジエーターグリル
|
583 |
+
食料品店
|
584 |
+
ギロチン
|
585 |
+
ヘアクリップ
|
586 |
+
ヘアスプレー
|
587 |
+
ハーフトラック
|
588 |
+
ハンマー
|
589 |
+
洗濯かご
|
590 |
+
ヘアドライヤー
|
591 |
+
携帯コンピュータ
|
592 |
+
ハンカチ
|
593 |
+
ハードディスクドライブ,HDD
|
594 |
+
ハーモニカ
|
595 |
+
ハープ,竪琴
|
596 |
+
刈り取り機,コンバイン
|
597 |
+
斧
|
598 |
+
ホルスター
|
599 |
+
ホームシアター
|
600 |
+
ハニカム
|
601 |
+
フック
|
602 |
+
フープスカート
|
603 |
+
鉄棒
|
604 |
+
馬車
|
605 |
+
砂時計
|
606 |
+
iPod,アイポッド
|
607 |
+
衣類用アイロン
|
608 |
+
ジャックオーランタン
|
609 |
+
ジーンズ
|
610 |
+
ジープ
|
611 |
+
Tシャツ
|
612 |
+
ジグソーパズル
|
613 |
+
人力車
|
614 |
+
ジョイスティック
|
615 |
+
着物
|
616 |
+
膝パッド
|
617 |
+
結び目
|
618 |
+
白衣
|
619 |
+
レードル,ひしゃく
|
620 |
+
ランプシェード,秉燭
|
621 |
+
ノートパソコン
|
622 |
+
芝刈り機
|
623 |
+
レンズキャップ
|
624 |
+
レターオープナー
|
625 |
+
図書館
|
626 |
+
救命ボート
|
627 |
+
ライター
|
628 |
+
リムジン
|
629 |
+
定期船
|
630 |
+
口紅
|
631 |
+
ローファー
|
632 |
+
ローション
|
633 |
+
スピーカー
|
634 |
+
ルーペ
|
635 |
+
製材所
|
636 |
+
磁気コンパス
|
637 |
+
メッセンジャーバッグ
|
638 |
+
郵便受け
|
639 |
+
タイツ
|
640 |
+
ワンピース水着
|
641 |
+
マンホールの蓋
|
642 |
+
マラカス
|
643 |
+
マリンバ
|
644 |
+
マスク,仮面
|
645 |
+
マッチ棒
|
646 |
+
メイポール,五月柱
|
647 |
+
迷路
|
648 |
+
計量カップ
|
649 |
+
薬箱
|
650 |
+
巨石
|
651 |
+
マイク
|
652 |
+
電子レンジ
|
653 |
+
軍服
|
654 |
+
ミルク缶
|
655 |
+
ミニバス
|
656 |
+
ミニスカート
|
657 |
+
ミニバン
|
658 |
+
ミサイル
|
659 |
+
ミトン
|
660 |
+
ミキシングボウル
|
661 |
+
移動式住宅
|
662 |
+
フォード・モデルT
|
663 |
+
モデム
|
664 |
+
修道院
|
665 |
+
モニター
|
666 |
+
モペット
|
667 |
+
乳鉢と乳棒
|
668 |
+
卒業帽
|
669 |
+
モスク
|
670 |
+
蚊帳
|
671 |
+
スクーター
|
672 |
+
マウンテンバイク
|
673 |
+
山のテント
|
674 |
+
コンピュータマウス
|
675 |
+
ネズミ捕り
|
676 |
+
引っ越しトラック
|
677 |
+
銃口
|
678 |
+
金属釘
|
679 |
+
ネックブレース
|
680 |
+
ネックレス
|
681 |
+
おしゃぶり
|
682 |
+
ノートパソコン
|
683 |
+
オベリスク
|
684 |
+
オーボエ
|
685 |
+
オカリナ
|
686 |
+
オドメーター
|
687 |
+
オイルフィルター
|
688 |
+
パイプオルガン
|
689 |
+
オシロスコープ
|
690 |
+
オーバースカート
|
691 |
+
牛車
|
692 |
+
酸素マスク
|
693 |
+
小包
|
694 |
+
パドル
|
695 |
+
パドルホイール
|
696 |
+
南京錠
|
697 |
+
絵筆
|
698 |
+
パジャマ
|
699 |
+
宮殿
|
700 |
+
パンフルート
|
701 |
+
ペーパータオル
|
702 |
+
パラシュート
|
703 |
+
平行棒
|
704 |
+
公園のベンチ
|
705 |
+
パーキングメーター
|
706 |
+
客車,鉄道車両
|
707 |
+
パティオ
|
708 |
+
公衆電話
|
709 |
+
台座
|
710 |
+
筆箱
|
711 |
+
鉛筆削り
|
712 |
+
香水
|
713 |
+
ペトリ皿
|
714 |
+
コピー機
|
715 |
+
ピック
|
716 |
+
ピッケルハウベ,スパイク付き鉄かぶと
|
717 |
+
ピケットフェンス
|
718 |
+
ピックアップトラック
|
719 |
+
桟橋
|
720 |
+
貯金箱
|
721 |
+
錠剤瓶
|
722 |
+
枕
|
723 |
+
ピンポン球
|
724 |
+
風車
|
725 |
+
海賊船
|
726 |
+
ピッチャー,水差し
|
727 |
+
角鉋,かんな
|
728 |
+
プラネタリウム
|
729 |
+
ビニール袋
|
730 |
+
皿立て
|
731 |
+
農耕用プラウ
|
732 |
+
プランジャー
|
733 |
+
ポラロイドカメラ
|
734 |
+
ポール
|
735 |
+
警察車
|
736 |
+
ポンチョ
|
737 |
+
ビリヤード台
|
738 |
+
ソーダボトル
|
739 |
+
植木鉢
|
740 |
+
ろくろ
|
741 |
+
電動ドリル
|
742 |
+
礼拝用敷物
|
743 |
+
プリンタ
|
744 |
+
刑務所
|
745 |
+
ミサイル
|
746 |
+
プロジェクター
|
747 |
+
ホッケーパック
|
748 |
+
サンドバッグ
|
749 |
+
がま口,銭入れ
|
750 |
+
羽ペン
|
751 |
+
キルト
|
752 |
+
レーシングカー
|
753 |
+
ラケット
|
754 |
+
ラジエーター
|
755 |
+
ラジオ,無線
|
756 |
+
電波望遠鏡
|
757 |
+
天水桶
|
758 |
+
キャンピングカー
|
759 |
+
釣りリール
|
760 |
+
一眼レフカメラ
|
761 |
+
冷蔵庫
|
762 |
+
リモコン
|
763 |
+
レストラン
|
764 |
+
リボルバー
|
765 |
+
ライフル
|
766 |
+
ロッキングチェア
|
767 |
+
焼��料理店
|
768 |
+
消しゴム
|
769 |
+
ラグビーボール
|
770 |
+
定規
|
771 |
+
スニーカー
|
772 |
+
金庫
|
773 |
+
安全ピン
|
774 |
+
塩入れ
|
775 |
+
サンダル
|
776 |
+
サロン
|
777 |
+
サックス
|
778 |
+
鞘
|
779 |
+
体重計
|
780 |
+
スクールバス
|
781 |
+
スクーナー
|
782 |
+
スコアボード
|
783 |
+
CRTモニター
|
784 |
+
ねじ,スクリュー
|
785 |
+
ドライバー
|
786 |
+
シートベルト
|
787 |
+
ミシン
|
788 |
+
盾
|
789 |
+
靴屋
|
790 |
+
障子
|
791 |
+
買い物かご
|
792 |
+
ショッピングカート
|
793 |
+
シャベル
|
794 |
+
シャワーキャップ
|
795 |
+
シャワーカーテン
|
796 |
+
スキー
|
797 |
+
スキーマスク
|
798 |
+
寝袋
|
799 |
+
計算尺
|
800 |
+
引戸
|
801 |
+
スロットマシン
|
802 |
+
シュノーケル
|
803 |
+
スノーモービル
|
804 |
+
除雪機
|
805 |
+
ソープディスペンサー
|
806 |
+
サッカーボール
|
807 |
+
靴下
|
808 |
+
太陽炉
|
809 |
+
ソンブレロ
|
810 |
+
スープ皿
|
811 |
+
スペースキー
|
812 |
+
スペースヒーター
|
813 |
+
スペースシャトル
|
814 |
+
スパチュラ,へら
|
815 |
+
レース艇,モーターボート
|
816 |
+
クモの巣
|
817 |
+
紡錘
|
818 |
+
スポーツカー
|
819 |
+
スポットライト
|
820 |
+
ステージ
|
821 |
+
蒸気機関車
|
822 |
+
通り抜けアーチ橋
|
823 |
+
スチールドラム
|
824 |
+
聴診器
|
825 |
+
ストール
|
826 |
+
石垣
|
827 |
+
ストップウォッチ
|
828 |
+
ストーブ
|
829 |
+
ろ過器,ストレーナー
|
830 |
+
路面電車
|
831 |
+
担架,ストレッチャー
|
832 |
+
カウチ,ソファ
|
833 |
+
仏舎利塔
|
834 |
+
潜水艦
|
835 |
+
スーツ
|
836 |
+
日時計,日晷儀,晷針
|
837 |
+
サングラス
|
838 |
+
サングラス
|
839 |
+
日焼け止め
|
840 |
+
吊り橋
|
841 |
+
モップ
|
842 |
+
スウェットシャツ,トレーナー
|
843 |
+
海パン
|
844 |
+
ブランコ
|
845 |
+
スイッチ
|
846 |
+
注射器
|
847 |
+
電気スタンド
|
848 |
+
タンク,戦車
|
849 |
+
テーププレーヤー
|
850 |
+
ティーポット,急須
|
851 |
+
テディベア
|
852 |
+
テレビ
|
853 |
+
テニスボール
|
854 |
+
茅葺屋根
|
855 |
+
劇場のカーテン
|
856 |
+
指ぬき
|
857 |
+
脱穀機
|
858 |
+
玉座
|
859 |
+
瓦屋根
|
860 |
+
トースター
|
861 |
+
タバコ屋
|
862 |
+
便座
|
863 |
+
たいまつ
|
864 |
+
トーテムポール
|
865 |
+
レッカー車
|
866 |
+
玩具屋
|
867 |
+
トラクター
|
868 |
+
トレーラートラック
|
869 |
+
お盆,トレイ
|
870 |
+
トレンチコート
|
871 |
+
三輪車
|
872 |
+
トリマラン,三胴船
|
873 |
+
三脚
|
874 |
+
凱旋門
|
875 |
+
トロリーバス
|
876 |
+
トロンボーン
|
877 |
+
バスタブ
|
878 |
+
回転ドア
|
879 |
+
タイプライターのキーボード
|
880 |
+
傘
|
881 |
+
一輪車
|
882 |
+
アップライトピアノ
|
883 |
+
掃除機
|
884 |
+
花瓶
|
885 |
+
丸天井,円蓋
|
886 |
+
ベルベット
|
887 |
+
自動販売機
|
888 |
+
祭服,礼服
|
889 |
+
高架橋
|
890 |
+
バイオリン
|
891 |
+
バレーボール
|
892 |
+
ワッフルメーカー
|
893 |
+
壁掛け時計
|
894 |
+
財布
|
895 |
+
ワードローブ
|
896 |
+
軍用機
|
897 |
+
シンク,洗面器
|
898 |
+
ワッシャー,洗濯機
|
899 |
+
水筒
|
900 |
+
水差し
|
901 |
+
ウォータータワー,給水塔
|
902 |
+
ウイスキージャグ
|
903 |
+
ホイッスル
|
904 |
+
かつら
|
905 |
+
窓網戸
|
906 |
+
ブラインド
|
907 |
+
ウィンザーネクタイ
|
908 |
+
ワインボトル
|
909 |
+
飛行機の翼
|
910 |
+
中華鍋
|
911 |
+
木製スプーン
|
912 |
+
ウール
|
913 |
+
ワームフェンス
|
914 |
+
難破船
|
915 |
+
帆船
|
916 |
+
ユルト
|
917 |
+
ウェブサイト
|
918 |
+
漫画本
|
919 |
+
クロスワードパズル
|
920 |
+
道路標識
|
921 |
+
信号機
|
922 |
+
ブックカバー
|
923 |
+
メニュー
|
924 |
+
お皿
|
925 |
+
ワカモレ
|
926 |
+
コンソメ
|
927 |
+
ホットポット,火鍋
|
928 |
+
パフェ,トライフル
|
929 |
+
アイスクリーム
|
930 |
+
アイスキャンディー
|
931 |
+
フランスパン
|
932 |
+
ベーグル
|
933 |
+
プレッツェル
|
934 |
+
チーズバーガー
|
935 |
+
ホットドッグ
|
936 |
+
マッシュポテト
|
937 |
+
キャベツ
|
938 |
+
ブロッコリー
|
939 |
+
カリフラワー
|
940 |
+
ズッキーニ
|
941 |
+
そうめんかぼちゃ
|
942 |
+
ドングリかぼちゃ
|
943 |
+
バターナッツかぼちゃ
|
944 |
+
キュウリ
|
945 |
+
アーティチョーク
|
946 |
+
ピーマン
|
947 |
+
カルドン
|
948 |
+
キノコ
|
949 |
+
青リンゴ
|
950 |
+
イチゴ
|
951 |
+
オレンジ,ミカン
|
952 |
+
レモン,檸檬
|
953 |
+
イチジク
|
954 |
+
パイナップル
|
955 |
+
バナナ
|
956 |
+
ジャックフルーツ,パラミツ
|
957 |
+
カスタードアップル
|
958 |
+
ザクロ
|
959 |
+
干し草
|
960 |
+
カルボナーラ
|
961 |
+
チョコレートソース
|
962 |
+
生地
|
963 |
+
ミートローフ
|
964 |
+
ピザ
|
965 |
+
ポットパイ
|
966 |
+
ブリトー
|
967 |
+
赤ワイン
|
968 |
+
エスプレッソ
|
969 |
+
カップ
|
970 |
+
エッグノッグ
|
971 |
+
山
|
972 |
+
泡
|
973 |
+
崖
|
974 |
+
サンゴ礁
|
975 |
+
間欠泉
|
976 |
+
湖畔
|
977 |
+
岬
|
978 |
+
砂州
|
979 |
+
海岸
|
980 |
+
谷
|
981 |
+
火山
|
982 |
+
野球選手
|
983 |
+
婿,新郎
|
984 |
+
スキューバダイバー
|
985 |
+
菜種
|
986 |
+
デイジー,ヒナギク,雛菊
|
987 |
+
パフィオペディラム
|
988 |
+
コーン,トウキビ,トウモロコシ
|
989 |
+
ドングリ
|
990 |
+
ローズヒップ
|
991 |
+
セイヨウトチノキ
|
992 |
+
ホウキタケ
|
993 |
+
ハラタケ
|
994 |
+
シャグマアミガサタケ
|
995 |
+
スッポンタケ
|
996 |
+
ツチグリ
|
997 |
+
マイタケ
|
998 |
+
ヤマドリタケ
|
999 |
+
トウモロコシの穂・芯
|
1000 |
+
ちり紙,トイレットペーパー
|
ja-imagenet-1k-templates.txt
ADDED
@@ -0,0 +1,37 @@
1 |
+
{c}の悪い写真
|
2 |
+
多くの{c}の写真
|
3 |
+
{c}の彫刻
|
4 |
+
見づらい{c}の写真
|
5 |
+
{c}の低解像度写真
|
6 |
+
{c}のレンダリング
|
7 |
+
{c}の落書き
|
8 |
+
{c}のトリミング写真
|
9 |
+
{c}のタトゥー
|
10 |
+
刺繍された{c}
|
11 |
+
{c}の明るい写真
|
12 |
+
きれいな{c}の写真
|
13 |
+
汚れた{c}の写真
|
14 |
+
{c}の暗い写真
|
15 |
+
{c}の絵
|
16 |
+
私の{c}の写真
|
17 |
+
プラスチック製の{c}
|
18 |
+
かっこいい{c}の写真
|
19 |
+
{c}のクローズアップ写真
|
20 |
+
{c}の白黒写真
|
21 |
+
{c}のピクセル写真
|
22 |
+
jpegで加工した{c}の写真
|
23 |
+
{c}のぼやけた写真
|
24 |
+
{c}の写真
|
25 |
+
{c}の良い写真
|
26 |
+
ゲームに登場する{c}
|
27 |
+
折り紙で作った{c}
|
28 |
+
{c}のスケッチ
|
29 |
+
おもちゃの{c}
|
30 |
+
{c}の演出
|
31 |
+
大きな{c}の写真
|
32 |
+
素敵な{c}の写真
|
33 |
+
奇妙な{c}の写真
|
34 |
+
漫画の{c}
|
35 |
+
{c}の芸術
|
36 |
+
{c}のぬいぐるみ
|
37 |
+
小さな{c}の写真
|
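The two text files above give the 1,000 Japanese ImageNet class names (one per line, in class-index order; some entries list synonyms separated by commas) and 37 prompt templates with a {c} placeholder. A minimal sketch of how they can be combined into zero-shot classification prompts follows; the file names match this commit, while taking the first synonym per class (and, later, averaging the text embeddings of the 37 prompts) are common CLIP conventions rather than something this repository prescribes.

import re

# Minimal sketch, assuming both files sit in the working directory.
with open("ja-imagenet-1k-classnames.txt", encoding="utf-8") as f:
    classnames = [line.strip() for line in f if line.strip()]
with open("ja-imagenet-1k-templates.txt", encoding="utf-8") as f:
    templates = [line.strip() for line in f if line.strip()]

def prompts_for(classname: str) -> list[str]:
    # Some entries list synonyms (e.g. "ピッチャー,水差し"); keep the first one.
    first = re.split(r"[,、,]", classname)[0]
    return [t.format(c=first) for t in templates]

print(len(classnames), len(templates))  # expected: 1000 37
print(prompts_for(classnames[0])[:3])   # three prompts for the first class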
model.py
ADDED
@@ -0,0 +1,505 @@
1 |
+
# coding=utf-8
|
2 |
+
|
3 |
+
# Copyright 2024 LY Corporation.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
from __future__ import annotations
|
17 |
+
|
18 |
+
import logging
|
19 |
+
from typing import Optional, Union
|
20 |
+
|
21 |
+
import timm
|
22 |
+
import torch
|
23 |
+
import torch.distributed as dist
|
24 |
+
import torch.distributed.nn
|
25 |
+
import torch.nn as nn
|
26 |
+
import torch.nn.functional as F
|
27 |
+
from timm.models.swin_transformer import SwinTransformer as TimmSwinTransformer
|
28 |
+
from transformers import PreTrainedModel
|
29 |
+
from transformers.utils.logging import get_logger
|
30 |
+
|
31 |
+
from .configuration_clyp import (
|
32 |
+
CLYPTextBackboneConfig,
|
33 |
+
CLYPTextEncoderConfig,
|
34 |
+
CLYPVisionBackboneConfig,
|
35 |
+
CLYPVisionEncoderConfig,
|
36 |
+
)
|
37 |
+
from .model_rinna import RinnaCLIPConfig, RinnaCLIPModel
|
38 |
+
|
39 |
+
DEFAULT_LOGGER = get_logger(__name__)
|
40 |
+
|
41 |
+
|
42 |
+
class VisionEncoder(nn.Module):
|
43 |
+
"""Vision encoder to extract image feateurs.
|
44 |
+
|
45 |
+
Pooler and neck are optional.
|
46 |
+
Instead of defining pooler and neck in VisionEncoder, you can define them in algorithm classes.
|
47 |
+
|
48 |
+
Attributes:
|
49 |
+
backbone (nn.Module): backbone loaded from timm, huggingface or registry.
|
50 |
+
pooler (nn.Module): module to extract image-level features.
|
51 |
+
neck (nn.Module): module to adjust feature dimensions.
|
52 |
+
"""
|
53 |
+
|
54 |
+
def __init__(
|
55 |
+
self,
|
56 |
+
backbone: nn.Module,
|
57 |
+
pooler: Optional[nn.Module] = None,
|
58 |
+
neck: Optional[nn.Module] = None,
|
59 |
+
) -> None:
|
60 |
+
super().__init__()
|
61 |
+
self.backbone = backbone
|
62 |
+
self.pooler = pooler
|
63 |
+
self.neck = neck
|
64 |
+
|
65 |
+
def forward(self, imgs: torch.Tensor):
|
66 |
+
"""A method to extract image features.
|
67 |
+
|
68 |
+
Args:
|
69 |
+
imgs (torch.Tensor): shape=(batch_size, channels, height, width).
|
70 |
+
|
71 |
+
Returns:
|
72 |
+
out (torch.Tensor): the output shape changes depending on pooler, and the following shapes are usually expected.
|
73 |
+
- output only image-level features like CLIP: shape=(batch_size, embed_dim)
|
74 |
+
- output image-level and local patch features like BLIP2: shape=(batch_size, embed_dim, length)
|
75 |
+
"""
|
76 |
+
out = self.backbone(imgs) # Shape=(batch_size, channels, height, width)
|
77 |
+
if self.pooler:
|
78 |
+
out = self.pooler(out)
|
79 |
+
if self.neck:
|
80 |
+
out = self.neck(out)
|
81 |
+
return out
|
82 |
+
|
83 |
+
|
84 |
+
class SwinTransformerPerm(nn.Module):
|
85 |
+
"""Wrapper for SwinTransformer in timm.
|
86 |
+
|
87 |
+
This wrapper changes the output shape to (batch_size, channels, height, width).
|
88 |
+
The original shape of timm SwinTransformer is (batch_size, height, width, channels).
|
89 |
+
"""
|
90 |
+
|
91 |
+
def __init__(self, swin: nn.Module) -> None:
|
92 |
+
super().__init__()
|
93 |
+
self.swin = swin
|
94 |
+
|
95 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
96 |
+
out = self.swin(x)
|
97 |
+
out = out.permute(0, 3, 1, 2)
|
98 |
+
return out
|
99 |
+
|
100 |
+
|
101 |
+
def load_from_timm(
|
102 |
+
config: CLYPVisionBackboneConfig,
|
103 |
+
use_gradient_checkpointing: bool = False,
|
104 |
+
path_weights: Optional[str] = None,
|
105 |
+
logger: logging.Logger = DEFAULT_LOGGER,
|
106 |
+
):
|
107 |
+
"""Create a backbone using a method: timm.create_model.
|
108 |
+
|
109 |
+
Args:
|
110 |
+
config (CLYPVisionBackboneConfig): config fed to timm.create_model.
|
111 |
+
use_gradient_checkpointing (bool): True if use gradient checkpointing.
|
112 |
+
path_weights (str): path to weights for backbone initialization.
|
113 |
+
"""
|
114 |
+
# backbone
|
115 |
+
assert config is not None
|
116 |
+
backbone = timm.create_model(
|
117 |
+
model_name=config.model_name,
|
118 |
+
pretrained=config.pretrained,
|
119 |
+
**config.extra_kwargs,
|
120 |
+
)
|
121 |
+
backbone.reset_classifier(0, "")
|
122 |
+
|
123 |
+
logger.info(
|
124 |
+
f" - load from timm: model_name={config.model_name}, pretrained={config.pretrained}"
|
125 |
+
)
|
126 |
+
|
127 |
+
# gradient checkpointing
|
128 |
+
backbone.set_grad_checkpointing(enable=use_gradient_checkpointing)
|
129 |
+
if use_gradient_checkpointing:
|
130 |
+
logger.info(" - gradient checkpointing is enebled.")
|
131 |
+
|
132 |
+
# init weights
|
133 |
+
if path_weights:
|
134 |
+
state_dict = torch.load(path_weights, map_location="cpu")
|
135 |
+
checks = backbone.load_state_dict(state_dict, strict=False)
|
136 |
+
logger.info(f" - load weights from {path_weights}")
|
137 |
+
logger.info(f" - state dict checks: {checks}")
|
138 |
+
|
139 |
+
# swin
|
140 |
+
if isinstance(backbone, TimmSwinTransformer):
|
141 |
+
backbone = SwinTransformerPerm(backbone)
|
142 |
+
return backbone
|
143 |
+
|
144 |
+
|
145 |
+
def create_vision_encoder(
|
146 |
+
config: CLYPVisionEncoderConfig, logger: logging.Logger = DEFAULT_LOGGER
|
147 |
+
) -> VisionEncoder:
|
148 |
+
assert config.pooler_config.input_type
|
149 |
+
backbone = load_from_timm(config.backbone_config, logger=logger)
|
150 |
+
pooler = CLSTokenPooling(
|
151 |
+
config.pooler_config.input_type, config.pooler_config.return_patch_features
|
152 |
+
)
|
153 |
+
neck = Linear(
|
154 |
+
config.neck_config.in_channels,
|
155 |
+
config.neck_config.out_channels,
|
156 |
+
config.neck_config.bias,
|
157 |
+
)
|
158 |
+
return VisionEncoder(backbone, pooler=pooler, neck=neck)
|
159 |
+
|
160 |
+
|
161 |
+
class TextEncoder(nn.Module):
|
162 |
+
"""Text encoder to extract text features.
|
163 |
+
|
164 |
+
Pooler and neck are optional.
|
165 |
+
Instead of defining pooler and neck in TextEncoder, you can define them in algorithm classes.
|
166 |
+
|
167 |
+
Attributes:
|
168 |
+
backbone (nn.Module): backbone loaded from timm, huggingface or registry.
|
169 |
+
pooler (nn.Module): module to extract image-level features.
|
170 |
+
neck (nn.Module): module to adjust feature dimensions.
|
171 |
+
|
172 |
+
"""
|
173 |
+
|
174 |
+
def __init__(
|
175 |
+
self,
|
176 |
+
backbone: nn.Module,
|
177 |
+
pooler: Optional[nn.Module] = None,
|
178 |
+
neck: Optional[nn.Module] = None,
|
179 |
+
) -> None:
|
180 |
+
super().__init__()
|
181 |
+
self.backbone = backbone
|
182 |
+
self.pooler = pooler
|
183 |
+
self.neck = neck
|
184 |
+
|
185 |
+
def forward(self, inputs: dict) -> torch.Tensor:
|
186 |
+
"""A method to extract text features.
|
187 |
+
|
188 |
+
Args:
|
189 |
+
inputs (dict): basic keys are shown below:
|
190 |
+
- input_ids (torch.Tensor)
|
191 |
+
- attention_mask (Optional[torch.Tensor])
|
192 |
+
- position_ids (Optional[torch.Tensor])
|
193 |
+
- token_type_ids (Optional[torch.Tensor])
|
194 |
+
- output_attentions Optional[bool]
|
195 |
+
- output_hidden_states Optional[bool]
|
196 |
+
|
197 |
+
Returns:
|
198 |
+
out (torch.Tensor): the output shape changes depending on pooler, and the following shapes are usually expected.
|
199 |
+
- output only class token like CLIP: shape=(batch_size, embed_dim)
|
200 |
+
- output all token features like BLIP2: shape=(batch_size, embed_dim, length)
|
201 |
+
"""
|
202 |
+
out = self.backbone(**inputs)
|
203 |
+
if self.pooler:
|
204 |
+
out = self.pooler(out)
|
205 |
+
if self.neck:
|
206 |
+
out = self.neck(out)
|
207 |
+
return out
|
208 |
+
|
209 |
+
|
210 |
+
class TextBackboneModelWrapper(nn.Module):
|
211 |
+
def __init__(self, model: nn.Module) -> None:
|
212 |
+
super().__init__()
|
213 |
+
self.model = model.text_model
|
214 |
+
|
215 |
+
def forward(
|
216 |
+
self,
|
217 |
+
input_ids: Optional[torch.Tensor] = None,
|
218 |
+
attention_mask: Optional[torch.Tensor] = None,
|
219 |
+
position_ids: Optional[torch.Tensor] = None,
|
220 |
+
token_type_ids: Optional[torch.Tensor] = None,
|
221 |
+
) -> torch.Tensor:
|
222 |
+
out = self.model(
|
223 |
+
input_ids=input_ids,
|
224 |
+
attention_mask=attention_mask,
|
225 |
+
position_ids=position_ids,
|
226 |
+
token_type_ids=token_type_ids,
|
227 |
+
)
|
228 |
+
return out
|
229 |
+
|
230 |
+
def set_gradient_checkpointing(self, enabled: bool) -> None:
|
231 |
+
if enabled:
|
232 |
+
self.model.gradient_checkpointing_enable()
|
233 |
+
|
234 |
+
|
235 |
+
def load_from_huggingface(
|
236 |
+
config: CLYPTextBackboneConfig,
|
237 |
+
use_gradient_checkpointing: bool = False,
|
238 |
+
path_weights: Optional[str] = None,
|
239 |
+
logger: logging.Logger = DEFAULT_LOGGER,
|
240 |
+
) -> nn.Module:
|
241 |
+
"""Load a backbone from huggingface.
|
242 |
+
|
243 |
+
Args:
|
244 |
+
config (CLYPTextBackboneConfig): config specifying the huggingface model to load.
|
245 |
+
use_gradient_checkpointing (bool): True if use gradient checkpointing.
|
246 |
+
path_weights (str): path to weights for backbone initialization.
|
247 |
+
"""
|
248 |
+
|
249 |
+
# NOTE:
|
250 |
+
# Initialize Rinna CLIP without pretrained weights here,
|
251 |
+
# because CLYP model loads its whole weights afterward
|
252 |
+
auto_config = RinnaCLIPConfig.from_pretrained(config.model_name)
|
253 |
+
backbone = RinnaCLIPModel(auto_config)
|
254 |
+
|
255 |
+
logger.info(f" - load from huggingface: model_name={config.model_name}")
|
256 |
+
|
257 |
+
# gradient checkpointing
|
258 |
+
if isinstance(backbone, PreTrainedModel):
|
259 |
+
if use_gradient_checkpointing:
|
260 |
+
backbone.gradient_checkpointing_enable()
|
261 |
+
logger.info(" - gradient checkpointing is enabled")
|
262 |
+
else:
|
263 |
+
raise NotImplementedError()
|
264 |
+
|
265 |
+
# init weights
|
266 |
+
if path_weights:
|
267 |
+
raise NotImplementedError()
|
268 |
+
return backbone
|
269 |
+
|
270 |
+
|
271 |
+
def create_text_encoder(
|
272 |
+
config: CLYPTextEncoderConfig, logger: logging.Logger = DEFAULT_LOGGER
|
273 |
+
) -> TextEncoder:
|
274 |
+
assert config.pooler_config.input_type
|
275 |
+
backbone = TextBackboneModelWrapper(
|
276 |
+
load_from_huggingface(config.backbone_config, logger=logger)
|
277 |
+
)
|
278 |
+
pooler = CLSTokenPooling(
|
279 |
+
config.pooler_config.input_type, config.pooler_config.return_patch_features
|
280 |
+
)
|
281 |
+
neck = Linear(
|
282 |
+
config.neck_config.in_channels,
|
283 |
+
config.neck_config.out_channels,
|
284 |
+
bias=config.neck_config.bias,
|
285 |
+
)
|
286 |
+
return TextEncoder(backbone, pooler=pooler, neck=neck)
|
287 |
+
|
288 |
+
|
289 |
+
class Linear(nn.Module):
|
290 |
+
"""Linear layer."""
|
291 |
+
|
292 |
+
def __init__(self, in_channels: int, out_channels: int, bias: bool) -> None:
|
293 |
+
"""
|
294 |
+
Args:
|
295 |
+
in_channels (int): input feature dimension.
|
296 |
+
out_channels (int): output feature dimension.
|
297 |
+
bias (bool): True if use bias in nn.Linear.
|
298 |
+
"""
|
299 |
+
super().__init__()
|
300 |
+
self.linear = nn.Linear(in_channels, out_channels, bias=bias)
|
301 |
+
|
302 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
303 |
+
"""
|
304 |
+
Args:
|
305 |
+
x (torch.Tensor): shape=(batch_size, ..., in_channels).
|
306 |
+
|
307 |
+
Returns:
|
308 |
+
out (torch.Tensor): shape=(batch_size, ..., out_channels).
|
309 |
+
"""
|
310 |
+
out = self.linear(x)
|
311 |
+
return out
|
312 |
+
|
313 |
+
|
314 |
+
class CLSTokenPooling(nn.Module):
|
315 |
+
"""A module to extract class token."""
|
316 |
+
|
317 |
+
def __init__(self, input_type: str, return_patch_features: bool) -> None:
|
318 |
+
"""
|
319 |
+
Args:
|
320 |
+
input_type (str): timm or huggingface.
|
321 |
+
- If input_type is timm, x[:, 0] is extracted as a class token.
|
322 |
+
- If input_type is huggingface, x.last_hidden_state[:,0] is extracted as a class token.
|
323 |
+
return_patch_features (bool): True if output local features.
|
324 |
+
"""
|
325 |
+
super().__init__()
|
326 |
+
assert input_type in ["timm", "huggingface"]
|
327 |
+
self.input_type = input_type
|
328 |
+
self.return_patch_features = return_patch_features
|
329 |
+
|
330 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
331 |
+
"""
|
332 |
+
Args:
|
333 |
+
x (torch.Tensor): shape=(batch_size, length, dim).
|
334 |
+
|
335 |
+
Returns:
|
336 |
+
out (torch.Tensor): shape=(batch_size, dim).
|
337 |
+
"""
|
338 |
+
# tensor: shape=(batch_size, length, dim)
|
339 |
+
if self.input_type == "timm":
|
340 |
+
assert x.ndim == 3, "CLSTokenPooling: dimension of input tensor must be 3."
|
341 |
+
if self.return_patch_features:
|
342 |
+
return x
|
343 |
+
else:
|
344 |
+
return x[:, 0]
|
345 |
+
|
346 |
+
# huggingface
|
347 |
+
elif self.input_type == "huggingface":
|
348 |
+
out = x.last_hidden_state
|
349 |
+
if self.return_patch_features:
|
350 |
+
return out
|
351 |
+
else:
|
352 |
+
return out[:, 0]
|
353 |
+
|
354 |
+
|
355 |
+
class InfoNCELoss(nn.Module):
|
356 |
+
def __init__(
|
357 |
+
self,
|
358 |
+
learn_temperature: bool,
|
359 |
+
init_temperature: float,
|
360 |
+
max_temperature: Optional[float] = None,
|
361 |
+
min_temperature: Optional[float] = None,
|
362 |
+
label_smoothing: float = 0.0,
|
363 |
+
gather_with_grad: bool = False,
|
364 |
+
):
|
365 |
+
super().__init__()
|
366 |
+
self.label_smoothing = label_smoothing
|
367 |
+
self.gather_with_grad = gather_with_grad
|
368 |
+
|
369 |
+
# set temperature
|
370 |
+
self.learn_temperature = learn_temperature
|
371 |
+
self.temperature = torch.ones([]) * init_temperature
|
372 |
+
if self.learn_temperature:
|
373 |
+
self.temperature = nn.Parameter(self.temperature)
|
374 |
+
self.max_temperature = max_temperature
|
375 |
+
self.min_temperature = min_temperature
|
376 |
+
|
377 |
+
# whether clip temperature or not
|
378 |
+
self.require_temperature_clipping = self.learn_temperature and (
|
379 |
+
self.max_temperature or self.min_temperature
|
380 |
+
)
|
381 |
+
|
382 |
+
def clip_temperature(self):
|
383 |
+
if self.require_temperature_clipping:
|
384 |
+
self.temperature.data = torch.clamp(
|
385 |
+
self.temperature, self.min_temperature, self.max_temperature
|
386 |
+
)
|
387 |
+
|
388 |
+
def forward(
|
389 |
+
self,
|
390 |
+
image_feats: torch.Tensor,
|
391 |
+
text_feats: torch.Tensor,
|
392 |
+
return_similarity: bool = False,
|
393 |
+
) -> Union[torch.Tensor, tuple[torch.Tensor]]:
|
394 |
+
# gather image and text features
|
395 |
+
image_feats_all = concat_all_gather(
|
396 |
+
image_feats, with_grad=self.gather_with_grad
|
397 |
+
)
|
398 |
+
text_feats_all = concat_all_gather(text_feats, with_grad=self.gather_with_grad)
|
399 |
+
|
400 |
+
# compute cosine similarity
|
401 |
+
sim_i2t = image_to_text_similarity(
|
402 |
+
image_feats=image_feats,
|
403 |
+
text_feats=text_feats_all,
|
404 |
+
)
|
405 |
+
sim_t2i = text_to_image_similarity(
|
406 |
+
text_feats=text_feats,
|
407 |
+
image_feats=image_feats_all,
|
408 |
+
)
|
409 |
+
|
410 |
+
# logits, scaled cosine similarity
|
411 |
+
logits_i2t = sim_i2t / self.temperature
|
412 |
+
logits_t2i = sim_t2i / self.temperature
|
413 |
+
|
414 |
+
# obtain targets
|
415 |
+
rank = dist.get_rank()
|
416 |
+
batch_size = image_feats.size(0)
|
417 |
+
targets = torch.arange(batch_size) + batch_size * rank
|
418 |
+
targets = targets.to(dtype=torch.long, device=image_feats.device)
|
419 |
+
|
420 |
+
# calculate loss
|
421 |
+
loss_i2t = F.cross_entropy(
|
422 |
+
logits_i2t, targets, label_smoothing=self.label_smoothing
|
423 |
+
)
|
424 |
+
loss_t2i = F.cross_entropy(
|
425 |
+
logits_t2i, targets, label_smoothing=self.label_smoothing
|
426 |
+
)
|
427 |
+
loss = (loss_i2t + loss_t2i) / 2.0
|
428 |
+
|
429 |
+
if not return_similarity:
|
430 |
+
return loss
|
431 |
+
else:
|
432 |
+
return loss, sim_i2t, sim_t2i
|
433 |
+
|
434 |
+
|
435 |
+
def image_to_text_similarity(
|
436 |
+
image_feats: torch.Tensor, text_feats: torch.Tensor
|
437 |
+
) -> torch.Tensor:
|
438 |
+
"""
|
439 |
+
Args:
|
440 |
+
image_feats (torch.Tensor): shape=(num_imgs, embed_dim) or (num_imgs, num_query_tokens, embed_dim).
|
441 |
+
text_feats (torch.Tensor): shape=(num_texts, embed_dim).
|
442 |
+
|
443 |
+
Returns:
|
444 |
+
sim_i2t (torch.Tensor): shape=(num_imgs, num_texts).
|
445 |
+
"""
|
446 |
+
assert image_feats.ndim in [2, 3]
|
447 |
+
assert text_feats.ndim == 2
|
448 |
+
|
449 |
+
# normalize features
|
450 |
+
image_feats = F.normalize(image_feats, dim=-1)
|
451 |
+
text_feats = F.normalize(text_feats, dim=-1)
|
452 |
+
|
453 |
+
if image_feats.ndim == 2:
|
454 |
+
sim_i2t = image_feats @ text_feats.T
|
455 |
+
else:
|
456 |
+
# a query token with maximum cosine similarity is selected
|
457 |
+
sim_i2t = torch.matmul(
|
458 |
+
image_feats.unsqueeze(1), text_feats.unsqueeze(0).unsqueeze(-1)
|
459 |
+
).squeeze() # shape=(num_imgs, num_texts, num_query_tokens)
|
460 |
+
sim_i2t, _ = sim_i2t.max(dim=-1) # shape=(num_imgs, num_texts)
|
461 |
+
return sim_i2t
|
462 |
+
|
463 |
+
|
464 |
+
def text_to_image_similarity(text_feats: torch.Tensor, image_feats: torch.Tensor):
|
465 |
+
"""
|
466 |
+
Args:
|
467 |
+
text_feats (torch.Tensor): shape=(num_texts, embed_dim).
|
468 |
+
image_feats (torch.Tensor): shape=(num_imgs, embed_dim) or (num_imgs, num_query_tokens, embed_dim).
|
469 |
+
|
470 |
+
Returns:
|
471 |
+
sim_t2i (torch.Tensor): shape=(num_texts, num_imgs).
|
472 |
+
"""
|
473 |
+
assert image_feats.ndim in [2, 3]
|
474 |
+
assert text_feats.ndim == 2
|
475 |
+
|
476 |
+
# normalize features
|
477 |
+
image_feats = F.normalize(image_feats, dim=-1)
|
478 |
+
text_feats = F.normalize(text_feats, dim=-1)
|
479 |
+
|
480 |
+
if image_feats.ndim == 2:
|
481 |
+
sim_t2i = text_feats @ image_feats.T
|
482 |
+
else:
|
483 |
+
# a query token with maximum cosine similarity is selected
|
484 |
+
sim_t2i = torch.matmul(
|
485 |
+
text_feats.unsqueeze(1).unsqueeze(1),
|
486 |
+
image_feats.permute(0, 2, 1).unsqueeze(0),
|
487 |
+
).squeeze()
|
488 |
+
sim_t2i, _ = sim_t2i.max(dim=-1)
|
489 |
+
return sim_t2i
|
490 |
+
|
491 |
+
|
492 |
+
def concat_all_gather(tensor: torch.Tensor, with_grad: bool):
|
493 |
+
"""
|
494 |
+
Performs all_gather operation on the provided tensors.
|
495 |
+
*** Warning ***: torch.distributed.all_gather has no gradient.
|
496 |
+
|
497 |
+
Another implementation: https://github.com/salesforce/LAVIS/blob/main/lavis/models/base_model.py#L202-L237
|
498 |
+
"""
|
499 |
+
if with_grad:
|
500 |
+
output = torch.cat(torch.distributed.nn.all_gather(tensor), dim=0)
|
501 |
+
else:
|
502 |
+
tensors_gather = [torch.ones_like(tensor) for _ in range(dist.get_world_size())]
|
503 |
+
dist.all_gather(tensors_gather, tensor, async_op=False)
|
504 |
+
output = torch.cat(tensors_gather, dim=0)
|
505 |
+
return output
|
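Because model.py uses package-relative imports (from .configuration_clyp import ...), it cannot be imported standalone; the snippet below is therefore a self-contained restatement of what image_to_text_similarity, text_to_image_similarity and InfoNCELoss compute for the plain (batch, embed_dim) case, using random tensors so it runs without a checkpoint or a torch.distributed process group. The temperature value is illustrative only.

import torch
import torch.nn.functional as F

torch.manual_seed(0)
image_feats = torch.randn(4, 512)  # stand-in for VisionEncoder outputs
text_feats = torch.randn(4, 512)   # stand-in for TextEncoder outputs

# L2-normalize, then cosine similarity (the 2-D branch of the helpers above).
image_feats = F.normalize(image_feats, dim=-1)
text_feats = F.normalize(text_feats, dim=-1)
sim_i2t = image_feats @ text_feats.T  # shape=(num_imgs, num_texts)
sim_t2i = text_feats @ image_feats.T  # shape=(num_texts, num_imgs)

# Temperature-scaled logits and symmetric cross-entropy, as in InfoNCELoss.
temperature = 0.01                    # placeholder; the real value comes from the config
logits_i2t = sim_i2t / temperature
logits_t2i = sim_t2i / temperature
targets = torch.arange(image_feats.size(0))  # diagonal targets (rank offset omitted)
loss = (F.cross_entropy(logits_i2t, targets) + F.cross_entropy(logits_t2i, targets)) / 2.0
print(loss.item())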
model.safetensors
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f0997d294a0723358c5622fc51caa0b8589de2d36295b1ff40cfa11f9c9f8e9c
|
3 |
+
size 786788708
|
model_rinna.py
ADDED
@@ -0,0 +1,400 @@
1 |
+
# coding=utf-8
|
2 |
+
|
3 |
+
# Copyright 2024 LY Corporation.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
|
17 |
+
# Almost copied from https://github.com/rinnakk/japanese-clip/blob/master/src/japanese_clip/clip/modeling_clip.py
|
18 |
+
# This code is distributed under the Apache License 2.0.
|
19 |
+
from __future__ import annotations
|
20 |
+
|
21 |
+
import copy
|
22 |
+
from typing import Optional
|
23 |
+
|
24 |
+
import torch
|
25 |
+
import torch.distributed.nn
|
26 |
+
import torch.nn as nn
|
27 |
+
from transformers import AutoConfig, AutoModel, PreTrainedModel
|
28 |
+
from transformers.configuration_utils import PretrainedConfig
|
29 |
+
from transformers.models.clip import (
|
30 |
+
CLIPVisionConfig,
|
31 |
+
CLIPVisionModel,
|
32 |
+
)
|
33 |
+
from transformers.models.clip.modeling_clip import CLIPOutput
|
34 |
+
from transformers.utils import logging
|
35 |
+
|
36 |
+
logger = logging.get_logger(__name__)
|
37 |
+
|
38 |
+
|
39 |
+
# Copied from transformers.models.clip.modeling_clip.contrastive_loss
|
40 |
+
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
|
41 |
+
return nn.functional.cross_entropy(
|
42 |
+
logits, torch.arange(len(logits), device=logits.device)
|
43 |
+
)
|
44 |
+
|
45 |
+
|
46 |
+
# Copied from transformers.models.clip.modeling_clip.clip_loss
|
47 |
+
def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
|
48 |
+
caption_loss = contrastive_loss(similarity)
|
49 |
+
image_loss = contrastive_loss(similarity.T)
|
50 |
+
return (caption_loss + image_loss) / 2.0
|
51 |
+
|
52 |
+
|
53 |
+
class RinnaCLIPConfig(PretrainedConfig):
|
54 |
+
model_type = "clip"
|
55 |
+
is_composition = True
|
56 |
+
|
57 |
+
def __init__(self, projection_dim=512, logit_scale_init_value=2.6592, **kwargs):
|
58 |
+
super().__init__(**kwargs)
|
59 |
+
|
60 |
+
if "vision_config" not in kwargs:
|
61 |
+
raise ValueError("`vision_config` can not be `None`.")
|
62 |
+
|
63 |
+
if "text_config" not in kwargs:
|
64 |
+
raise ValueError("`text_config` can not be `None`.")
|
65 |
+
|
66 |
+
vision_config = kwargs.pop("vision_config")
|
67 |
+
text_config = kwargs.pop("text_config")
|
68 |
+
|
69 |
+
vision_model_type = vision_config.pop("model_type")
|
70 |
+
text_model_type = text_config.pop("model_type")
|
71 |
+
|
72 |
+
if vision_model_type == "clip":
|
73 |
+
self.vision_config = AutoConfig.for_model(
|
74 |
+
vision_model_type, **vision_config
|
75 |
+
).vision_config
|
76 |
+
elif vision_model_type == "clip_vision_model":
|
77 |
+
self.vision_config = CLIPVisionConfig(**vision_config)
|
78 |
+
else:
|
79 |
+
self.vision_config = AutoConfig.for_model(
|
80 |
+
vision_model_type, **vision_config
|
81 |
+
)
|
82 |
+
|
83 |
+
self.text_config = AutoConfig.for_model(text_model_type, **text_config)
|
84 |
+
|
85 |
+
self.projection_dim = projection_dim
|
86 |
+
self.logit_scale_init_value = logit_scale_init_value
|
87 |
+
|
88 |
+
@classmethod
|
89 |
+
def from_vision_text_configs(
|
90 |
+
cls, vision_config: PretrainedConfig, text_config: PretrainedConfig, **kwargs
|
91 |
+
):
|
92 |
+
r"""
|
93 |
+
Instantiate a [`VisionTextDualEncoderConfig`] (or a derived class) from text model configuration and vision
|
94 |
+
model configuration.
|
95 |
+
|
96 |
+
Returns:
|
97 |
+
[`VisionTextDualEncoderConfig`]: An instance of a configuration object
|
98 |
+
"""
|
99 |
+
|
100 |
+
return cls(
|
101 |
+
vision_config=vision_config.to_dict(),
|
102 |
+
text_config=text_config.to_dict(),
|
103 |
+
**kwargs,
|
104 |
+
)
|
105 |
+
|
106 |
+
def to_dict(self):
|
107 |
+
"""
|
108 |
+
Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
|
109 |
+
|
110 |
+
Returns:
|
111 |
+
`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
|
112 |
+
"""
|
113 |
+
output = copy.deepcopy(self.__dict__)
|
114 |
+
output["vision_config"] = self.vision_config.to_dict()
|
115 |
+
output["text_config"] = self.text_config.to_dict()
|
116 |
+
output["model_type"] = self.__class__.model_type
|
117 |
+
return output
|
118 |
+
|
119 |
+
|
120 |
+
class RinnaCLIPModel(PreTrainedModel):
|
121 |
+
config_class = RinnaCLIPConfig
|
122 |
+
base_model_prefix = "clip"
|
123 |
+
|
124 |
+
def __init__(
|
125 |
+
self,
|
126 |
+
config: Optional[RinnaCLIPConfig] = None,
|
127 |
+
vision_model: Optional[PreTrainedModel] = None,
|
128 |
+
text_model: Optional[PreTrainedModel] = None,
|
129 |
+
):
|
130 |
+
if config is None and (vision_model is None or text_model is None):
|
131 |
+
raise ValueError(
|
132 |
+
"Either a configuration or an vision and a text model has to be provided"
|
133 |
+
)
|
134 |
+
|
135 |
+
if config is None:
|
136 |
+
config = RinnaCLIPConfig.from_vision_text_configs(
|
137 |
+
vision_model.config,
|
138 |
+
text_model.config, # type: ignore[union-attr]
|
139 |
+
)
|
140 |
+
else:
|
141 |
+
if not isinstance(config, self.config_class):
|
142 |
+
raise ValueError(
|
143 |
+
f"config: {config} has to be of type {self.config_class}"
|
144 |
+
)
|
145 |
+
|
146 |
+
# initialize with config
|
147 |
+
super().__init__(config)
|
148 |
+
|
149 |
+
if vision_model is None:
|
150 |
+
if isinstance(config.vision_config, CLIPVisionConfig):
|
151 |
+
vision_model = CLIPVisionModel(
|
152 |
+
config.vision_config, add_pooling_layer=False
|
153 |
+
)
|
154 |
+
else:
|
155 |
+
vision_model = AutoModel.from_config(
|
156 |
+
config.vision_config, add_pooling_layer=False
|
157 |
+
)
|
158 |
+
|
159 |
+
if text_model is None:
|
160 |
+
text_model = AutoModel.from_config(
|
161 |
+
config.text_config, add_pooling_layer=False
|
162 |
+
)
|
163 |
+
|
164 |
+
self.vision_model = vision_model
|
165 |
+
self.text_model = text_model
|
166 |
+
|
167 |
+
# make sure that the individual model's config refers to the shared config
|
168 |
+
# so that the updates to the config will be synced
|
169 |
+
self.vision_model.config = self.config.vision_config
|
170 |
+
self.text_model.config = self.config.text_config
|
171 |
+
|
172 |
+
self.vision_embed_dim = config.vision_config.hidden_size
|
173 |
+
self.text_embed_dim = config.text_config.hidden_size
|
174 |
+
self.projection_dim = config.projection_dim
|
175 |
+
|
176 |
+
self.visual_projection = nn.Linear(
|
177 |
+
self.vision_embed_dim, self.projection_dim, bias=False
|
178 |
+
)
|
179 |
+
self.text_projection = nn.Linear(
|
180 |
+
self.text_embed_dim, self.projection_dim, bias=False
|
181 |
+
)
|
182 |
+
self.logit_scale = nn.Parameter(
|
183 |
+
torch.ones([]) * self.config.logit_scale_init_value
|
184 |
+
)
|
185 |
+
|
186 |
+
def get_text_features(
|
187 |
+
self,
|
188 |
+
input_ids=None,
|
189 |
+
attention_mask=None,
|
190 |
+
position_ids=None,
|
191 |
+
token_type_ids=None,
|
192 |
+
output_attentions=None,
|
193 |
+
output_hidden_states=None,
|
194 |
+
return_dict=None,
|
195 |
+
out=False,
|
196 |
+
):
|
197 |
+
text_outputs = self.text_model(
|
198 |
+
input_ids=input_ids,
|
199 |
+
attention_mask=attention_mask,
|
200 |
+
position_ids=position_ids,
|
201 |
+
token_type_ids=token_type_ids,
|
202 |
+
output_attentions=output_attentions,
|
203 |
+
output_hidden_states=output_hidden_states,
|
204 |
+
return_dict=return_dict,
|
205 |
+
)
|
206 |
+
pooled_output = text_outputs.last_hidden_state[:, 0, :]
|
207 |
+
text_features = self.text_projection(pooled_output)
|
208 |
+
if out:
|
209 |
+
return text_features, text_outputs
|
210 |
+
return text_features
|
211 |
+
|
212 |
+
def get_image_features(
|
213 |
+
self,
|
214 |
+
pixel_values=None,
|
215 |
+
output_attentions=None,
|
216 |
+
output_hidden_states=None,
|
217 |
+
return_dict=None,
|
218 |
+
):
|
219 |
+
vision_outputs = self.vision_model(
|
220 |
+
pixel_values=pixel_values,
|
221 |
+
output_attentions=output_attentions,
|
222 |
+
output_hidden_states=output_hidden_states,
|
223 |
+
return_dict=return_dict,
|
224 |
+
)
|
225 |
+
|
226 |
+
pooled_output = vision_outputs.last_hidden_state[:, 0, :]
|
227 |
+
image_features = self.visual_projection(pooled_output)
|
228 |
+
|
229 |
+
return image_features
|
230 |
+
|
231 |
+
def forward(
|
232 |
+
self,
|
233 |
+
input_ids=None,
|
234 |
+
pixel_values=None,
|
235 |
+
attention_mask=None,
|
236 |
+
position_ids=None,
|
237 |
+
return_loss=None,
|
238 |
+
token_type_ids=None,
|
239 |
+
output_attentions=None,
|
240 |
+
output_hidden_states=None,
|
241 |
+
return_dict=None,
|
242 |
+
):
|
243 |
+
return_dict = (
|
244 |
+
return_dict if return_dict is not None else self.config.return_dict
|
245 |
+
)
|
246 |
+
|
247 |
+
vision_outputs = self.vision_model(
|
248 |
+
pixel_values=pixel_values,
|
249 |
+
output_attentions=output_attentions,
|
250 |
+
output_hidden_states=output_hidden_states,
|
251 |
+
return_dict=return_dict,
|
252 |
+
)
|
253 |
+
|
254 |
+
text_outputs = self.text_model(
|
255 |
+
input_ids=input_ids,
|
256 |
+
attention_mask=attention_mask,
|
257 |
+
token_type_ids=token_type_ids,
|
258 |
+
position_ids=position_ids,
|
259 |
+
output_attentions=output_attentions,
|
260 |
+
output_hidden_states=output_hidden_states,
|
261 |
+
return_dict=return_dict,
|
262 |
+
)
|
263 |
+
image_embeds = vision_outputs.last_hidden_state[:, 0, :]
|
264 |
+
image_embeds = self.visual_projection(image_embeds)
|
265 |
+
|
266 |
+
text_embeds = text_outputs.last_hidden_state[:, 0, :]
|
267 |
+
text_embeds = self.text_projection(text_embeds)
|
268 |
+
|
269 |
+
# normalized features
|
270 |
+
image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
|
271 |
+
text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
|
272 |
+
|
273 |
+
# cosine similarity as logits
|
274 |
+
logit_scale = self.logit_scale.exp()
|
275 |
+
# logit_scale = self.logit_scale
|
276 |
+
logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
|
277 |
+
logits_per_image = logits_per_text.T
|
278 |
+
|
279 |
+
loss = None
|
280 |
+
if return_loss:
|
281 |
+
loss = clip_loss(logits_per_text)
|
282 |
+
|
283 |
+
if not return_dict:
|
284 |
+
output = (
|
285 |
+
logits_per_image,
|
286 |
+
logits_per_text,
|
287 |
+
text_embeds,
|
288 |
+
image_embeds,
|
289 |
+
text_outputs,
|
290 |
+
vision_outputs,
|
291 |
+
)
|
292 |
+
return ((loss,) + output) if loss is not None else output
|
293 |
+
|
294 |
+
return CLIPOutput(
|
295 |
+
loss=loss,
|
296 |
+
logits_per_image=logits_per_image,
|
297 |
+
logits_per_text=logits_per_text,
|
298 |
+
text_embeds=text_embeds,
|
299 |
+
image_embeds=image_embeds,
|
300 |
+
text_model_output=text_outputs,
|
301 |
+
vision_model_output=vision_outputs,
|
302 |
+
)
|
303 |
+
|
304 |
+
@classmethod
|
305 |
+
def from_pretrained(cls, *args, **kwargs):
|
306 |
+
# At the moment fast initialization is not supported
|
307 |
+
# for composite models
|
308 |
+
kwargs["_fast_init"] = False
|
309 |
+
return super().from_pretrained(*args, **kwargs)
|
310 |
+
|
311 |
+
@classmethod
|
312 |
+
def from_vision_text_pretrained(
|
313 |
+
cls,
|
314 |
+
vision_model_name_or_path: Optional[str] = None,
|
315 |
+
text_model_name_or_path: Optional[str] = None,
|
316 |
+
*model_args,
|
317 |
+
**kwargs,
|
318 |
+
) -> PreTrainedModel:
|
319 |
+
kwargs_vision = {
|
320 |
+
argument[len("vision_") :]: value
|
321 |
+
for argument, value in kwargs.items()
|
322 |
+
if argument.startswith("vision_")
|
323 |
+
}
|
324 |
+
|
325 |
+
kwargs_text = {
|
326 |
+
argument[len("text_") :]: value
|
327 |
+
for argument, value in kwargs.items()
|
328 |
+
if argument.startswith("text_")
|
329 |
+
}
|
330 |
+
|
331 |
+
# remove vision, text kwargs from kwargs
|
332 |
+
for key in kwargs_vision.keys():
|
333 |
+
del kwargs["vision_" + key]
|
334 |
+
for key in kwargs_text.keys():
|
335 |
+
del kwargs["text_" + key]
|
336 |
+
|
337 |
+
# Load and initialize the vision and text model
|
338 |
+
vision_model = kwargs_vision.pop("model", None)
|
339 |
+
if vision_model is None:
|
340 |
+
if vision_model_name_or_path is None:
|
341 |
+
raise ValueError(
|
342 |
+
"If `vision_model` is not defined as an argument, a `vision_model_name_or_path` has to be defined"
|
343 |
+
)
|
344 |
+
|
345 |
+
if "config" not in kwargs_vision:
|
346 |
+
vision_config = AutoConfig.from_pretrained(vision_model_name_or_path)
|
347 |
+
|
348 |
+
if vision_config.model_type == "clip":
|
349 |
+
kwargs_vision["config"] = vision_config.vision_config
|
350 |
+
vision_model = CLIPVisionModel.from_pretrained(
|
351 |
+
vision_model_name_or_path,
|
352 |
+
add_pooling_layer=False,
|
353 |
+
*model_args,
|
354 |
+
**kwargs_vision,
|
355 |
+
)
|
356 |
+
# TODO: Should we use the pre-trained projection as well ?
|
357 |
+
else:
|
358 |
+
kwargs_vision["config"] = vision_config
|
359 |
+
vision_model = AutoModel.from_pretrained(
|
360 |
+
vision_model_name_or_path,
|
361 |
+
add_pooling_layer=False,
|
362 |
+
*model_args,
|
363 |
+
**kwargs_vision,
|
364 |
+
)
|
365 |
+
|
366 |
+
text_model = kwargs_text.pop("model", None)
|
367 |
+
if text_model is None:
|
368 |
+
if text_model_name_or_path is None:
|
369 |
+
raise ValueError(
|
370 |
+
"If `text_model` is not defined as an argument, a `text_model_name_or_path` has to be defined"
|
371 |
+
)
|
372 |
+
|
373 |
+
if "config" not in kwargs_text:
|
374 |
+
text_config = AutoConfig.from_pretrained(text_model_name_or_path)
|
375 |
+
kwargs_text["config"] = text_config
|
376 |
+
|
377 |
+
text_model = AutoModel.from_pretrained(
|
378 |
+
text_model_name_or_path,
|
379 |
+
add_pooling_layer=False,
|
380 |
+
*model_args,
|
381 |
+
**kwargs_text,
|
382 |
+
)
|
383 |
+
|
384 |
+
# instantiate config with corresponding kwargs
|
385 |
+
config = RinnaCLIPConfig.from_vision_text_configs(
|
386 |
+
vision_model.config, text_model.config, **kwargs
|
387 |
+
)
|
388 |
+
|
389 |
+
# init model
|
390 |
+
model = cls(config=config, vision_model=vision_model, text_model=text_model)
|
391 |
+
|
392 |
+
# the projection layers are always newly initialized when loading the model
|
393 |
+
# using pre-trained vision and text model.
|
394 |
+
# logger.warning(
|
395 |
+
# "The projection layer and logit scale weights `['visual_projection.weight', 'text_projection.weight', 'logit_scale']` "
|
396 |
+
# "are newly initialized. You should probably TRAIN this model on a down-stream task "
|
397 |
+
# "to be able to use it for predictions and inference."
|
398 |
+
# )
|
399 |
+
|
400 |
+
return model
|
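A small point of comparison with model.py: RinnaCLIPModel parameterizes the softmax temperature as a learnable log-scale (logits = similarity * logit_scale.exp()), whereas CLYP's InfoNCELoss divides the similarity by a temperature directly. The quick check below relates the default logit_scale_init_value to the familiar CLIP temperature of roughly 0.07.

import math

# RinnaCLIPModel: logits = cosine_similarity * exp(logit_scale)
# InfoNCELoss:    logits = cosine_similarity / temperature
scale = math.exp(2.6592)   # default logit_scale_init_value above
print(scale)               # ~14.3
print(1.0 / scale)         # ~0.07, the equivalent temperature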
modeling_clyp.py
ADDED
@@ -0,0 +1,160 @@
1 |
+
# coding=utf-8
|
2 |
+
|
3 |
+
# Copyright 2024 LY Corporation.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
from __future__ import annotations
|
17 |
+
|
18 |
+
from dataclasses import dataclass
|
19 |
+
from typing import Any, Optional
|
20 |
+
|
21 |
+
import torch
|
22 |
+
import torch.nn.functional as F
|
23 |
+
from transformers import PreTrainedModel
|
24 |
+
from transformers.models.clip.modeling_clip import CLIPOutput
|
25 |
+
|
26 |
+
from .configuration_clyp import CLYPConfig, CLYPLossConfig
|
27 |
+
from .model import InfoNCELoss, create_text_encoder, create_vision_encoder
|
28 |
+
from .model_rinna import RinnaCLIPModel # noqa
|
29 |
+
|
30 |
+
|
31 |
+
@dataclass
|
32 |
+
class CLYPOutput(CLIPOutput):
|
33 |
+
...
|
34 |
+
|
35 |
+
|
36 |
+
class CLYPPreTrainedModel(PreTrainedModel):
|
37 |
+
config_class = CLYPConfig
|
38 |
+
|
39 |
+
def __init__(self, *args, **kwargs):
|
40 |
+
super().__init__(*args, **kwargs)
|
41 |
+
|
42 |
+
def _init_weights(self, module: Any) -> None:
|
43 |
+
pass
|
44 |
+
|
45 |
+
|
46 |
+
class CLYPModel(CLYPPreTrainedModel):
|
47 |
+
def __init__(self, config: CLYPConfig):
|
48 |
+
super().__init__(config)
|
49 |
+
self.vision_encoder = create_vision_encoder(config.vision_encoder_config)
|
50 |
+
self.text_encoder = create_text_encoder(config.text_encoder_config)
|
51 |
+
self.initialize_clip(
|
52 |
+
learn_temperature=config.learn_temperature,
|
53 |
+
temperature_init=config.temperature_init,
|
54 |
+
temperature_min=config.temperature_min,
|
55 |
+
temperature_max=config.temperature_max,
|
56 |
+
itc_loss_config=config.itc_loss_config,
|
57 |
+
)
|
58 |
+
|
59 |
+
def initialize_clip(
|
60 |
+
self,
|
61 |
+
learn_temperature: Optional[bool] = None,
|
62 |
+
temperature_init: Optional[float] = None,
|
63 |
+
temperature_min: Optional[float] = None,
|
64 |
+
temperature_max: Optional[float] = None,
|
65 |
+
itc_loss_config: Optional[CLYPLossConfig] = None,
|
66 |
+
) -> None:
|
67 |
+
# create contrastive loss function
|
68 |
+
if itc_loss_config:
|
69 |
+
raise NotImplementedError
|
70 |
+
else:
|
71 |
+
assert learn_temperature is not None
|
72 |
+
assert temperature_init is not None
|
73 |
+
self.itc_loss_fn = InfoNCELoss(
|
74 |
+
learn_temperature=learn_temperature,
|
75 |
+
init_temperature=temperature_init,
|
76 |
+
max_temperature=temperature_max,
|
77 |
+
min_temperature=temperature_min,
|
78 |
+
gather_with_grad=True,
|
79 |
+
)
|
80 |
+
|
81 |
+
def forward(
|
82 |
+
self,
|
83 |
+
input_ids: Optional[torch.LongTensor] = None,
|
84 |
+
pixel_values: Optional[torch.FloatTensor] = None,
|
85 |
+
attention_mask: Optional[torch.Tensor] = None,
|
86 |
+
position_ids: Optional[torch.LongTensor] = None,
|
87 |
+
return_loss: Optional[bool] = None,
|
88 |
+
output_attentions: Optional[bool] = None,
|
89 |
+
output_hidden_states: Optional[bool] = None,
|
90 |
+
return_dict: Optional[bool] = None,
|
91 |
+
) -> tuple | CLYPOutput:
|
92 |
+
image_feats = self.vision_encoder(pixel_values)
|
93 |
+
text_feats = self.text_encoder(
|
94 |
+
{
|
95 |
+
"input_ids": input_ids,
|
96 |
+
"attention_mask": attention_mask,
|
97 |
+
"position_ids": position_ids,
|
98 |
+
}
|
99 |
+
)
|
100 |
+
|
101 |
+
loss = None
|
102 |
+
if return_loss:
|
103 |
+
loss = self.itc_loss_fn(image_feats, text_feats)
|
104 |
+
|
105 |
+
image_embeds = F.normalize(image_feats, dim=-1)
|
106 |
+
text_embeds = F.normalize(text_feats, dim=-1)
|
107 |
+
|
108 |
+
sim_i2t = image_embeds @ text_embeds.T
|
109 |
+
sim_t2i = text_embeds @ image_embeds.T
|
110 |
+
|
111 |
+
logits_per_image = sim_i2t / self.itc_loss_fn.temperature
|
112 |
+
logits_per_text = sim_t2i / self.itc_loss_fn.temperature
|
113 |
+
|
114 |
+
if not return_dict:
|
115 |
+
if loss is None:
|
116 |
+
return (logits_per_image, logits_per_text, text_embeds, image_embeds)
|
117 |
+
return (loss, logits_per_image, logits_per_text, text_embeds, image_embeds)
|
118 |
+
|
119 |
+
# TODO:
|
120 |
+
# - Support vision_model_output and text_model_output
|
121 |
+
# - Improve type: torch.Tensor -> torch.FloatTensor
|
122 |
+
return CLYPOutput(
|
123 |
+
loss=loss,
|
124 |
+
logits_per_image=logits_per_image, # type: ignore
|
125 |
+
logits_per_text=logits_per_text, # type: ignore
|
126 |
+
text_embeds=text_embeds, # type: ignore
|
127 |
+
image_embeds=image_embeds, # type: ignore
|
128 |
+
)
|
129 |
+
|
130 |
+
def get_text_features(
|
131 |
+
self,
|
132 |
+
input_ids: Optional[torch.Tensor] = None,
|
133 |
+
attention_mask: Optional[torch.Tensor] = None,
|
134 |
+
position_ids: Optional[torch.Tensor] = None,
|
135 |
+
output_attentions: Optional[bool] = None,
|
136 |
+
output_hidden_states: Optional[bool] = None,
|
137 |
+
return_dict: Optional[bool] = None,
|
138 |
+
) -> torch.FloatTensor:
|
139 |
+
text_feats = self.text_encoder(
|
140 |
+
{
|
141 |
+
"input_ids": input_ids,
|
142 |
+
"attention_mask": attention_mask,
|
143 |
+
"position_ids": position_ids,
|
144 |
+
}
|
145 |
+
)
|
146 |
+
return text_feats
|
147 |
+
|
148 |
+
def get_image_features(
|
149 |
+
self,
|
150 |
+
pixel_values: Optional[torch.FloatTensor] = None,
|
151 |
+
output_attentions: Optional[bool] = None,
|
152 |
+
output_hidden_states: Optional[bool] = None,
|
153 |
+
return_dict: Optional[bool] = None,
|
154 |
+
) -> torch.FloatTensor:
|
155 |
+
image_feats = self.vision_encoder(pixel_values)
|
156 |
+
return image_feats
|
157 |
+
|
158 |
+
|
159 |
+
if __name__ == "__main__":
|
160 |
+
model = CLYPModel.from_pretrained(".")
|
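A hedged end-to-end sketch of using the model through the transformers auto classes. It assumes the auto_map entries in config.json, preprocessor_config.json and tokenizer_config.json wire AutoModel, AutoImageProcessor and AutoTokenizer to the classes defined in this repository, and that the image processor follows the standard call convention; "<this-repo-id>" and "sample.jpg" are placeholders, not real identifiers.

import torch
import torch.nn.functional as F
from PIL import Image
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer

repo = "<this-repo-id>"  # placeholder for this repository's id on the Hub
model = AutoModel.from_pretrained(repo, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
image_processor = AutoImageProcessor.from_pretrained(repo, trust_remote_code=True)

image = Image.open("sample.jpg")  # placeholder image
# Assumed standard image-processor call returning a "pixel_values" tensor.
pixel_values = image_processor(image, return_tensors="pt")["pixel_values"]
text_inputs = tokenizer(["犬の写真", "猫の写真"])  # CLYPTokenizer already returns torch tensors

with torch.no_grad():
    image_feats = model.get_image_features(pixel_values=pixel_values)
    text_feats = model.get_text_features(**text_inputs)

# Cosine similarity between the image and each caption (temperature scaling omitted).
image_feats = F.normalize(image_feats, dim=-1)
text_feats = F.normalize(text_feats, dim=-1)
print((image_feats @ text_feats.T).softmax(dim=-1))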
preprocessor_config.json
ADDED
@@ -0,0 +1,8 @@
1 |
+
{
|
2 |
+
"auto_map": {
|
3 |
+
"AutoImageProcessor": "image_processing_clyp.CLYPImageProcessor"
|
4 |
+
},
|
5 |
+
"image_processor_type": "CLYPImageProcessor",
|
6 |
+
"image_size": 224,
|
7 |
+
"normalization_type": "imagenet"
|
8 |
+
}
|
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
1 |
+
{}
|
spiece.model
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b5cbdfa8aa7c54c8c5af85b78c309c54a5f2749a20468bf6f60eee007fe6fec1
|
3 |
+
size 805634
|
tokenization_clyp.py
ADDED
@@ -0,0 +1,125 @@
1 |
+
# coding=utf-8
|
2 |
+
|
3 |
+
# Copyright 2024 LY Corporation.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
from __future__ import annotations
|
17 |
+
|
18 |
+
from typing import Optional
|
19 |
+
|
20 |
+
import torch
|
21 |
+
from transformers import BatchEncoding, PreTrainedTokenizer, T5Tokenizer
|
22 |
+
from transformers.tokenization_utils_base import (
|
23 |
+
PaddingStrategy,
|
24 |
+
PreTokenizedInput,
|
25 |
+
TextInput,
|
26 |
+
TruncationStrategy,
|
27 |
+
)
|
28 |
+
|
29 |
+
|
30 |
+
class CLYPTokenizer(PreTrainedTokenizer):
|
31 |
+
"""CLYPTokenizer based on rinna/japanese-roberta-base
|
32 |
+
|
33 |
+
This tokenizer is registered as a custom tokenizer to manually add CLS token to each text.
|
34 |
+
"""
|
35 |
+
|
36 |
+
def __init__(self, max_length: int, padding: str, truncation: bool, **kwargs):
|
37 |
+
# tokenizer
|
38 |
+
self.tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-roberta-base")
|
39 |
+
self.tokenizer.do_lower_case = True
|
40 |
+
|
41 |
+
super().__init__(
|
42 |
+
max_length=max_length, padding=padding, truncation=truncation, **kwargs
|
43 |
+
)
|
44 |
+
self.max_length = max_length
|
45 |
+
self.padding = padding
|
46 |
+
self.truncation = truncation
|
47 |
+
|
48 |
+
@property
|
49 |
+
def vocab_size(self):
|
50 |
+
return self.tokenizer.vocab_size
|
51 |
+
|
52 |
+
def get_vocab(self) -> dict[str, int]:
|
53 |
+
return self.tokenizer.get_vocab()
|
54 |
+
|
55 |
+
def save_vocabulary(
|
56 |
+
self, save_directory: str, filename_prefix: Optional[str] = None
|
57 |
+
) -> tuple[str]:
|
58 |
+
return self.tokenizer.save_vocabulary(
|
59 |
+
save_directory, filename_prefix=filename_prefix
|
60 |
+
)
|
61 |
+
|
62 |
+
def _tokenize(self, text, **kwargs):
|
63 |
+
return self.tokenizer._tokenize(text, **kwargs)
|
64 |
+
|
65 |
+
def _convert_token_to_id(self, token):
|
66 |
+
return self.tokenizer._convert_token_to_id(token)
|
67 |
+
|
68 |
+
def _convert_id_to_token(self, index: int) -> str:
|
69 |
+
return self.tokenizer._convert_id_to_token(index)
|
70 |
+
|
71 |
+
def __call__(
|
72 |
+
self,
|
73 |
+
text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput],
|
74 |
+
add_special_tokens: bool = True,
|
75 |
+
padding: bool | str | PaddingStrategy | None = None,
|
76 |
+
truncation: bool | str | TruncationStrategy | None = None,
|
77 |
+
max_length: Optional[int] = None,
|
78 |
+
**kwargs,
|
79 |
+
):
|
80 |
+
if max_length is None:
|
81 |
+
max_length = self.max_length
|
82 |
+
if padding is None:
|
83 |
+
padding = self.padding
|
84 |
+
if truncation is None:
|
85 |
+
truncation = self.truncation
|
86 |
+
|
87 |
+
if add_special_tokens:
|
88 |
+
max_length = max_length - 1
|
89 |
+
|
90 |
+
if not isinstance(text, list):
|
91 |
+
# TODO: Review
|
92 |
+
text = [text]
|
93 |
+
|
94 |
+
out = self.tokenizer(
|
95 |
+
text,
|
96 |
+
max_length=max_length,
|
97 |
+
padding=padding,
|
98 |
+
truncation=truncation,
|
99 |
+
add_special_tokens=False,
|
100 |
+
**kwargs,
|
101 |
+
)
|
102 |
+
|
103 |
+
if add_special_tokens:
|
104 |
+
input_ids = [
|
105 |
+
[self.tokenizer.cls_token_id] + ids for ids in out["input_ids"]
|
106 |
+
]
|
107 |
+
attention_mask = [[1] + am for am in out["attention_mask"]]
|
108 |
+
position_ids = [list(range(0, len(input_ids[0])))] * len(input_ids)
|
109 |
+
else:
|
110 |
+
input_ids = out["input_ids"]
|
111 |
+
attention_mask = out["attention_mask"]
|
112 |
+
position_ids = [list(range(0, len(input_ids[0])))] * len(input_ids)
|
113 |
+
|
114 |
+
# tensor
|
115 |
+
input_ids = torch.tensor(input_ids, dtype=torch.long)
|
116 |
+
attention_mask = torch.tensor(attention_mask, dtype=torch.long)
|
117 |
+
position_ids = torch.tensor(position_ids, dtype=torch.long)
|
118 |
+
|
119 |
+
# return
|
120 |
+
data = {
|
121 |
+
"input_ids": input_ids,
|
122 |
+
"attention_mask": attention_mask,
|
123 |
+
"position_ids": position_ids,
|
124 |
+
}
|
125 |
+
return BatchEncoding(data=data)
|
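A short usage sketch of the custom tokenizer above. Instantiating it downloads the rinna/japanese-roberta-base sentencepiece model, and the constructor arguments mirror tokenizer_config.json below; treat this as a sketch, since the wrapper's behaviour can vary slightly across transformers versions.

from tokenization_clyp import CLYPTokenizer

tokenizer = CLYPTokenizer(max_length=77, padding="longest", truncation=True)
batch = tokenizer(["犬の写真", "小さな猫のスケッチ"])

# Each sequence starts with the manually prepended [CLS] id, and position_ids
# are built starting from 0 for that token.
print(batch["input_ids"].shape, batch["attention_mask"].shape, batch["position_ids"].shape)
print(batch["input_ids"][0][:5])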
tokenizer_config.json
ADDED
@@ -0,0 +1,15 @@
1 |
+
{
|
2 |
+
"added_tokens_decoder": {},
|
3 |
+
"auto_map": {
|
4 |
+
"AutoTokenizer": [
|
5 |
+
"tokenization_clyp.CLYPTokenizer",
|
6 |
+
null
|
7 |
+
]
|
8 |
+
},
|
9 |
+
"clean_up_tokenization_spaces": true,
|
10 |
+
"max_length": 77,
|
11 |
+
"model_max_length": 1000000000000000019884624838656,
|
12 |
+
"padding": "longest",
|
13 |
+
"tokenizer_class": "CLYPTokenizer",
|
14 |
+
"truncation": true
|
15 |
+
}
|