---
license: apache-2.0
---

Finetuned from p1atdev/siglip-tagger-test-3

https://huggingface.co/p1atdev/siglip-tagger-test-3

test work

Usage:

```python
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt  # FIX: plt was used below but never imported
from dataclasses import dataclass

from PIL import Image
from transformers import (
    SiglipVisionModel,
    SiglipPreTrainedModel,
    SiglipVisionConfig,
    AutoImageProcessor,
)
from transformers.utils import ModelOutput


@dataclass
class SiglipForImageClassifierOutput(ModelOutput):
    """Output container for the tagger: loss, per-tag logits, and the
    vision tower's pooled/intermediate outputs."""

    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    pooler_output: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None


class SiglipForImageClassification(SiglipPreTrainedModel):
    """SigLIP vision tower with a linear multi-label classification head.

    Trained with ``BCEWithLogitsLoss``, i.e. each label gets an
    independent logit (multi-label tagging, not softmax classification).
    """

    config_class = SiglipVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config):
        super().__init__(config)
        self.siglip = SiglipVisionModel(config)

        # Classifier head: one logit per label; identity when the config
        # declares no labels (feature-extraction use).
        self.classifier = (
            nn.Linear(config.hidden_size, config.num_labels)
            if config.num_labels > 0
            else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        labels: torch.LongTensor | None = None,
    ):
        """Run the vision tower and project the pooled output to tag logits.

        Returns a ``SiglipForImageClassifierOutput``; ``loss`` is only
        populated when ``labels`` is given.
        """
        outputs = self.siglip(pixel_values)
        pooler_output = outputs.pooler_output
        logits = self.classifier(pooler_output)

        loss = None
        if labels is not None:
            # Multi-label objective: independent sigmoid per tag.
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)

        return SiglipForImageClassifierOutput(
            loss=loss,
            logits=logits,
            pooler_output=outputs.pooler_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


# Load model configuration and weights
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = SiglipVisionConfig.from_pretrained("cella110n/siglip-tagger-FT3ep")
processor = AutoImageProcessor.from_pretrained(
    "cella110n/siglip-tagger-FT3ep", config=config
)
model = SiglipForImageClassification.from_pretrained(
    "cella110n/siglip-tagger-FT3ep", torch_dtype=torch.bfloat16
).to(device)
model.eval()
print("Model Loaded. device:", model.device)

# Load and preprocess the input image
img_path = "path/to/image"
img = Image.open(img_path)  # FIX: removed stray trailing "." (syntax error)
inputs = processor(images=img, return_tensors="pt")
print("Image processed.")

# Visualize the preprocessed pixel values
# (values are normalized by the processor, so colors may look off)
vis = inputs.pixel_values[0].permute(1, 2, 0).cpu().numpy()
plt.imshow(vis)
plt.axis("off")
plt.show()

# Run inference
with torch.no_grad():
    logits = (
        model(**inputs.to(model.device, model.dtype))
        .logits.detach()
        .cpu()
        .float()
    )

# Clip logits to avoid overflow, as in the original snippet.
# NOTE(review): clipping raw logits to [0, 1] does not yield probabilities;
# for a BCE-trained head, torch.sigmoid(logits) is the conventional
# transform — confirm the intended post-processing.
logits = np.clip(logits, 0.0, 1.0)

prob_cutoff = 0.3  # keep only tags whose score exceeds this threshold
result = {}
for prediction in logits:
    for i, prob in enumerate(prediction):
        if prob.item() > prob_cutoff:
            result[model.config.id2label[i]] = prob.item()

# Show tags sorted by descending score
sorted_result = sorted(result.items(), key=lambda x: x[1], reverse=True)
sorted_result
```