from typing import Any, Dict, List
import os

import torch
import torch.nn as nn
import torchvision.transforms as T
from PIL import Image
from ultralytics import YOLO


class LinearClassifier(nn.Module):
    """Single linear layer mapping DINOv2 embeddings to class logits."""

    def __init__(self, input_dim=384, output_dim=7):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        self.linear.weight.data.normal_(mean=0.0, std=0.01)
        self.linear.bias.data.zero_()

    def forward(self, x):
        return self.linear(x)


class EndpointHandler:
    def __init__(self, path=""):
        # Preload everything needed at inference time.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # DINOv2 ViT-S/14 backbone, used as a frozen feature extractor.
        self.dinov2_vits14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
        self.dinov2_vits14.to(self.device)
        self.dinov2_vits14.eval()
        print('Successfully loaded dinov2_vits14 model')

        # YOLOv8 detector, used to locate the turtle before classification.
        self.yolov8_model = YOLO(os.path.join(path, 'yolov8_2023-07-19_yolov8m.pt'))

        # Linear head trained on DINOv2 embeddings; map_location keeps CPU-only
        # deployments working even if the weights were saved from a GPU.
        self.linear_model = LinearClassifier()
        self.linear_model.load_state_dict(
            torch.load(os.path.join(path, 'linear_2023-07-18_v0.2.pt'),
                       map_location=self.device))
        self.linear_model.to(self.device)
        self.linear_model.eval()

        # Resize to 244 then center-crop to 224, matching the training transform.
        self.transform_image = T.Compose([
            T.ToTensor(),
            T.Resize(244),
            T.CenterCrop(224),
            T.Normalize([0.5], [0.5]),
        ])

        with open(os.path.join(path, 'labels.txt'), 'r') as f:
            self.labels = f.read().strip().split(',')  # loggerhead,green,leatherback...

        # English-to-Vietnamese display names for the seven sea turtle species.
        self.name_en2vi = {
            "loggerhead": "Quản đồng",
            "green": "Vích",
            "leatherback": "Rùa da",
            "hawksbill": "Đồi mồi",
            "kemp_ridley": "Vích Kemp",
            "olive_ridley": "Đồi mồi dứa",
            "flatback": "Rùa lưng phẳng",
        }

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        data args:
            inputs (:obj:`str` | `PIL.Image` | `np.array`)
        Return:
            A :obj:`list` | `dict`: will be serialized and returned
        """
        # Run the detector, then recover the original image (BGR -> RGB).
        result = self.yolov8_model(data['inputs'])
        img = result[0].orig_img[:, :, ::-1]
        H, W, _ = img.shape
        annotated = img.copy()

        # Take the top detection; fall back to the full frame when the
        # detector finds nothing or is not confident enough.
        try:
            x1, y1, x2, y2 = result[0].boxes.xyxy.cpu().numpy().astype('int')[0]
            if result[0].boxes.conf[0].item() < 0.75:  # low-confidence detection
                x1, y1, x2, y2 = 0, 0, W, H
            else:
                annotated = result[0].plot(labels=False, conf=False)[:, :, ::-1]
        except (IndexError, AttributeError):  # no detection at all
            x1, y1, x2, y2 = 0, 0, W, H

        # Expand the shorter side so the crop is square, clamped to the image.
        h, w = y2 - y1, x2 - x1
        offset = abs(h - w) // 2
        if h > w:
            x1 = max(x1 - offset, 0)
            x2 = min(x2 + offset, W)
        else:
            y1 = max(y1 - offset, 0)
            y2 = min(y2 + offset, H)
        cropped = img[y1:y2, x1:x2]

        # Embed the square crop with DINOv2 and classify with the linear head.
        new_image = self.transform_image(Image.fromarray(cropped))[:3].unsqueeze(0)
        with torch.no_grad():
            embedding = self.dinov2_vits14(new_image.to(self.device))
            prediction = self.linear_model(embedding)
        percentage = nn.Softmax(dim=1)(prediction).cpu().numpy().round(2)[0].tolist()

        scores = {}
        for i in range(len(self.labels)):
            scores[self.name_en2vi[self.labels[i]]] = percentage[i]

        # Return the annotated original image with the square crop and the
        # per-species score dict.
        return annotated.tolist(), scores
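

# Example of invoking the handler locally (a minimal sketch, not part of the
# deployed endpoint). It assumes the weight files and labels.txt named above
# sit in the current directory; 'turtle.jpg' is a hypothetical test image.
if __name__ == '__main__':
    handler = EndpointHandler(path='.')
    annotated, scores = handler({'inputs': 'turtle.jpg'})
    # scores maps Vietnamese species names to rounded softmax probabilities,
    # e.g. {'Vích': 0.93, 'Đồi mồi': 0.03, ...}; annotated is the RGB image
    # as nested lists, ready for JSON serialization.
    print(scores)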