File size: 6,740 Bytes
685cc58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import time
import numpy as np
from PIL import Image
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment


class SimpleAffineTransform:
    """
    simple affine transform, only translation and scale.

    The scale is applied about the centroid of the coordinates passed to
    ``__call__`` (not about the origin); the translation is added afterwards.
    """
    def __init__(self, translation=(0, 0), scale=1.0):
        """Store the translation vector (as an ndarray) and the scalar scale."""
        self.translation = np.array(translation)
        self.scale = scale

    def estimate(self, src, dst):
        """Fit translation and scale from two point sets, in place.

        The translation is the difference between the centroids of ``dst``
        and ``src``; the scale is the ratio of the mean point-to-centroid
        distances (``dst`` over ``src``).

        Args:
            src: array-like of points, one point per row.
            dst: array-like of points, one point per row.
        """
        src = np.asarray(src, dtype=float)
        dst = np.asarray(dst, dtype=float)

        src_center = np.mean(src, axis=0)
        dst_center = np.mean(dst, axis=0)
        self.translation = dst_center - src_center

        src_dists = np.linalg.norm(src - src_center, axis=1)
        dst_dists = np.linalg.norm(dst - dst_center, axis=1)
        # The epsilon keeps the ratio finite when all src points coincide.
        self.scale = np.mean(dst_dists) / (np.mean(src_dists) + 1e-10)

    def inverse(self):
        """Return a new transform with negated translation and reciprocal scale.

        Applying the returned transform to the output of this one recovers
        the original coordinates, because the output centroid is the input
        centroid shifted by ``translation``.
        """
        # The original referenced an undefined name (`AffineTransform`);
        # the inverse of this transform is another SimpleAffineTransform.
        return SimpleAffineTransform(-self.translation, 1.0 / self.scale)

    def __call__(self, coords):
        """Scale ``coords`` about their own centroid, then translate them."""
        coords = np.asarray(coords, dtype=float)
        center = np.mean(coords, axis=0)
        return self.scale * (coords - center) + center + self.translation

    def residuals(self, src, dst):
        """Return the per-point Euclidean distance between ``self(src)`` and ``dst``."""
        diff = self(src) - np.asarray(dst, dtype=float)
        return np.sqrt(np.sum(diff ** 2, axis=1))


def norm_coords(x, left, right):
    """Clamp ``x`` to the range bounded by ``left`` and ``right``.

    The lower bound is checked first: a value below ``left`` yields ``left``,
    otherwise a value above ``right`` yields ``right``, otherwise ``x`` is
    returned unchanged.
    """
    return left if x < left else (right if x > right else x)

def _is_delimiter_tail(remainder):
    """Return True unless ``remainder`` continues a longer command name.

    ``remainder`` is the text following a sizing prefix such as ``\\left`` or
    ``\\big``. A leading letter means the prefix was really the start of a
    different command (``\\leftarrow``, ``\\bigcup``), not a delimiter modifier.
    """
    return not remainder[:1].isalpha()


def norm_same_token(token):
    """Map a LaTeX token to a canonical form so equivalent tokens compare equal.

    Steps, in order:
      1. Direct synonyms from a fixed table (e.g. ``\\to`` -> ``\\rightarrow``).
      2. ``\\left`` / ``\\right`` modifiers are stripped from delimiters.
      3. ``\\big`` / ``\\Big`` style modifiers are stripped from delimiters.
      4. A handful of families are collapsed (``\\leq`` -> ``\\le``, norm and
         absolute-value bars, arrows by suffix, ``\\wide*`` and ``\\var*``
         variants to their base command).

    Args:
        token: a single LaTeX token string.

    Returns:
        The normalised token string; tokens matching no rule are returned
        unchanged.
    """
    special_map = {
        "\\cdot": ".",
        "\\mid": "|",
        "\\to": "\\rightarrow",
        "\\top": "T",
        "\\Tilde": "\\tilde",
        "\\cdots": "\\dots",
        "\\prime": "'",
        "\\ast": "*",
        "\\left<": "\\langle",
        "\\right>": "\\rangle",
    }
    token = special_map.get(token, token)

    # Strip \left / \right only when they modify a delimiter. Previously
    # \leftarrow and \rightarrow were both reduced to "arrow" here, which
    # conflated them and bypassed the arrow rules below.
    modifier_tail = None
    if token.startswith("\\left"):
        modifier_tail = token[len("\\left"):]
    elif token.startswith("\\right"):
        modifier_tail = token[len("\\right"):]
    if modifier_tail is not None and _is_delimiter_tail(modifier_tail):
        token = token.replace("\\left", "").replace("\\right", "")

    if token.startswith(("\\big", "\\Big")):
        # Skip the optional second "g" (\bigg, \Bigg) and the optional
        # l/r/m suffix to find what the sizing command is applied to.
        sized = token[4:]
        if sized.startswith("g"):
            sized = sized[1:]
        if sized[:1] in ("l", "r", "m"):
            sized = sized[1:]
        # Big operators such as \bigcup or \bigoplus continue with letters
        # and must be left intact (they used to collapse to their last char).
        if _is_delimiter_tail(sized):
            if "\\" in token[4:]:
                # Delimiter is itself a command, e.g. \Bigl\{ -> \{
                token = "\\" + token[4:].split("\\")[-1]
            else:
                # Delimiter is a single character, e.g. \big( -> (
                token = token[-1]

    if token in ['\\leq', '\\geq']:
        return token[0:-1]
    if token in ['\\lVert', '\\rVert', '\\Vert']:
        return '\\|'
    if token in ['\\lvert', '\\rvert', '\\vert']:
        return '|'
    if token.endswith("rightarrow"):
        return "\\rightarrow"
    if token.endswith("leftarrow"):
        return "\\leftarrow"
    if token.startswith('\\wide'):
        return token.replace("wide", "")
    if token.startswith('\\var'):
        # Keep the leading backslash so \varphi normalises to \phi (it used
        # to become the bare word "phi", which never equals \phi).
        return "\\" + token[len("\\var"):]
    return token


class HungarianMatcher:
    """Match ground-truth boxes to predicted boxes with the Hungarian algorithm.

    The pairwise cost is a weighted sum of three terms: token mismatch,
    L1 distance between normalised bounding boxes, and L1 distance between
    normalised sequence positions. Each box is a dict with at least the keys
    ``'token'`` and ``'bbox'``.
    """

    def __init__(
        self,
        cost_token: float = 1,
        cost_position: float = 0.05,
        cost_order: float = 0.15,
    ):
        """Store the weights of the three cost terms.

        Args:
            cost_token: weight of the token mismatch cost.
            cost_position: weight of the bounding-box L1 cost.
            cost_order: weight of the sequence-order L1 cost.
        """
        self.cost_token = cost_token
        self.cost_position = cost_position
        self.cost_order = cost_order
        # Unweighted cost matrices of the most recent __call__, keyed by
        # "token", "position" and "order".
        self.cost = {}

    def calculate_token_cost_old(self, box_gt, box_pred):
        """Reference (nested-loop) version of ``calculate_token_cost``.

        Returns:
            Array of shape ``(len(box_gt), len(box_pred))`` holding 0 for
            identical tokens, 0.05 for tokens equal after ``norm_same_token``
            and 1 otherwise.
        """
        token_cost = np.ones((len(box_gt), len(box_pred)))
        for i, gt in enumerate(box_gt):
            for j, pred in enumerate(box_pred):
                if gt['token'] == pred['token']:
                    token_cost[i, j] = 0
                elif norm_same_token(gt['token']) == norm_same_token(pred['token']):
                    token_cost[i, j] = 0.05
        return token_cost

    @staticmethod
    def _pairwise_equal(gt_values, pred_values):
        """Return a ``(len(gt_values), len(pred_values))`` boolean equality matrix."""
        # Map every distinct value to an integer id so that the comparison
        # can be done with one broadcast instead of a Python double loop.
        vocab = {}
        gt_ids = np.array(
            [vocab.setdefault(v, len(vocab)) for v in gt_values], dtype=np.intp
        )
        pred_ids = np.array(
            [vocab.setdefault(v, len(vocab)) for v in pred_values], dtype=np.intp
        )
        return gt_ids[:, None] == pred_ids[None, :]

    def calculate_token_cost(self, box_gt, box_pred):
        """Compute the token mismatch cost between every gt/pred pair.

        Args:
            box_gt: list of ground-truth box dicts (uses key ``'token'``).
            box_pred: list of predicted box dicts (uses key ``'token'``).

        Returns:
            Float array of shape ``(len(box_gt), len(box_pred))`` holding 0
            for identical tokens, 0.05 for tokens that are only equal after
            ``norm_same_token`` and 1 otherwise. Empty inputs give an empty
            matrix instead of raising.
        """
        gt_tokens = [box['token'] for box in box_gt]
        pred_tokens = [box['token'] for box in box_pred]

        exact_match = self._pairwise_equal(gt_tokens, pred_tokens)
        norm_match = self._pairwise_equal(
            [norm_same_token(tok) for tok in gt_tokens],
            [norm_same_token(tok) for tok in pred_tokens],
        )

        token_cost = np.ones(exact_match.shape)
        token_cost[norm_match] = 0.05
        # Written last so an exact match always wins over a normalised one.
        token_cost[exact_match] = 0.0
        return token_cost

    def box2array(self, box_list, size):
        """Convert box dicts to an ``(N, 4)`` array of size-normalised corners.

        Args:
            box_list: list of dicts whose ``'bbox'`` is
                ``(x_min, y_min, x_max, y_max)``.
            size: pair unpacked as ``(W, H)``; x values are divided by ``W``
                and y values by ``H`` (presumably image width and height).
        """
        W, H = size
        rows = []
        for box in box_list:
            x_min, y_min, x_max, y_max = box['bbox']
            rows.append([x_min / W, y_min / H, x_max / W, y_max / H])
        # reshape keeps the array 2-D even when box_list is empty.
        return np.array(rows, dtype=float).reshape(-1, 4)

    def order2array(self, box_list):
        """Return an ``(N, 1)`` array with each box's index divided by ``N``."""
        total = len(box_list)
        positions = [[idx / total] for idx in range(total)]
        return np.array(positions, dtype=float).reshape(-1, 1)

    def calculate_l1_cost(self, gt_array, pred_array):
        """Return the pairwise L1 distance divided by the feature dimension.

        Args:
            gt_array: 2-D array, one ground-truth feature vector per row.
            pred_array: 2-D array, one predicted feature vector per row.
        """
        scale = gt_array.shape[-1]
        l1_cost = cdist(gt_array, pred_array, 'minkowski', p=1)
        return l1_cost / scale

    def __call__(self, box_gt, box_pred, gt_size, pred_size):
        """Find the minimum-cost one-to-one assignment between gt and pred boxes.

        Args:
            box_gt: list of ground-truth box dicts.
            box_pred: list of predicted box dicts.
            gt_size: ``(W, H)`` used to normalise the ground-truth boxes.
            pred_size: ``(W, H)`` used to normalise the predicted boxes.

        Returns:
            List of ``(gt_index, pred_index)`` tuples. Empty when either
            input list is empty. The unweighted cost matrices are kept in
            ``self.cost`` as a side effect.
        """
        num_gt, num_pred = len(box_gt), len(box_pred)
        if num_gt == 0 or num_pred == 0:
            # Nothing can be matched; the regular path would fail inside
            # cdist, so record empty cost matrices and stop here.
            for key in ("token", "position", "order"):
                self.cost[key] = np.zeros((num_gt, num_pred))
            return []

        gt_box_array = self.box2array(box_gt, gt_size)
        pred_box_array = self.box2array(box_pred, pred_size)
        gt_order_array = self.order2array(box_gt)
        pred_order_array = self.order2array(box_pred)

        token_cost = self.calculate_token_cost(box_gt, box_pred)
        position_cost = self.calculate_l1_cost(gt_box_array, pred_box_array)
        order_cost = self.calculate_l1_cost(gt_order_array, pred_order_array)

        self.cost["token"] = token_cost
        self.cost["position"] = position_cost
        self.cost["order"] = order_cost

        cost = (
            self.cost_token * token_cost
            + self.cost_position * position_cost
            + self.cost_order * order_cost
        )
        # linear_sum_assignment rejects NaN and cannot use all-inf rows, so
        # non-finite entries are replaced by a large finite penalty.
        cost[~np.isfinite(cost)] = 100
        gt_idx, pred_idx = linear_sum_assignment(cost)
        return list(zip(gt_idx, pred_idx))