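"""Cost estimation for training Transformer models: conversions between model
width, parameter count, compute (FLOs), wall-clock hours, dollar cost, energy
use, and CO2 emissions on several cloud GPUs, based on empirically fitted
speed and scaling curves."""
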
import math
import numpy as np
from scipy.optimize import root

# Seconds per day; the parameter fit works in units of FLOs / day_ratio, so
# FLO totals are rescaled by this factor.
day_ratio = 24 * 3600

# Assumed aspect ratio of the models: width = depth_width_ratio * depth.
depth_width_ratio = 128

# Fitted speed-model constants per GPU: (k, k1, k2, b, c, layer_base),
# consumed by flo_speed below.
constants_per_gpu = {
    "V100": [2.21527743e+07, 1.18538628e+00, 1.43150104e+00, 1.66015023e+00,
             1.32808220e+00, 5.91503856e+00],
    "V100 (without tensor cores and cudnn.benchmark)": [1.82997989e+07, 1.05349588e+00, 1.25312127e+00, 1.67071294e+00,
                                                        1.44610885e+00, 5.55824273e+00],
    "P100": [6.01863899e+07, 9.23656025e-01, 1.03230702e+00, 1.46733667e+00,
             1.03031298e+00, 5.38021875e+00],
    "P4": [4.84472202e+07, 9.86822195e-01, 1.23474901e+00, 1.38493518e+00,
           1.04630858e+00, 1.03572754e+01],
    "K80": [2.58592374e+07, 6.42050890e-01, 7.06115162e-01, 1.44360777e+00,
            7.50695980e-01, 6.25951436e+00]
}

# Price per GPU-hour, in dollars.
price_per_gpu = {
    "K80": 0.584,
    "P4": 0.689,
    "V100": 2.005,
    "V100 (without tensor cores and cudnn.benchmark)": 2.005,
    "P100": 1.416,
}

# Batch size fed to the speed model for each GPU.
optimal_batch_size_per_gpu = {
    "P4": 16,
    "V100": 64,
    "V100 (without tensor cores and cudnn.benchmark)": 64,
    "P100": 64,
    "K80": 16
}

# One-hot encoding (o0, o1, o2) of the mixed-precision (AMP) optimization
# level, apex-style O0/O1/O2.
features_per_amp_mode = {
    "O0": (1, 0, 0),
    "O1": (0, 1, 0),
    "O2": (0, 0, 1)
}

# Power draw per GPU, in kW.
gpu_consumption = {
    "V100": 119.3495934959e-3,
    "V100 (without tensor cores and cudnn.benchmark)": 119.3495934959e-3,
    "K80": 142.42e-3,
    "P4": 55.27e-3,
    "P100": 139.65e-3
}

# Assumed grid carbon intensity: 534 gCO2 per kWh, converted to kg per kWh.
co2_intensity = 534 * 1e-3


def flo_speed(features, constants):
    """Fitted training speed in FLOs per second; features = (o0, o1, o2, depth,
    width, batch_size), where (o0, o1, o2) one-hot encodes the AMP mode (O0 is
    the baseline and does not appear in the formula)."""
    k, k1, k2, b, c, layer_base = constants
    o0, o1, o2, x, y, z = features
    return k * np.power(k1, o1) * np.power(k2, o2) * x / (x + layer_base) * np.power(y, b) * np.power(np.log(z + 1), c)


def param_polynomial(width, depth=None, inner=None):
    """Parameter count as a polynomial in the model dimensions.

    depth defaults to width / depth_width_ratio and inner (the feed-forward
    dimension) to width, in which case both are folded into the coefficients.
    """
    if depth is not None:
        if inner is not None:
            return 5 * depth * (width ** 2) + 2 * depth * (width * inner) + 7 * depth * width + depth * inner + 3 * width + 3
        else:
            return 7 * depth * (width ** 2) + 8 * depth * width + 3 * width + 3
    else:
        if inner is not None:
            # Same expression as above with depth = width / depth_width_ratio substituted in.
            return 5 / depth_width_ratio * (width ** 3) + 2 / depth_width_ratio * (width ** 2 * inner) + 7 / depth_width_ratio * (width ** 2) + width * inner / depth_width_ratio + 3 * width + 3
        else:
            return 7 / depth_width_ratio * (width ** 3) + 8 / depth_width_ratio * (width ** 2) + 3 * width + 3
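
# For example, with the implicit depth = width / 128, a width-1024 model has
# param_polynomial(1024) = 7/128 * 1024**3 + 8/128 * 1024**2 + 3 * 1024 + 3
# = 58,788,867 (about 5.9e7) parameters.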


def optimal_model_shape(width, param_number, base=8):
    """Nearest model shape with depth = width / depth_width_ratio that has
    roughly param_number parameters; width is rounded to a multiple of base."""
    depth = max(1, math.floor(width / depth_width_ratio))
    # Solve param_polynomial(w, depth) = param_number, a quadratic in w.
    poly_params = np.array([depth * 7, depth * 8 + 3, 3 - param_number])
    roots = np.roots(poly_params)
    corresponding_width = int(base * round(max(roots) / base))
    return depth, corresponding_width


def alternate_model_shape(width, param_number, base=8):
    """Like optimal_model_shape, but at least one layer deeper than the linear
    aspect ratio, with depth growing as width ** 1.25."""
    linear_depth = max(1, math.floor(width / depth_width_ratio))
    depth = max(linear_depth + 1, math.floor(0.3 * width ** 1.25 / depth_width_ratio))
    poly_params = np.array([depth * 7, depth * 8 + 3, 3 - param_number])
    roots = np.roots(poly_params)
    corresponding_width = int(base * round(max(roots) / base))
    return depth, corresponding_width


def hours_to_width(hours, gpu, amp_mode, param_popt):
    """Width whose estimated training time on the given GPU and AMP mode
    matches the budget of `hours`, under the fitted parameter curve."""
    seconds = hours * 3600
    d, e, f = param_popt
    constants = constants_per_gpu[gpu]
    amp_features = features_per_amp_mode[amp_mode]

    # Training time (FLOs needed at this width / achievable speed) minus budget.
    def equation_function(width):
        return np.power((param_polynomial(width) - f) / d, 1 / e) / flo_speed(
            (*amp_features, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]),
            constants) * day_ratio - seconds

    return iterative_solutions(equation_function, initial_guess=128)


def iterative_solutions(equation_function, initial_guess):
    # Retry the root search from progressively smaller starting points until
    # the residual is close enough to zero; return the last attempt as a fallback.
    width = initial_guess
    while initial_guess > 16:
        solution_array = root(equation_function, np.array([initial_guess]), method="hybr").x
        width = solution_array[0]
        should_be_zero = equation_function(width)
        if np.abs(should_be_zero) < 1e0:
            return width
        initial_guess *= 0.5
    return width


def width_to_flo(width, d, e, f):
    # Total FLOs needed to train a model of this width, via the inverse parameter fit.
    return np.power((param_polynomial(width) - f) / d, 1 / e) * day_ratio


def loss_fit(x, a, b, c):
    # Power-law-plus-constant fit of final loss against compute.
    return a * np.power(x, -b) + c


def param_fit(x, d, e, f):
    # Log of a power-law fit of parameter count against compute (in FLOs / day_ratio).
    return np.log(d * np.power(x, e) + f)


def hours_to_dollars(hours, gpu):
    return hours * price_per_gpu[gpu]


def dollars_to_hours(dollars, gpu):
    return dollars / price_per_gpu[gpu]


def hours_to_kWh(hours, gpu):
    return hours * gpu_consumption[gpu]


def hours_to_co2(hours, gpu):
    return hours * gpu_consumption[gpu] * co2_intensity
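
# For example, 100 hours on a V100 draws about 11.9 kWh and, at the assumed
# grid intensity, emits about 6.4 kg of CO2.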


def loss_to_flo(loss, a, b, c):
    # Invert loss_fit: compute needed to reach a target loss.
    return ((loss - c) / a) ** (-1 / b)


def param_to_flo(param_number, d, e, f):
    # Invert the parameter power law: compute corresponding to a parameter count.
    return ((param_number - f) / d) ** (1 / e)


def safe_flo_to_param(flo, d, e, f):
    # Forward parameter power law (param_fit without the log).
    return d * np.power(flo, e) + f


def param_to_width(param_number):
    # Solve the cubic param_polynomial(width) = param_number (with implicit
    # depth = width / depth_width_ratio) and keep the largest real root.
    poly_params = np.array([7 / depth_width_ratio, 8 / depth_width_ratio, 3, 3 - param_number])
    roots = np.roots(poly_params)
    real_roots = [np.real(candidate) for candidate in roots if np.abs(np.imag(candidate)) < 1e-5]
    width = max(real_roots)
    return width


def safe_param_to_width(param_number):
    # np.roots can fail to converge; retry on a slightly larger target until it does.
    try:
        return param_to_width(param_number)
    except np.linalg.LinAlgError:
        return safe_param_to_width(1.5 * param_number)


def width_to_hours(width, gpu, amp_mode, param_popt):
    """Estimated hours to train a model of the given width: FLOs implied by
    the parameter fit, divided by the fitted speed on this GPU and AMP mode."""
    d, e, f = param_popt
    constants = constants_per_gpu[gpu]
    amp_features = features_per_amp_mode[amp_mode]
    flos_from_params = np.power((param_polynomial(width) - f) / d, 1 / e) * day_ratio
    speed = flo_speed((*amp_features, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]), constants)
    seconds = flos_from_params / speed

    return seconds / 3600


def param_prime(width, depth=None):
    # Derivative of param_polynomial with respect to width.
    if depth is not None:
        return 14 * depth * width + 8 * depth + 3
    else:
        return 21 / depth_width_ratio * (width ** 2) + 16 / depth_width_ratio * width + 3


def flo_speed_prime(width, gpu, amp_mode):
    # Derivative of flo_speed with respect to width, with depth = width / depth_width_ratio
    # substituted in: speed is proportional to width ** (b + 1) / (width + layer_base * depth_width_ratio).
    k, k1, k2, b, c, layer_base = constants_per_gpu[gpu]
    o0, o1, o2 = features_per_amp_mode[amp_mode]
    mult_constant = k * np.power(k1, o1) * np.power(k2, o2) * np.power(np.log(optimal_batch_size_per_gpu[gpu] + 1), c)
    return mult_constant * ((b + 1) * np.power(width, b) / (width + layer_base * depth_width_ratio)
                            - np.power(width, b + 1) / (width + layer_base * depth_width_ratio) ** 2)


# Find the width below which shrinking the model no longer makes training faster:
# the zero of d/dwidth log(FLOs(width) / speed(width)), multiplied through by
# speed to keep the root-finding well-behaved.
def tipping_point(gpu, amp_mode, param_popt):
    d, e, f = param_popt
    o0, o1, o2 = features_per_amp_mode[amp_mode]

    def equation_function(width):
        return np.power((param_polynomial(width) - f) / d, -1) / e * param_prime(width) / d \
               * flo_speed((o0, o1, o2, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]),
                           constants_per_gpu[gpu]) - \
               flo_speed_prime(width, gpu, amp_mode)

    return iterative_solutions(equation_function, initial_guess=100)


def update_tip(tip, width, gpu, amp_mode, loss_popt, param_popt):
    # Fill the tip dict in place with the shape, compute, loss, and training
    # time at the tipping-point width.
    a, b, c = loss_popt
    d, e, f = param_popt
    tip["width"] = width
    tip["param_number"] = param_polynomial(width)
    tip["flo"] = np.power((tip["param_number"] - f) / d, 1 / e)
    tip["loss"] = loss_fit(tip["flo"], a, b, c)
    tip["hours"] = width_to_hours(width, gpu, amp_mode, param_popt)