import math

import numpy as np
from scipy.optimize import root

# Seconds in a day; the parameter fits work in per-day FLO units, so this
# converts to and from FLO/s.
day_ratio = 24 * 3600
# Assumed aspect ratio of an "optimally shaped" model: width = 128 * depth.
depth_width_ratio = 128

# Fitted speed constants (k, k1, k2, b, c, layer_base) per GPU; see flo_speed.
constants_per_gpu = {
    "V100": [2.21527743e+07, 1.18538628e+00, 1.43150104e+00, 1.66015023e+00, 1.32808220e+00, 5.91503856e+00],
    "V100 (without tensor cores and cudnn.benchmark)": [1.82997989e+07, 1.05349588e+00, 1.25312127e+00,
                                                        1.67071294e+00, 1.44610885e+00, 5.55824273e+00],
    "P100": [6.01863899e+07, 9.23656025e-01, 1.03230702e+00, 1.46733667e+00, 1.03031298e+00, 5.38021875e+00],
    "P4": [4.84472202e+07, 9.86822195e-01, 1.23474901e+00, 1.38493518e+00, 1.04630858e+00, 1.03572754e+01],
    "K80": [2.58592374e+07, 6.42050890e-01, 7.06115162e-01, 1.44360777e+00, 7.50695980e-01, 6.25951436e+00],
}

# Hourly price per GPU in dollars.
price_per_gpu = {
    "K80": 0.584,
    "P4": 0.689,
    "V100": 2.005,
    "V100 (without tensor cores and cudnn.benchmark)": 2.005,
    "P100": 1.416,
}

optimal_batch_size_per_gpu = {
    "P4": 16,
    "V100": 64,
    "V100 (without tensor cores and cudnn.benchmark)": 64,
    "P100": 64,
    "K80": 16,
}

# One-hot encoding (o0, o1, o2) of the AMP optimization level.
features_per_amp_mode = {
    "O0": (1, 0, 0),
    "O1": (0, 1, 0),
    "O2": (0, 0, 1),
}

# Power draw per GPU in kW.
gpu_consumption = {
    "V100": 119.3495934959e-3,
    "V100 (without tensor cores and cudnn.benchmark)": 119.3495934959e-3,
    "K80": 142.42e-3,
    "P4": 55.27e-3,
    "P100": 139.65e-3,
}

# Carbon intensity of electricity in kg of CO2 per kWh.
co2_intensity = 534 * 1e-3


def flo_speed(features, constants):
    # Fitted training throughput in FLO/s. features is (o0, o1, o2, depth, width,
    # batch_size), where (o0, o1, o2) one-hot encodes the AMP mode; constants is
    # the relevant entry of constants_per_gpu.
    k, k1, k2, b, c, layer_base = constants
    o0, o1, o2, x, y, z = features
    return k * np.power(k1, o1) * np.power(k2, o2) * x / (x + layer_base) * np.power(y, b) * np.power(np.log(z + 1), c)


def param_polynomial(width, depth=None, inner=None):
    # Parameter count as a polynomial in width, with optional explicit depth and
    # inner (feed-forward) dimension; when depth is None it is taken to be
    # width / depth_width_ratio.
    if depth is not None:
        if inner is not None:
            return 5 * depth * (width ** 2) + 2 * depth * (width * inner) + 7 * depth * width + depth * inner \
                   + 3 * width + 3
        else:
            return 7 * depth * (width ** 2) + 8 * depth * width + 3 * width + 3
    else:
        if inner is not None:
            return 5 / depth_width_ratio * (width ** 3) + 2 / depth_width_ratio * (width ** 2 * inner) \
                   + 7 / depth_width_ratio * width ** 2 + width * inner / depth_width_ratio + 3 * width + 3
        else:
            return 7 / depth_width_ratio * (width ** 3) + 8 / depth_width_ratio * (width ** 2) + 3 * width + 3


def optimal_model_shape(width, param_number, base=8):
    # Shape (depth, width) that matches param_number at the assumed aspect
    # ratio, with the width rounded to a multiple of base.
    depth = max(1, math.floor(width / depth_width_ratio))
    poly_params = np.array([depth * 7, depth * 8 + 3, 3 - param_number])
    roots = np.roots(poly_params)
    corresponding_width = int(base * round(max(roots) / base))
    return depth, corresponding_width


def alternate_model_shape(width, param_number, base=8):
    # Deeper alternative to optimal_model_shape for the same parameter budget.
    linear_depth = max(1, math.floor(width / depth_width_ratio))
    depth = max(linear_depth + 1, math.floor(0.3 * width ** 1.25 / depth_width_ratio))
    poly_params = np.array([depth * 7, depth * 8 + 3, 3 - param_number])
    roots = np.roots(poly_params)
    corresponding_width = int(base * round(max(roots) / base))
    return depth, corresponding_width


def hours_to_width(hours, gpu, amp_mode, param_popt):
    # Largest reachable width for a training budget in GPU-hours: solve for the
    # width whose FLO requirement (from the parameter fit) matches the FLOs the
    # GPU can deliver in that time.
    seconds = hours * 3600
    d, e, f = param_popt
    constants = constants_per_gpu[gpu]
    amp_features = features_per_amp_mode[amp_mode]

    def equation_function(width):
        return np.power((param_polynomial(width) - f) / d, 1 / e) / flo_speed(
            (*amp_features, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]),
            constants) * day_ratio - seconds

    width = iterative_solutions(equation_function, initial_guess=128)
    # print("width: {}".format(math.floor(width)))
    # print("depth: {}".format(width / depth_width_ratio))
    # print("param number: {:.4e}".format(param_polynomial(width)))
    speed = flo_speed((*amp_features, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]), constants)
    # print("speed: {:.4e}".format(speed))
    # print("flos from speed: {:.4e}".format(seconds * speed))
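
# Illustrative usage sketch, not part of the original file: pick a shape for a
# parameter budget and check it against param_polynomial. The 1e8 budget and
# the helper name _example_shape_for_budget are hypothetical.
def _example_shape_for_budget(param_budget=1e8):
    # Invert the parameter polynomial for a first width estimate, then snap it
    # to an integer depth and a width that is a multiple of 8.
    width_guess = param_to_width(param_budget)
    depth, width = optimal_model_shape(width_guess, param_budget)
    # param_polynomial(width, depth) should land close to the requested budget.
    return depth, width, param_polynomial(width, depth)
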
    # print("flos from params: {:.4e}".format(np.power((param_polynomial(width) - f) / d, 1 / e) * day_ratio))
    # print("params from flos: {:.4e}".format(np.exp(param_fit(speed * seconds / day_ratio, *param_popt))))
    return width


def iterative_solutions(equation_function, initial_guess):
    # Run the hybrid root-finder, halving the initial guess until the residual
    # is actually small; returns the last attempt if nothing converges.
    while initial_guess > 16:
        solution_array = root(equation_function, np.array([initial_guess]), method="hybr").x
        width = solution_array[0]
        should_be_zero = equation_function(width)
        if np.abs(should_be_zero) < 1e0:
            return width
        else:
            initial_guess *= 0.5
    return width


def width_to_flo(width, d, e, f):
    return np.power((param_polynomial(width) - f) / d, 1 / e) * day_ratio


def loss_fit(x, a, b, c):
    return a * np.power(x, -b) + c


def param_fit(x, d, e, f):
    return np.log(d * np.power(x, e) + f)


def hours_to_dollars(hours, gpu):
    return hours * price_per_gpu[gpu]


def dollars_to_hours(dollars, gpu):
    return dollars / price_per_gpu[gpu]


def hours_to_kWh(hours, gpu):
    return hours * gpu_consumption[gpu]


def hours_to_co2(hours, gpu):
    return hours * gpu_consumption[gpu] * co2_intensity


def loss_to_flo(loss, a, b, c):
    return ((loss - c) / a) ** (-1 / b)


def param_to_flo(param_number, d, e, f):
    return ((param_number - f) / d) ** (1 / e)


def safe_flo_to_param(flo, d, e, f):
    return d * np.power(flo, e) + f


def param_to_width(param_number):
    # Invert param_polynomial (with depth implied by the aspect ratio) by
    # solving the cubic and keeping the largest real root.
    poly_params = np.array([7 / depth_width_ratio, 8 / depth_width_ratio, 3, 3 - param_number])
    roots = np.roots(poly_params)
    real_roots = [np.real(candidate) for candidate in roots if np.abs(np.imag(candidate)) < 1e-5]
    width = max(real_roots)
    return width


def safe_param_to_width(param_number):
    try:
        return param_to_width(param_number)
    except np.linalg.LinAlgError:
        return safe_param_to_width(1.5 * param_number)


def width_to_hours(width, gpu, amp_mode, param_popt):
    # Inverse of hours_to_width: training time needed to reach a given width.
    d, e, f = param_popt
    constants = constants_per_gpu[gpu]
    amp_features = features_per_amp_mode[amp_mode]
    flos_from_params = np.power((param_polynomial(width) - f) / d, 1 / e) * day_ratio
    speed = flo_speed((*amp_features, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]), constants)
    seconds = flos_from_params / speed
    return seconds / 3600


def param_prime(width, depth=None):
    # Derivative of param_polynomial (without inner) with respect to width.
    if depth is not None:
        return 14 * depth * width + 8 * depth + 3
    else:
        return 21 / depth_width_ratio * (width ** 2) + 16 / depth_width_ratio * width + 3


def flo_speed_prime(width, gpu, amp_mode):
    # Derivative of flo_speed with respect to width, at the assumed aspect ratio.
    k, k1, k2, b, c, layer_base = constants_per_gpu[gpu]
    o0, o1, o2 = features_per_amp_mode[amp_mode]
    mult_constant = k * np.power(k1, o1) * np.power(k2, o2) * np.power(np.log(optimal_batch_size_per_gpu[gpu] + 1), c)
    return mult_constant * ((b + 1) * np.power(width, b) / (width + layer_base * depth_width_ratio)
                            - np.power(width, b + 1) / (width + layer_base * depth_width_ratio) ** 2)


# Awful equation; we're trying to find the width for which lowering the width
# actually makes the model less efficient.
def tipping_point(gpu, amp_mode, param_popt):
    d, e, f = param_popt
    o0, o1, o2 = features_per_amp_mode[amp_mode]

    def equation_function(width):
        return np.power((param_polynomial(width) - f) / d, -1) / e * param_prime(width) / d \
               * flo_speed((o0, o1, o2, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]),
                           constants_per_gpu[gpu]) \
               - flo_speed_prime(width, gpu, amp_mode)

    tipping_width = iterative_solutions(equation_function, initial_guess=100)
    return tipping_width
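
# Illustrative sketch, not part of the original file: convert a hypothetical
# 100-hour V100 run into cost, energy, and emissions using the tables above.
def _example_run_footprint(hours=100.0, gpu="V100"):
    return {
        "dollars": hours_to_dollars(hours, gpu),  # hours * hourly price
        "kWh": hours_to_kWh(hours, gpu),  # hours * power draw in kW
        "kg_co2": hours_to_co2(hours, gpu),  # kWh * carbon intensity
    }
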
def update_tip(tip, width, gpu, amp_mode, loss_popt, param_popt):
    # Refresh all derived quantities of the tipping-point dict for a new width.
    a, b, c = loss_popt
    d, e, f = param_popt
    tip["width"] = width
    tip["param_number"] = param_polynomial(width)
    tip["flo"] = np.power((tip["param_number"] - f) / d, 1 / e)
    tip["loss"] = loss_fit(tip["flo"], a, b, c)
    tip["hours"] = width_to_hours(width, gpu, amp_mode, param_popt)
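
# Illustrative end-to-end sketch, not part of the original file. The loss_popt
# and param_popt coefficients below are placeholders standing in for fits
# produced elsewhere; they only demonstrate the call pattern, not realistic
# values.
if __name__ == "__main__":
    example_param_popt = (1.0, 1.0, 0.0)  # hypothetical (d, e, f) for param_fit
    example_loss_popt = (1e3, 0.1, 2.0)  # hypothetical (a, b, c) for loss_fit
    tip = {}
    update_tip(tip, 768, "V100", "O2", example_loss_popt, example_param_popt)
    print(tip)
    print("hours for width 768: {:.2e}".format(width_to_hours(768, "V100", "O2", example_param_popt)))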