# cython: profile=True
# cython: cdivision=True
# cython: infer_types=True
cimport cython
from libc.string cimport memcpy, memset
from libc.math cimport exp, sqrt
from libc.stdlib cimport calloc, malloc, free
import math
from collections import defaultdict
import numpy

from ..typedefs cimport weight_t
from .ops import NumpyOps, CupyOps, add_gradient_noise
from .util import get_array_module


def linear_decay(rate, decay, nr_upd):
    return rate * 1. / (1. + decay * nr_upd)


def anneal(rate, decay, decay_steps, nr_upd):
    if decay == 0.0:
        return rate
    else:
        return rate * decay ** (nr_upd / decay_steps)


def Adam(*args, **kwargs):
    return Optimizer(*args, **kwargs)


def SGD(*args, **kwargs):
    kwargs.setdefault('beta1', 0.)
    kwargs.setdefault('beta2', 0.)
    return Optimizer(*args, **kwargs)


class Optimizer(object):
    '''Do various flavours of stochastic gradient descent, with first and
    second order momentum.

    Examples

    * beta1=0., beta2=0.: "vanilla" SGD
    * beta1=0.9, beta2=0.: "Classic momentum"
    * beta1=0.0, beta2=0.2: RMS prop
    * beta1=0.9, beta2=0.999: Adam
    '''
    def __init__(self, ops, lr, L2=1e-4, beta1=0.90, beta2=0.999, eps=1e-08,
                 decay=0.0, decay_steps=5000, b1_decay=0.0, b2_decay=0.0,
                 max_grad_norm=10., gradient_noise=0.0, nesterov=True,
                 L2_is_weight_decay=False, lookahead_k=0, lookahead_alpha=0.5,
                 use_radam=False, use_lars=False):
        self.ops = ops
        self.mom1 = {}
        self.mom2 = {}
        self.slow_weights = {}  # For lookahead
        self.averages = {}
        self.nr_update = defaultdict(int)
        self.last_seen = defaultdict(int)
        self.max_grad_norm = max_grad_norm
        self.alpha = lr
        self.b1 = beta1
        self.b2 = beta2
        self.b1_decay = b1_decay
        self.b2_decay = b2_decay
        self.gradient_noise = gradient_noise
        self.eps = eps
        self.decay = decay
        self.L2 = L2
        self.nesterov = nesterov
        self.decay_steps = decay_steps
        self.L2_is_weight_decay = L2_is_weight_decay
        self.lookahead_k = lookahead_k
        self.lookahead_alpha = lookahead_alpha
        self.use_radam = use_radam
        self.use_lars = use_lars
        self.lars_min = 0
        self.lars_max = 10

    def to_gpu(self):
        self.ops = CupyOps()
        for params in (self.mom1, self.mom2, self.averages):
            for key, value in params.items():
                params[key] = self.ops.xp.asarray(value, dtype=value.dtype)

    def to_cpu(self):
        self.ops = NumpyOps()
        for params in (self.mom1, self.mom2, self.averages):
            for key, value in params.items():
                if hasattr(value, 'get'):
                    params[key] = value.get()

    def lr(self, nr_upd):
        alpha = anneal(self.alpha, self.decay, self.decay_steps, nr_upd)
        if self.b1 == 0. or self.b2 == 0.:
            return alpha
        fix1 = 1. - (self.b1 ** nr_upd)
        fix2 = 1. - (self.b2 ** nr_upd)
        return alpha * numpy.sqrt(fix2) / fix1

    def __call__(self, weights, gradient, lr_scale=1., key=None):
        assert len(gradient) >= 1
        xp = get_array_module(weights)
        if xp is not self.ops.xp:
            if xp is numpy:
                self.ops = NumpyOps()
            else:
                self.ops = CupyOps()
        self.nr_update[key] += 1
        nr_upd = self.nr_update[key]
        if self.L2 != 0 and not self.L2_is_weight_decay:
            gradient += self.L2 * weights
        if self.max_grad_norm:
            self.ops.clip_gradient(gradient, self.max_grad_norm)
        if self.gradient_noise:
            add_gradient_noise(gradient, self.gradient_noise, nr_upd)
        if self.use_radam:
            self._radam(xp, weights, gradient, lr_scale, key, nr_upd)
        elif self.b1 > 0. and self.b2 > 0.:
            self._adam(xp, weights, gradient, lr_scale, key, nr_upd)
        elif self.b1 > 0. and not self.nesterov:
            raise NotImplementedError
        elif self.b1 > 0.:
            self._nesterov(xp, weights, gradient, lr_scale, key)
        elif self.b2 > 0.:
            raise NotImplementedError
        else:
            weights -= lr_scale * self.alpha * gradient
            gradient.fill(0.)
        if self.L2 != 0 and self.L2_is_weight_decay:
            weights -= self.L2 * weights
        if self.lookahead_k and self.nr_update[key] % self.lookahead_k == 0:
            if key not in self.slow_weights:
                self.slow_weights[key] = self.ops.allocate((weights.size,),
                                                           dtype='float32')
            slow = self.slow_weights[key]
            slow += self.lookahead_alpha * (weights - slow)
            weights[:] = slow
        if self.averages is not None:
            if key not in self.averages:
                self.averages[key] = self.ops.allocate((weights.size,),
                                                       dtype='float32')
            self.ops.update_averages(self.averages[key], weights, nr_upd)

    def _radam(self, xp, weights, gradient, lr_scale, key, nr_upd):
        if key not in self.mom1:
            self.mom1[key] = self.ops.allocate(weights.size)
        if key not in self.mom2:
            self.mom2[key] = self.ops.allocate(weights.size)
        beta1 = self.b1
        beta2 = self.b2
        eps = self.eps
        sma_inf = 2 / (1-beta2) - 1
        exp_avg = self.mom1[key]
        exp_avg_sq = self.mom2[key]
        # Decay the first and second moment running average coefficient
        exp_avg *= beta1
        exp_avg += (1-beta1) * gradient
        exp_avg_sq *= beta2
        exp_avg_sq += (1-beta2) * gradient**2
        # Bias correction
        bias_correction1 = 1 - beta1 ** nr_upd
        bias_correction2 = 1 - beta2 ** nr_upd
        # Compute length of SMA
        sma_t = sma_inf - 2 * nr_upd * (1 - bias_correction2) / bias_correction2
        update = self.ops.allocate(weights.shape, dtype="f")
        if sma_t > 4:
            # Variance rectification term
            r_t = math.sqrt(
                (sma_t - 4) * (sma_t - 2) * sma_inf
                / ((sma_inf - 4) * (sma_inf - 2) * sma_t))
            # Adaptive momentum
            update += r_t * (
                (exp_avg / bias_correction1)
                / (self.ops.xp.sqrt(exp_avg_sq / bias_correction2) + eps))
        else:
            # Unadapted momentum
            update += exp_avg / bias_correction1
        if self.use_lars:
            # LARS
            w_norm = self.ops.xp.linalg.norm(weights)
            u_norm = self.ops.xp.linalg.norm(update)
            phi_p = min(max(w_norm, self.lars_min), self.lars_max)
            # Compute the local LR
            if phi_p == 0 or u_norm == 0:
                local_lr = 1
            else:
                local_lr = phi_p / u_norm
            lr = self.alpha * lr_scale * local_lr
        else:
            lr = self.alpha * lr_scale
        weights -= lr * update

    def _nesterov(self, xp, weights, gradient, lr_scale, key):
        # http://cs231n.github.io/neural-networks-3/
        # v_prev = v                        # back this up
        # v = mu * v - lr * gradient        # velocity update stays the same
        # x += -mu * v_prev + (1 + mu) * v  # position update changes form
        # Implement this as
        # x += -mu * v
        # v *= mu
        # v -= lr * gradient
        # x += (1+mu) * v
        lr = self.alpha * lr_scale
        if key not in self.mom1:
            self.mom1[key] = self.ops.allocate(weights.size)
        momentum = self.mom1[key]
        weights += -self.b1 * momentum
        momentum *= self.b1
        momentum -= lr * gradient
        weights += (1+self.b1) * momentum

    def _adam(self, xp, weights, gradient, lr_scale, key, nr_upd):
        if key not in self.mom1:
            self.mom1[key] = self.ops.allocate(weights.size)
        if key not in self.mom2:
            self.mom2[key] = self.ops.allocate(weights.size)
        mom1 = self.mom1[key]
        mom2 = self.mom2[key]
        cdef weight_t lr = self.lr(nr_upd)
        cdef weight_t b1 = linear_decay(self.b1, self.b1_decay, nr_upd)
        cdef weight_t b2 = linear_decay(self.b2, self.b2_decay, nr_upd)
        cdef weight_t eps = self.eps
        self.ops.adam(
            weights, gradient, mom1, mom2, b1, b2, eps, lr * lr_scale)
        gradient.fill(0)
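

# Reference sketch (an assumption for documentation, not the compiled kernel):
# Optimizer._adam delegates the arithmetic to `self.ops.adam`. The function
# below spells out the standard Adam step that call is assumed to perform,
# with the bias correction already folded into `learn_rate` by Optimizer.lr().
# The optimizer itself never calls this helper.
def _adam_reference(weights, gradient, mom1, mom2, b1, b2, eps, learn_rate):
    # Update the running first and second moments of the gradient in place.
    mom1 *= b1
    mom1 += (1 - b1) * gradient
    mom2 *= b2
    mom2 += (1 - b2) * gradient ** 2
    # Scale the first moment by the inverse root of the second moment.
    weights -= learn_rate * mom1 / (numpy.sqrt(mom2) + eps)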
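

# Usage sketch (illustrative only, not part of the library API): one update
# step for a single parameter. The key tuple, array shape and hyper-parameter
# values are invented for the example; NumpyOps and Adam come from this module
# and its imports.
if __name__ == '__main__':
    ops = NumpyOps()
    optimizer = Adam(ops, 0.001, L2=1e-6)  # beta1=0.9, beta2=0.999 defaults
    W = ops.allocate((128,), dtype='float32')
    dW = ops.allocate((128,), dtype='float32')
    dW += 0.1
    # Each call applies one in-place update and zeroes the gradient; `key`
    # identifies the parameter, so momentum and averages are tracked per key.
    optimizer(W, dW, key=('example-layer', 'W'))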