# cython: profile=True
# cython: cdivision=True
# cython: infer_types=True
cimport cython
from libc.string cimport memcpy, memset
from libc.math cimport exp, sqrt
from libc.stdlib cimport calloc, malloc, free
from collections import defaultdict
import numpy

from ..typedefs cimport weight_t
from .ops import NumpyOps, CupyOps, add_gradient_noise
from .util import get_array_module


def linear_decay(rate, decay, nr_upd):
    return rate * 1. / (1. + decay * nr_upd)


def anneal(rate, decay, decay_steps, nr_upd):
    if decay == 0.0:
        return rate
    else:
        return rate * decay ** (nr_upd / decay_steps)


def Adam(*args, **kwargs):
    return Optimizer(*args, **kwargs)


def SGD(*args, **kwargs):
    kwargs.setdefault('beta1', 0.)
    kwargs.setdefault('beta2', 0.)
    return Optimizer(*args, **kwargs)


class Optimizer(object):
    '''Do various flavours of stochastic gradient descent, with first and
    second order momentum.

    Examples
    * beta1=0., beta2=0.: "vanilla" SGD
    * beta1=0.9, beta2=0.: "Classic momentum"
    * beta1=0.0, beta2=0.2: RMS prop
    * beta1=0.9, beta2=0.999: Adam
    '''
    def __init__(self, ops, lr, L2=1e-4, beta1=0.90, beta2=0.999, eps=1e-08,
                 decay=0.0, decay_steps=5000, b1_decay=0.0, b2_decay=0.0,
                 max_grad_norm=10., gradient_noise=0.0, nesterov=True,
                 L2_is_weight_decay=False):
        self.ops = ops
        self.mom1 = {}
        self.mom2 = {}
        self.averages = {}
        self.nr_update = defaultdict(int)
        self.last_seen = defaultdict(int)
        self.max_grad_norm = max_grad_norm
        self.alpha = lr
        self.b1 = beta1
        self.b2 = beta2
        self.b1_decay = b1_decay
        self.b2_decay = b2_decay
        self.gradient_noise = gradient_noise
        self.eps = eps
        self.decay = decay
        self.L2 = L2
        self.nesterov = nesterov
        self.decay_steps = decay_steps
        self.L2_is_weight_decay = L2_is_weight_decay

    def to_gpu(self):
        self.ops = CupyOps()
        for params in (self.mom1, self.mom2, self.averages):
            for key, value in params.items():
                params[key] = self.ops.xp.asarray(value, dtype=value.dtype)

    def to_cpu(self):
        self.ops = NumpyOps()
        for params in (self.mom1, self.mom2, self.averages):
            for key, value in params.items():
                if hasattr(value, 'get'):
                    params[key] = value.get()

    def lr(self, nr_upd):
        alpha = anneal(self.alpha, self.decay, self.decay_steps, nr_upd)
        if self.b1 == 0. or self.b2 == 0.:
            return alpha
        # Adam bias correction: compensate for the zero-initialised first and
        # second moment estimates during the early updates.
        fix1 = 1. - (self.b1 ** nr_upd)
        fix2 = 1. - (self.b2 ** nr_upd)
        return alpha * numpy.sqrt(fix2) / fix1

    def __call__(self, weights, gradient, lr_scale=1., key=None):
        assert len(gradient) >= 1
        xp = get_array_module(weights)
        if xp is not self.ops.xp:
            if xp is numpy:
                self.ops = NumpyOps()
            else:
                self.ops = CupyOps()
        self.nr_update[key] += 1
        nr_upd = self.nr_update[key]
        if self.L2 != 0 and not self.L2_is_weight_decay:
            gradient += self.L2 * weights
        if self.max_grad_norm:
            self.ops.clip_gradient(gradient, self.max_grad_norm)
        if self.gradient_noise:
            add_gradient_noise(gradient, self.gradient_noise, nr_upd)
        # Dispatch on the momentum settings: Adam, Nesterov momentum, or
        # plain SGD. Classic (non-Nesterov) momentum and RMS prop alone are
        # not implemented.
        if self.b1 > 0. and self.b2 > 0.:
            self._adam(xp, weights, gradient, lr_scale, key, nr_upd)
        elif self.b1 > 0. and not self.nesterov:
            raise NotImplementedError
        elif self.b1 > 0.:
            self._nesterov(xp, weights, gradient, lr_scale, key)
        elif self.b2 > 0.:
            raise NotImplementedError
        else:
            weights -= lr_scale * self.alpha * gradient
        gradient.fill(0.)
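        # With L2_is_weight_decay, the penalty was *not* folded into the
        # gradient above; instead the weights are shrunk directly after the
        # update, i.e. decoupled weight decay in the style of AdamW.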
        if self.L2 != 0 and self.L2_is_weight_decay:
            weights -= self.L2 * weights
        if self.averages is not None:
            if key not in self.averages:
                self.averages[key] = self.ops.allocate((weights.size,),
                                                       dtype='float32')
            self.ops.update_averages(self.averages[key], weights, nr_upd)

    def _nesterov(self, xp, weights, gradient, lr_scale, key):
        # http://cs231n.github.io/neural-networks-3/
        # v_prev = v                        # back this up
        # v = mu * v - lr * gradient        # velocity update stays the same
        # x += -mu * v_prev + (1 + mu) * v  # position update changes form
        # Implement this as
        # x += -mu * v
        # v *= mu
        # v -= lr * gradient
        # x += (1+mu) * v
        lr = self.alpha * lr_scale
        if key not in self.mom1:
            self.mom1[key] = self.ops.allocate(weights.size)
        momentum = self.mom1[key]
        weights += -self.b1 * momentum
        momentum *= self.b1
        momentum -= lr * gradient
        weights += (1 + self.b1) * momentum

    def _adam(self, xp, weights, gradient, lr_scale, key, nr_upd):
        if key not in self.mom1:
            self.mom1[key] = self.ops.allocate(weights.size)
        if key not in self.mom2:
            self.mom2[key] = self.ops.allocate(weights.size)
        mom1 = self.mom1[key]
        mom2 = self.mom2[key]
        cdef weight_t lr = self.lr(nr_upd)
        cdef weight_t b1 = linear_decay(self.b1, self.b1_decay, nr_upd)
        cdef weight_t b2 = linear_decay(self.b2, self.b2_decay, nr_upd)
        cdef weight_t eps = self.eps
        self.ops.adam(
            weights, gradient, mom1, mom2, b1, b2, eps, lr * lr_scale)
        gradient.fill(0)
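

# Minimal usage sketch: drives the Optimizer defined above with NumpyOps,
# using only the allocate/__call__ interfaces this module itself relies on.
# The parameter shape and the ('embed', 'W') key are illustrative assumptions,
# not part of any fixed API; Adam(ops, 0.001) simply forwards to Optimizer.
def _usage_sketch():
    ops = NumpyOps()
    optimizer = Adam(ops, 0.001, L2=1e-6)
    W = ops.allocate((300,))        # parameter array, float32, zero-initialised
    dW = ops.allocate((300,)) + 1.  # stand-in gradient for a single update
    # Updates W in place (Adam step with bias-corrected learning rate, L2
    # penalty and gradient clipping), then zeroes dW for the next batch.
    optimizer(W, dW, key=('embed', 'W'))
    return W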