# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

""" Hexagon testing infrastructure """

import tvm
import tvm.testing
from tvm import te
import numpy


def ceildiv(o, d):
    assert o >= 0
    assert d >= 0
    return tvm.tir.floordiv(o + d - 1, d)


# defines inner block shape: 8h8w32c
def get_block_shape():
    return 8, 8, 32


# defines inner filter block shape: 8i32o4i
def get_filter_block_shape():
    return 8, 32, 4


# input: logical shape in nhwc layout
# output: physical packed shape in nhw8h8w32c layout
def get_packed_shape(logical_shape_nhwc):
    assert len(logical_shape_nhwc) == 4
    physical_shape_nhwc8h8w32c = [logical_shape_nhwc[0]]
    block_shape = get_block_shape()
    off_h, off_w, off_c = block_shape
    physical_shape_nhwc8h8w32c.append(ceildiv(logical_shape_nhwc[1], off_h))
    physical_shape_nhwc8h8w32c.append(ceildiv(logical_shape_nhwc[2], off_w))
    physical_shape_nhwc8h8w32c.append(ceildiv(logical_shape_nhwc[3], off_c))
    physical_shape_nhwc8h8w32c.extend(block_shape)
    return physical_shape_nhwc8h8w32c


# input: physical packed shape in nhw8h8w32c layout
# output: logical shape in nhwc layout
def get_logical_shape(physical_shape_nhwc8h8w32c):
    assert len(physical_shape_nhwc8h8w32c) == 7
    logical_shape_nhwc = [physical_shape_nhwc8h8w32c[0]]
    logical_shape_nhwc.append(physical_shape_nhwc8h8w32c[1] * physical_shape_nhwc8h8w32c[4])
    logical_shape_nhwc.append(physical_shape_nhwc8h8w32c[2] * physical_shape_nhwc8h8w32c[5])
    logical_shape_nhwc.append(physical_shape_nhwc8h8w32c[3] * physical_shape_nhwc8h8w32c[6])
    return logical_shape_nhwc


# input: logical shape in oihw layout
# output: physical packed shape in oihw8i32o4i layout
def get_packed_filter_shape(logical_shape_oihw):
    assert len(logical_shape_oihw) == 4
    filter_block_shape = get_filter_block_shape()
    filter_Cio, filter_Ki, filter_Cii = filter_block_shape
    filter_Ci = filter_Cio * filter_Cii
    physical_shape_oihw8i32o4i = []
    physical_shape_oihw8i32o4i.append(int(ceildiv(logical_shape_oihw[0], filter_Ki)))
    physical_shape_oihw8i32o4i.append(int(ceildiv(logical_shape_oihw[1], filter_Ci)))
    physical_shape_oihw8i32o4i.append(logical_shape_oihw[2])
    physical_shape_oihw8i32o4i.append(logical_shape_oihw[3])
    physical_shape_oihw8i32o4i.extend(filter_block_shape)
    return physical_shape_oihw8i32o4i


def build_and_run(inputs, func, target, target_host, *args, **kwargs):
    schedule, placeholders, binds = func(*args, **kwargs)

    func = tvm.build(
        schedule, placeholders, target=tvm.target.Target(target, host=target_host), binds=binds
    )
    dev = tvm.device(target)
    tensors = []
    for tensor in inputs:
        tensors.append(tvm.nd.array(tensor, dev))
    # the last placeholder is the output; allocate a zeroed buffer for it
    tensors.append(
        tvm.nd.array(
            numpy.zeros([i.value for i in placeholders[-1].shape], dtype=placeholders[-1].dtype),
            dev,
        )
    )
    func(*tensors)

    return tensors[-1].asnumpy()

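
# A minimal usage sketch of build_and_run (illustrative only: `_make_add_one`
# and the concrete shapes are hypothetical, not part of this module). It shows
# the contract build_and_run assumes: `func` returns a (schedule, placeholders,
# binds) triple whose last placeholder is the output tensor.
def _example_build_and_run_usage():
    def _make_add_one(shape, dtype):
        a = te.placeholder(shape, dtype=dtype, name="a")
        b = te.compute(shape, lambda *i: a(*i) + 1.0, name="b")
        return te.create_schedule(b.op), [a, b], {}

    data = numpy.zeros((8, 8), dtype="float32")
    return build_and_run([data], _make_add_one, "llvm", "llvm", (8, 8), "float32")
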

def get_conv2d_nhwc_shape(shape_nhwc, kernel_size, strides, padding, dilation, out_channels):
    assert len(shape_nhwc) == 4
    kernel = []
    kernel.append((kernel_size[0] - 1) * dilation[0] + 1)
    kernel.append((kernel_size[1] - 1) * dilation[1] + 1)
    return (
        shape_nhwc[0],
        (shape_nhwc[1] - kernel[0] + padding[0] + padding[1]) // strides[0] + 1,
        (shape_nhwc[2] - kernel[1] + padding[2] + padding[3]) // strides[1] + 1,
        out_channels,
    )


def conv2d_verify(output, ref_output, dtype):
    # nhwc8h8w32c -> nhwc
    logical_output_shape = get_logical_shape(output.shape)
    output = output.transpose(0, 1, 4, 2, 5, 3, 6).reshape(logical_output_shape)

    # slice output to match ref_output shape
    # e.g. 8x8 spatial 3x3 filter = 6x6 ref output
    # but still 8x8 output given the blocked layout
    output = output[
        0 : ref_output.shape[0] : 1,
        0 : ref_output.shape[1] : 1,
        0 : ref_output.shape[2] : 1,
        0 : ref_output.shape[3] : 1,
    ]

    if "int" in dtype:
        tol = {"atol": 0, "rtol": 0}
    elif dtype == "float32":
        tol = {"rtol": 1e-4, "atol": 2e-4}
    tvm.testing.assert_allclose(output, ref_output, **tol)


def conv2d_compute(X, filt, pad, stride, dilation):
    block_shape = get_block_shape()
    block_H, block_W, block_C = block_shape
    filter_Cio, filter_Ki, filter_Cii = get_filter_block_shape()
    filter_Ci = filter_Cio * filter_Cii

    shape_filter = filt.shape
    kernel_size = tuple(shape_filter[2:4])
    out_channels = shape_filter[0] * shape_filter[5]

    logical_input_shape = get_logical_shape(X.shape)
    logical_output_shape = get_conv2d_nhwc_shape(
        logical_input_shape,
        kernel_size,
        stride,
        pad,
        dilation,
        out_channels,
    )

    output_shape = get_packed_shape(logical_output_shape)
    n, ho, wo, ko, hi, wi, ki = output_shape
    rh = te.reduce_axis((0, kernel_size[0]), name="rh")
    rw = te.reduce_axis((0, kernel_size[1]), name="rw")
    rc = te.reduce_axis((0, logical_input_shape[3]), name="rc")

    def compute(n, ho, wo, ko, hi, wi, ki):
        # reconstruct the logical output coordinate, apply the stride and
        # reduction offset, then split back into input block id / offset
        h = ho * block_H + hi
        h_contig = h * stride[0] + rh
        h_block_id = h_contig // block_H
        h_block_offset = h_contig % block_H

        w = wo * block_W + wi
        w_contig = w * stride[1] + rw
        w_block_id = w_contig // block_W
        w_block_offset = w_contig % block_W

        c_block_id = rc // block_C
        c_block_offset = rc % block_C

        # decompose the channel reduction axis into the filter's
        # 8i32o4i block coordinates
        rco = rc // filter_Ci
        rcio = (rc % filter_Ci) // filter_Cii
        rcii = rc % filter_Cii

        return te.sum(
            X[
                n,
                h_block_id,
                w_block_id,
                c_block_id,
                h_block_offset,
                w_block_offset,
                c_block_offset,
            ]
            * filt[ko, rco, rh, rw, rcio, ki, rcii],
            axis=[rh, rw, rc],
        )

    return output_shape, compute
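

# Illustrative consumption of conv2d_compute (hypothetical helper and sizes,
# not part of the original file): the returned (output_shape, compute) pair is
# materialized with te.compute, and the result is then scheduled and built by
# the individual tests, e.g. through build_and_run.
def _example_conv2d_compute_usage():
    x = te.placeholder(get_packed_shape([1, 64, 64, 64]), dtype="float32", name="x")
    filt = te.placeholder(get_packed_filter_shape([64, 64, 3, 3]), dtype="float32", name="filt")
    output_shape, compute = conv2d_compute(x, filt, [0, 0, 0, 0], [1, 1], [1, 1])
    y = te.compute(output_shape, compute, name="y")
    return te.create_schedule(y.op), [x, filt, y]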