/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file src/relay/qnn/op/dense.cc
 * \brief Property def of qnn dense operator.
 */

#include <tvm/relay/base.h>
#include <tvm/relay/op.h>
#include <tvm/relay/op_attr_types.h>
#include <tvm/relay/qnn/attrs.h>

#include "../../op/nn/nn.h"
#include "../../transforms/pattern_utils.h"
#include "../utils.h"

namespace tvm {
namespace relay {
namespace qnn {

// relay.op.qnn.dense

bool QnnDenseRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                 const TypeReporter& reporter) {
  // Expected Types: data, weight, input_zero_point, weight_zero_point, input_scale, weight_scale,
  // out_type
  ICHECK_EQ(types.size(), 7);
  const auto* data = types[0].as<TensorTypeNode>();
  const auto* weight = types[1].as<TensorTypeNode>();
  if (data == nullptr || weight == nullptr) return false;
  const auto* param = attrs.as<DenseAttrs>();
  ICHECK(param != nullptr) << "DenseAttrs cannot be nullptr.";
  ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8))
      << "Expected quantized dense type(int8, uint8) for input but was " << data->dtype;
  ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8))
      << "Expected quantized dense type(int8, uint8) for weight but was " << weight->dtype;
  ICHECK(param->out_dtype == DataType::Int(32))
      << "Expected quantized dense type(int32) for output but was " << param->out_dtype;

  // Check the types of scale and zero points.
  for (size_t i = 2; i < 5; ++i) {
    if (types[i].as<IncompleteTypeNode>()) {
      return false;
    }
  }
  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // input_zero_point
  ICHECK(IsScalarType(types[4], DataType::Float(32)));  // input_scale
  // weight_zero_point (types[3]) can be a scalar or a vector of the same shape as weight_scale,
  // so it is deliberately not constrained to a scalar here.
  // weight_scale can be a scalar or a vector of length `units`.
  AssignType(types[5], DataType::Float(32), param->units, reporter);  // weight_scale

  ICHECK(param->out_dtype.bits() > 0) << "Output dtype bits should be greater than 0.";

  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
  // Dense infer type function.
  Array<Type> tensor_types = {types[0], types[1], types[6]};
  return MatmulRel<DenseAttrs>(tensor_types, 3, attrs, reporter);
}
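
// Shape illustration (a sketch of what the relation above admits, not an extra
// check): for data of shape (m, input_dim) and weight of shape (units, input_dim),
// type inference resolves roughly to
//   qnn.dense : (int8[m, input_dim], int8[units, input_dim],
//                int32 /*input_zero_point*/, int32 /*weight_zero_point*/,
//                float32 /*input_scale*/, float32[units] /*weight_scale*/)
//               -> int32[m, units]
// where weight_scale (and weight_zero_point) may equally be scalar for
// per-tensor quantization.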
// Positional relay function to create quantized dense operator used by frontend FFI.
Expr MakeQuantizedDense(Expr data, Expr weight, Expr input_zero_point, Expr kernel_zero_point,
                        Expr input_scale, Expr kernel_scale, IndexExpr units, DataType out_dtype) {
  auto attrs = make_object<DenseAttrs>();
  attrs->units = std::move(units);
  attrs->out_dtype = out_dtype;
  static const Op& op = Op::Get("qnn.dense");
  return Call(op, {data, weight, input_zero_point, kernel_zero_point, input_scale, kernel_scale},
              Attrs(attrs), {});
}

Expr DenseFirstTerm(const Expr& quantized_data, const Expr& quantized_kernel,
                    const DenseAttrs* attrs) {
  return Dense(quantized_data, quantized_kernel, attrs->units, attrs->out_dtype);
}

Expr DenseSecondTerm(const Expr& quantized_data, const Expr& kernel_zero_point,
                     const int out_dim_size) {
  Array<Integer> axes = {1};
  Expr reduced_t2 = Sum(Cast(quantized_data, DataType::Int(32)), axes, true, false);
  Expr multiplied_t2;
  if (!IsConstScalar(kernel_zero_point)) {
    multiplied_t2 = Multiply(kernel_zero_point, MakeRepeat(reduced_t2, out_dim_size, 1));
  } else {
    multiplied_t2 = Multiply(kernel_zero_point, reduced_t2);
  }
  return multiplied_t2;
}

Expr DenseThirdTerm(const Expr& quantized_kernel, const Expr& input_zero_point) {
  Array<Integer> axes = {1};
  return Multiply(input_zero_point,
                  Sum(Cast(quantized_kernel, DataType::Int(32)), axes, false, false));
}

Expr DenseFourthTerm(int input_zero_point_int, int kernel_zero_point_int,
                     int reduction_dim_size) {
  int32_t scalar_term = input_zero_point_int * kernel_zero_point_int * reduction_dim_size;
  return MakeConstantScalar(DataType::Int(32), scalar_term);
}

Expr DenseFourthTerm(const Expr& input_zero_point, const Expr& kernel_zero_point,
                     int reduction_dim_size) {
  auto reduction_dim = MakeConstantScalar(DataType::Int(32), reduction_dim_size);
  return Multiply(Multiply(input_zero_point, kernel_zero_point), reduction_dim);
}

Expr DenseCombineTerms(const Expr& term1, const Expr& term2, const Expr& term3,
                       const Expr& term4) {
  auto data_term = Subtract(term1, term2);
  // Putting constant terms together, so that constant folding can fold it.
  auto const_term = Subtract(term4, term3);
  return Add(data_term, const_term);
}

/*
 * \brief Forward rewrite the qnn dense op.
 * \param attrs The QNN dense attrs.
 * \param new_args The new mutated args to the call node.
 * \param arg_types The types of input and output.
 * \return The sequence of Relay ops for qnn dense op.
 * \note Lowering of the qnn.dense operator
 *       A quantized tensor is represented in following manner
 *          A = scale_a x (QA - zp_A)
 *       where QA is quantized tensor, scale_a and zp_A are quantization
 *       params.
 *
 *       Quantized dense multiplies two quantized tensors and returns a
 *       quantized tensor of default dtype int32, with a scale equal to the
 *       product of the scales of the input tensors, and a zero point of zero.
 *
 *       The lowering for asymmetric quantized dense looks as follows. More details at
 *       https://discuss.tvm.ai/t/tf-lite-quantized-conv2d-operator-conversion/2651/8
 *       The computation gets unrolled into following 4 terms
 *          C(m, n) = Sigma(k) (A(m, k) * W(n, k))
 *
 *          RHS becomes
 *            Sigma(k) ([QA(m, k) - zp_a] * [QW(n, k) - zp_w])
 *
 *          Unrolling leads to following sequence
 *            Sigma(k) QA(m, k) * QW(n, k)   // Term1
 *          - Sigma(k) zp_w * QA(m, k)       // Term2
 *          - Sigma(k) zp_a * QW(n, k)       // Term3
 *          + Sigma(k) zp_a * zp_w           // Term4
 *
 *       Term3 and Term4 can be computed at compile time.
 */
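/*
 * Worked example of the unrolling above (illustrative numbers, not used by the
 * code): take a single output element with reduction size k = 2, quantized
 * values QA = [3, 5], QW = [2, 4], and zero points zp_a = 1, zp_w = 2. Then
 *   Term1 = 3 * 2 + 5 * 4   = 26
 *   Term2 = zp_w * (3 + 5)  = 16
 *   Term3 = zp_a * (2 + 4)  = 6
 *   Term4 = k * zp_a * zp_w = 4
 * and Term1 - Term2 - Term3 + Term4 = 8, matching the direct computation
 * (3 - 1) * (2 - 2) + (5 - 1) * (4 - 2) = 8. Whenever zp_a or zp_w is zero,
 * the corresponding terms vanish, which is exactly what the canonicalization
 * below exploits to emit the cheapest op sequence.
 */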
Expr QnnDenseCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
                          const Array<tvm::relay::Type>& arg_types) {
  ICHECK_EQ(new_args.size(), 6);
  Expr quantized_data = new_args[0];
  Expr quantized_kernel = new_args[1];
  Expr input_zero_point = new_args[2];
  Expr kernel_zero_point = new_args[3];

  const auto in_shape = get_shape(arg_types[0]);
  const auto w_shape = get_shape(arg_types[1]);
  const int reduction_dim_size = get_const_int(in_shape[1]);
  const int out_dim_size = get_const_int(w_shape[0]);

  const auto* qnn_dense_attrs = attrs.as<DenseAttrs>();

  auto term1 = DenseFirstTerm(quantized_data, quantized_kernel, qnn_dense_attrs);
  auto term2 = DenseSecondTerm(quantized_data, kernel_zero_point, out_dim_size);
  auto term3 = DenseThirdTerm(quantized_kernel, input_zero_point);

  // If either zero point is not a constant scalar, fall back to the full four-term lowering.
  if (!IsConstScalar(input_zero_point) || !IsConstScalar(kernel_zero_point)) {
    auto term4 = DenseFourthTerm(input_zero_point, kernel_zero_point, reduction_dim_size);
    return DenseCombineTerms(term1, term2, term3, term4);
  }

  // Extract the integer zero points.
  auto kernel_zero_point_int = GetScalarFromConstant<int>(kernel_zero_point);
  auto input_zero_point_int = GetScalarFromConstant<int>(input_zero_point);

  // Get all the terms as described in the comments.
  auto term4 = DenseFourthTerm(input_zero_point_int, kernel_zero_point_int, reduction_dim_size);

  // Combine those 4 terms depending on the zero points to get the best lowering.
  if (input_zero_point_int == 0 && kernel_zero_point_int == 0) {
    // term 2, 3 and 4 become zero.
    return term1;
  } else if (input_zero_point_int == 0 && kernel_zero_point_int != 0) {
    // term 3 and term 4 become zero.
    return Subtract(term1, term2);
  } else if (input_zero_point_int != 0 && kernel_zero_point_int == 0) {
    // term 2 and term 4 become zero.
    return Subtract(term1, term3);
  } else {
    return DenseCombineTerms(term1, term2, term3, term4);
  }
}

RELAY_REGISTER_OP("qnn.dense")
    .describe(R"code(Applies a linear transformation: :math:`Y = XW^T`.
- **data**: quantized(int8, uint8) `(x1, x2, ..., xn, input_dim)`
- **weight**: quantized(int8, uint8) `(units, input_dim)`
- **out**: quantized(int32) `(x1, x2, ..., xn, units)`.
)code" TVM_ADD_FILELINE)
    .set_attrs_type<DenseAttrs>()
    .set_num_inputs(6)
    .add_argument("data", "quantized nD Tensor", "Input data.")
    .add_argument("weight", "quantized 2D Tensor", "Weight matrix.")
    .add_argument("input_scale", "Tensor", "The quantization scale of the input tensor.")
    .add_argument("input_zero_point", "Tensor", "The quantization zero_point of the input tensor.")
    .add_argument("weight_scale", "Tensor", "The quantization scale of the weight tensor.")
    .add_argument("weight_zero_point", "Tensor",
                  "The quantization zero_point of the weight tensor.")
    .set_support_level(11)
    .add_type_rel("QDense", QnnDenseRel)
    .set_attr<TNonComputational>("TNonComputational", true)
    .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnDenseCanonicalize);

TVM_REGISTER_GLOBAL("relay.qnn.op._make.dense").set_body_typed(MakeQuantizedDense);

}  // namespace qnn
}  // namespace relay
}  // namespace tvm
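
// Usage sketch (illustrative only; assumes `data`, `weight`, and the quantization
// parameters are pre-built relay Exprs, and that a default-constructed IndexExpr
// leaves `units` to be inferred from the weight shape):
//   using namespace tvm::relay::qnn;
//   Expr out = MakeQuantizedDense(data, weight, input_zero_point, kernel_zero_point,
//                                 input_scale, kernel_scale,
//                                 /*units=*/IndexExpr(), DataType::Int(32));
// This is the same constructor that "relay.qnn.op._make.dense" exposes to the
// frontend through TVM_REGISTER_GLOBAL above.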