# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import tvm from tvm import te import numpy as np from tvm import relay from tvm.contrib import graph_executor from tvm.relay.testing.temp_op_attr import TempOpAttr # We use llvm target for testing functionality. `llvm` points to an older Intel # generation machine, that legalizes to a simple lowering. Therefore, the # legalization is overwritten such that it can be skipped and we use the # QNNCanonicalizeOps lowering for the testing. def legalize_qnn_dense(attrs, inputs, types): return None def make_requantize_params(input_scale, output_scale, output_zero_point, out_dtype): config = { "input_scale": input_scale, "output_scale": output_scale, "output_zero_point": output_zero_point, "out_dtype": out_dtype, } return config def make_configuration( quantized_data, quantized_kernel, dtype, input_shape, kernel_shape, input_zero_point, kernel_zero_point, input_scale, kernel_scale, units, output, out_dtype="int32", bias=None, requantize=None, ): if requantize is not None: assert bias is not None config = { "quantized_data": quantized_data, "quantized_kernel": quantized_kernel, "dtype": dtype, "input_shape": input_shape, "kernel_shape": kernel_shape, "input_zero_point": input_zero_point, "kernel_zero_point": kernel_zero_point, "input_scale": input_scale, "kernel_scale": kernel_scale, "units": units, "output": output, "out_dtype": out_dtype, "bias": bias, "requantize": requantize, } return config def make_int_configuration(use_bias=False, requantize_output=False, per_channel=False): input_shape, kernel_shape, output_shape = (2, 10), (3, 10), (2, 3) input_zero_point, kernel_zero_point = -1, -1 in_dtype = "int8" out_dtype = "int32" if not requantize_output else "int8" units = 3 quantized_data_np = ( np.array([1, 3, 5, 7, 9, 11, 13, 15, -19, -21, 1, 3, 5, 7, 9, 11, 13, -17, 17, -21]) .astype(in_dtype) .reshape(input_shape) ) quantized_kernel_np = ( np.array( [ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, ] ) .astype(in_dtype) .reshape(kernel_shape) ) input_scale = 0.5 kernel_scale = 0.5 output_scale = 1.0 bias = np.array([4, 8, 12]).astype(out_dtype).reshape((units,)) if use_bias else None if per_channel: assert use_bias and requantize_output kernel_scale = np.array([0.5, 0.3, 0.4], dtype=np.float32) output = np.array([23, 14, 20, 57, 34, 47]) elif requantize_output: assert use_bias output = np.array([23, 24, 25, 57, 58, 59]) elif use_bias: output = np.array([96, 100, 104, 232, 236, 240]) else: output = np.array([92, 92, 92, 228, 228, 228]) requant_params = ( make_requantize_params(input_scale * kernel_scale, output_scale, -1, "int8") if requantize_output else None ) output = output.astype(out_dtype).reshape(output_shape) return make_configuration( quantized_data=quantized_data_np, quantized_kernel=quantized_kernel_np, dtype=in_dtype, input_shape=input_shape, kernel_shape=kernel_shape, input_zero_point=input_zero_point, kernel_zero_point=kernel_zero_point, input_scale=input_scale, kernel_scale=kernel_scale, units=units, output=output, bias=bias, requantize=requant_params, ) def qnn_dense_driver(test_configuration): in_dtype = test_configuration["dtype"] out_dtype = test_configuration["out_dtype"] quantized_data_name = "quantized_data" quantized_kernel_name = "quantized_kernel" expected_out_dtype = test_configuration["out_dtype"] bias_name = "bias" quantized_data = relay.var( quantized_data_name, shape=test_configuration["input_shape"], dtype=in_dtype ) quantized_kernel = relay.var( quantized_kernel_name, shape=test_configuration["kernel_shape"], dtype=in_dtype ) mod = relay.qnn.op.dense( quantized_data, quantized_kernel, relay.const(test_configuration["input_zero_point"], "int32"), relay.const(test_configuration["kernel_zero_point"], "int32"), relay.const(test_configuration["input_scale"], "float32"), relay.const(test_configuration["kernel_scale"], "float32"), test_configuration["units"], ) if test_configuration[bias_name] is not None: bias = relay.var(bias_name, shape=test_configuration["bias"].shape, dtype=out_dtype) mod = relay.nn.bias_add(mod, bias) if test_configuration["requantize"] is not None: requantize_config = test_configuration["requantize"] mod = relay.qnn.op.requantize( mod, input_scale=relay.const(requantize_config["input_scale"], "float32"), input_zero_point=relay.const(0, "int32"), output_scale=relay.const(requantize_config["output_scale"], "float32"), output_zero_point=relay.const(requantize_config["output_zero_point"], "int32"), out_dtype=requantize_config["out_dtype"], ) expected_out_dtype = requantize_config["out_dtype"] mod = relay.Function(relay.analysis.free_vars(mod), mod) mod = tvm.IRModule.from_expr(mod) mod = relay.transform.InferType()(mod) mod = relay.qnn.transform.CanonicalizeOps()(mod) with tvm.transform.PassContext(opt_level=2): graph, lib, params = relay.build(mod, "llvm", params=None) mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) mod.set_input(quantized_data_name, test_configuration[quantized_data_name]) mod.set_input(quantized_kernel_name, test_configuration[quantized_kernel_name]) if test_configuration[bias_name] is not None: mod.set_input(bias_name, test_configuration[bias_name]) mod.set_input(**params) mod.run() res = mod.get_output(0).numpy() np.testing.assert_equal(res, test_configuration["output"]) assert res.dtype == expected_out_dtype def test_qnn_dense_without_bias(): with TempOpAttr("qnn.dense", "FTVMQnnLegalize", legalize_qnn_dense): int32_output_without_bias_params = make_int_configuration(use_bias=False) qnn_dense_driver(int32_output_without_bias_params) def test_qnn_dense_with_bias(): with TempOpAttr("qnn.dense", "FTVMQnnLegalize", legalize_qnn_dense): int32_output_with_bias_params = make_int_configuration(use_bias=True) qnn_dense_driver(int32_output_with_bias_params) def test_qnn_dense_with_requantized_output(): with TempOpAttr("qnn.dense", "FTVMQnnLegalize", legalize_qnn_dense): int8_requantized_output_with_bias_params = make_int_configuration( use_bias=True, requantize_output=True ) qnn_dense_driver(int8_requantized_output_with_bias_params) def test_per_channel_weight_scale(): with TempOpAttr("qnn.dense", "FTVMQnnLegalize", legalize_qnn_dense): config = make_int_configuration(use_bias=True, requantize_output=True, per_channel=True) qnn_dense_driver(config) if __name__ == "__main__": test_qnn_dense_without_bias() test_qnn_dense_with_bias() test_qnn_dense_with_requantized_output() test_per_channel_weight_scale()