/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file Use external nnpack library call.
 */
#include <dmlc/logging.h>
#include <nnpack.h>
#include <tvm/runtime/data_type.h>
#include <tvm/runtime/device_api.h>
#include <tvm/runtime/registry.h>

#include "nnpack_utils.h"

namespace tvm {
namespace contrib {
using namespace runtime;

TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference")
    .set_body([](TVMArgs args, TVMRetValue* ret) {
      NNPackThreadLocalEntry* entry = NNPackThreadLocalEntry::ThreadLocal();
      // Initialize NNPACK once per process.
      static std::once_flag flag;
      std::call_once(flag, []() { ICHECK_EQ(nnp_initialize(), nnp_status_success); });
      // Argument layout: input, kernel, bias (or null), output, pad (top, right, bottom, left),
      // stride (width, height), nthreads, algorithm.
      DLTensor* input = args[0];
      DLTensor* kernel = args[1];
      DLTensor* bias = nullptr;
      if (args[2].type_code() == kTVMDLTensorHandle) {
        bias = args[2];
      }
      DLTensor* output = args[3];
      uint64_t pad_top = args[4], pad_right = args[5], pad_bottom = args[6], pad_left = args[7];
      nnp_padding input_padding{pad_top, pad_right, pad_bottom, pad_left};
      uint64_t stride_width = args[8], stride_height = args[9];
      nnp_size stride_size{stride_width, stride_height};
      // Configure the NNPACK threadpool with the requested number of threads.
      NNPackConfig(args[10]);

      uint64_t algo_ = args[11];
      nnp_convolution_algorithm algo = static_cast<nnp_convolution_algorithm>(algo_);
      ICHECK_EQ(input->ndim, 4);
      ICHECK_EQ(kernel->ndim, 4);
      if (bias) {
        ICHECK_EQ(bias->ndim, 1);
      }
      ICHECK_EQ(output->ndim, 4);
      ICHECK_EQ(input->shape[1], kernel->shape[1]);
      ICHECK_EQ(input->shape[0], output->shape[0]);
      size_t input_channels = input->shape[1];
      ICHECK_EQ(output->shape[1], kernel->shape[0]);
      if (bias) {
        ICHECK_EQ(output->shape[1], bias->shape[0]);
      }
      size_t output_channels = output->shape[1];
      nnp_size input_size{static_cast<size_t>(input->shape[2]),
                          static_cast<size_t>(input->shape[3])};
      nnp_size kernel_size{static_cast<size_t>(kernel->shape[2]),
                           static_cast<size_t>(kernel->shape[3])};
      ICHECK(input->strides == nullptr);
      ICHECK(kernel->strides == nullptr);
      if (bias) {
        ICHECK(bias->strides == nullptr);
      }

      ICHECK(TypeMatch(input->dtype, kDLFloat, 32));
      ICHECK(TypeMatch(kernel->dtype, kDLFloat, 32));
      if (bias) {
        ICHECK(TypeMatch(bias->dtype, kDLFloat, 32));
      }
      ICHECK(TypeMatch(output->dtype, kDLFloat, 32));

      // Allocate a zero-bias if we don't pass one in.
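      // The convolution call below always passes a bias pointer, so substitute a
      // zero-filled vector (one entry per output channel) when the caller omits the bias.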
      std::unique_ptr<std::vector<float>> zero_bias;
      if (!bias) {
        zero_bias.reset(new std::vector<float>(output->shape[1], 0.0));
      }

      // First call only queries the required scratch workspace size (all data pointers null).
      size_t workspace_size = 0;
      nnp_status status = nnp_convolution_inference(
          algo, nnp_convolution_transform_strategy_compute, input_channels, output_channels,
          input_size, input_padding, kernel_size, stride_size, nullptr, nullptr, nullptr, nullptr,
          nullptr, &workspace_size, nnp_activation_identity, nullptr, entry->threadpool, nullptr);
      ICHECK_EQ(status, nnp_status_success);

      // Division with rounding up, in case size is not multiple of sizeof(float)
      const size_t workspace_elements = (workspace_size + sizeof(float) - 1) / sizeof(float);

      Device dev = input->device;
      DLDataType type_hint = input->dtype;

      DeviceAPI* cpu_api = DeviceAPI::Get(dev);
      void* workspace_buffer =
          cpu_api->AllocWorkspace(dev, workspace_elements * sizeof(float), type_hint);
      ICHECK(workspace_buffer != nullptr);

      // The inference API processes one image per call; loop over the batch dimension.
      for (auto n = 0; n < input->shape[0]; ++n) {
        nnp_status status = nnp_convolution_inference(
            algo, nnp_convolution_transform_strategy_compute, input_channels, output_channels,
            input_size, input_padding, kernel_size, stride_size,
            static_cast<float*>(input->data) +
                n * input->shape[1] * input->shape[2] * input->shape[3],
            static_cast<float*>(kernel->data),
            bias ? static_cast<float*>(bias->data) : zero_bias->data(),
            static_cast<float*>(output->data) +
                n * output->shape[1] * output->shape[2] * output->shape[3],
            workspace_buffer, &workspace_size, nnp_activation_identity, nullptr,
            entry->threadpool, nullptr);

        ICHECK_EQ(status, nnp_status_success);
      }
      cpu_api->FreeWorkspace(dev, workspace_buffer);
    });

TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_transform")
    .set_body([](TVMArgs args, TVMRetValue* ret) {
      NNPackThreadLocalEntry* entry = NNPackThreadLocalEntry::ThreadLocal();
      static std::once_flag flag;
      std::call_once(flag, []() { ICHECK_EQ(nnp_initialize(), nnp_status_success); });
      DLTensor* input = args[0];
      DLTensor* transformed_kernel = args[1];
      DLTensor* bias = nullptr;
      if (args[2].type_code() == kTVMDLTensorHandle) {
        bias = args[2];
      }
      DLTensor* output = args[3];
      uint64_t pad_top = args[4], pad_right = args[5], pad_bottom = args[6], pad_left = args[7];
      nnp_padding input_padding{pad_top, pad_right, pad_bottom, pad_left};
      uint64_t stride_width = args[8], stride_height = args[9];
      nnp_size stride_size{stride_width, stride_height};
      NNPackConfig(args[10]);

      uint64_t algo_ = args[11];
      nnp_convolution_algorithm algo = static_cast<nnp_convolution_algorithm>(algo_);
      ICHECK_EQ(input->ndim, 4);
      if (bias) {
        ICHECK_EQ(bias->ndim, 1);
      }
      ICHECK_EQ(output->ndim, 4);
      ICHECK_EQ(input->shape[0], output->shape[0]);
      size_t input_channels = input->shape[1];
      if (bias) {
        ICHECK_EQ(output->shape[1], bias->shape[0]);
      }
      size_t output_channels = output->shape[1];
      nnp_size input_size{static_cast<size_t>(input->shape[2]),
                          static_cast<size_t>(input->shape[3])};
      nnp_size kernel_size{3, 3};
      ICHECK(input->strides == nullptr);
      ICHECK(transformed_kernel->strides == nullptr);
      if (bias) {
        ICHECK(bias->strides == nullptr);
      }

      ICHECK(TypeMatch(input->dtype, kDLFloat, 32));
      ICHECK(TypeMatch(transformed_kernel->dtype, kDLFloat, 32));
      if (bias) {
        ICHECK(TypeMatch(bias->dtype, kDLFloat, 32));
      }
      ICHECK(TypeMatch(output->dtype, kDLFloat, 32));
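      // The kernel argument is expected to already be in NNPACK's pre-transformed layout
      // (see convolution_inference_weight_transform below), so the calls in this body use
      // nnp_convolution_transform_strategy_reuse instead of recomputing the transform.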
      // Allocate a zero-bias if we don't pass one in.
      std::unique_ptr<std::vector<float>> zero_bias;
      if (!bias) {
        zero_bias.reset(new std::vector<float>(output->shape[1], 0.0));
      }

      size_t workspace_size = 0;
      nnp_status status = nnp_convolution_inference(
          algo, nnp_convolution_transform_strategy_reuse, input_channels, output_channels,
          input_size, input_padding, kernel_size, stride_size, nullptr, nullptr, nullptr, nullptr,
          nullptr, &workspace_size, nnp_activation_identity, nullptr, entry->threadpool, nullptr);
      ICHECK_EQ(status, nnp_status_success);

      // Division with rounding up, in case size is not multiple of sizeof(float)
      const size_t workspace_elements = (workspace_size + sizeof(float) - 1) / sizeof(float);

      Device dev = input->device;
      DLDataType type_hint = input->dtype;

      DeviceAPI* cpu_api = DeviceAPI::Get(dev);
      void* workspace_buffer =
          cpu_api->AllocWorkspace(dev, workspace_elements * sizeof(float), type_hint);
      ICHECK(workspace_buffer != nullptr);

      for (auto n = 0; n < input->shape[0]; ++n) {
        nnp_status status = nnp_convolution_inference(
            algo, nnp_convolution_transform_strategy_reuse, input_channels, output_channels,
            input_size, input_padding, kernel_size, stride_size,
            static_cast<float*>(input->data) +
                n * input->shape[1] * input->shape[2] * input->shape[3],
            static_cast<float*>(transformed_kernel->data),
            bias ? static_cast<float*>(bias->data) : zero_bias->data(),
            static_cast<float*>(output->data) +
                n * output->shape[1] * output->shape[2] * output->shape[3],
            workspace_buffer, &workspace_size, nnp_activation_identity, nullptr,
            entry->threadpool, nullptr);
        ICHECK_EQ(status, nnp_status_success);
      }
      cpu_api->FreeWorkspace(dev, workspace_buffer);
    });

TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_weight_transform")
    .set_body([](TVMArgs args, TVMRetValue* ret) {
      NNPackThreadLocalEntry* entry = NNPackThreadLocalEntry::ThreadLocal();
      static std::once_flag flag;
      std::call_once(flag, []() { ICHECK_EQ(nnp_initialize(), nnp_status_success); });
      DLTensor* kernel = args[0];
      DLTensor* transformed_kernel = args[1];
      // Dummy sizes: only the kernel transform is computed here, so the input geometry,
      // padding, and stride are placeholders.
      nnp_padding input_padding{1, 1, 1, 1};
      nnp_size stride_size{1, 1};

      nnp_size input_size{100, 100};

      NNPackConfig(args[2]);

      uint64_t algo_ = args[3];
      nnp_convolution_algorithm algo = static_cast<nnp_convolution_algorithm>(algo_);
      ICHECK_EQ(kernel->ndim, 4);
      size_t input_channels = kernel->shape[1];
      size_t output_channels = kernel->shape[0];
      ICHECK_EQ(kernel->shape[2], 3);
      ICHECK_EQ(kernel->shape[3], 3);
      nnp_size kernel_size{static_cast<size_t>(kernel->shape[2]),
                           static_cast<size_t>(kernel->shape[3])};
      ICHECK(kernel->strides == nullptr);
      ICHECK(TypeMatch(kernel->dtype, kDLFloat, 32));

      // First call queries the size of the transformed kernel; the second call writes the
      // transform into the transformed_kernel buffer.
      size_t transformed_kernel_size = 0;
      nnp_status status;
      status = nnp_convolution_inference(
          algo, nnp_convolution_transform_strategy_precompute, input_channels, output_channels,
          input_size, input_padding, kernel_size, stride_size, nullptr, nullptr, nullptr, nullptr,
          nullptr, &transformed_kernel_size, nnp_activation_identity, nullptr, entry->threadpool,
          nullptr);
      ICHECK_EQ(status, nnp_status_success);

      ICHECK_LE(transformed_kernel_size, GetDataSize(*transformed_kernel));

      status = nnp_convolution_inference(
          algo, nnp_convolution_transform_strategy_precompute, input_channels, output_channels,
          input_size, input_padding, kernel_size, stride_size, nullptr,
          static_cast<float*>(kernel->data), nullptr, nullptr,
          static_cast<void*>(transformed_kernel->data), &transformed_kernel_size,
          nnp_activation_identity, nullptr, entry->threadpool, nullptr);
      ICHECK_EQ(status, nnp_status_success);
    });
}  // namespace contrib
}  // namespace tvm
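// Usage sketch (illustrative, not part of this file): the packed functions registered above
// can be looked up through the TVM global registry, e.g. from Python:
//   f = tvm.get_global_func("tvm.contrib.nnpack.convolution_inference")
//   f(data, kernel, bias, out, pad_top, pad_right, pad_bottom, pad_left,
//     stride_width, stride_height, nthreads, algorithm)
// where the tensors are float32 NCHW DLTensors satisfying the checks in the handler.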