# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import sys

import pytest

import tvm
from tvm import te
from tvm.contrib import cudnn
from tvm.contrib.nvcc import have_fp16
import numpy as np

import tvm.topi.testing
import tvm.testing


requires_cudnn = pytest.mark.skipif(
    tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape_from_cudnn", True) is None,
    reason="CuDNN is not enabled",
)


def verify_conv2d(data_dtype, conv_dtype, tensor_format=0, groups=1):
    in_channel = 4
    out_channel = 16
    filter_h = 3
    filter_w = 3
    pad_h = 1
    pad_w = 1
    stride_h = 1
    stride_w = 1
    dilation_h = 1
    dilation_w = 1
    batch = 3
    height = 32
    width = 32

    if data_dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
        print("Skip because gpu does not have fp16 support")
        return

    # schedule
    if tensor_format == 0:
        xshape = [batch, in_channel, height, width]
        wshape = [out_channel, in_channel // groups, filter_h, filter_w]
    else:
        xshape = [batch, height, width, in_channel]
        wshape = [out_channel, filter_h, filter_w, in_channel // groups]

    X = te.placeholder(xshape, name="X", dtype=data_dtype)
    W = te.placeholder(wshape, name="W", dtype=data_dtype)
    Y = cudnn.conv_forward(
        X,
        W,
        [pad_h, pad_w],
        [stride_h, stride_w],
        [dilation_h, dilation_w],
        conv_mode=1,
        tensor_format=tensor_format,
        conv_dtype=conv_dtype,
        algo=-1,
        groups=groups,
    )
    yshape = [x.value for x in Y.shape]
    s = te.create_schedule(Y.op)

    # validation
    dev = tvm.cuda(0)
    f = tvm.build(s, [X, W, Y], "cuda --host=llvm", name="conv2d")
    x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype)
    w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
    y_np = np.zeros(yshape).astype(data_dtype)
    x = tvm.nd.array(x_np, dev)
    w = tvm.nd.array(w_np, dev)
    y = tvm.nd.array(y_np, dev)

    if tensor_format == 0:
        c_np = tvm.topi.testing.conv2d_nchw_python(x_np, w_np, 1, 1, groups=groups)
    elif tensor_format == 1:
        wt = w_np.transpose((1, 2, 3, 0))  # OHWI => HWIO
        c_np = tvm.topi.testing.conv2d_nhwc_python(x_np, wt, 1, 1, groups=groups)

    f(x, w, y)
    tvm.testing.assert_allclose(y.numpy(), c_np, atol=1e-2, rtol=1e-2)


@tvm.testing.requires_gpu
@requires_cudnn
def test_conv2d():
    verify_conv2d("float32", "float32", tensor_format=0)
    verify_conv2d("float16", "float32", tensor_format=1)
    verify_conv2d("float16", "float16", tensor_format=0)
    verify_conv2d("float16", "float16", tensor_format=1)
    verify_conv2d("int8", "int32", tensor_format=1)

    verify_conv2d("float32", "float32", tensor_format=0, groups=2)
    verify_conv2d("float16", "float32", tensor_format=1, groups=2)
    verify_conv2d("float16", "float16", tensor_format=0, groups=2)
    verify_conv2d("int8", "int32", tensor_format=1, groups=2)


def verify_conv3d(data_dtype, conv_dtype, tensor_format=0, groups=1):
    in_channel = 4
    out_channel = 16
    filter_d = 3
    filter_h = 3
    filter_w = 3
    pad_d = 1
    pad_h = 1
    pad_w = 1
    stride_d = 1
    stride_h = 1
    stride_w = 1
    dilation_d = 1
    dilation_h = 1
    dilation_w = 1
    batch = 3
    depth = 32
    height = 32
    width = 32

    # schedule
    xshape = [batch, in_channel, depth, height, width]
    wshape = [out_channel, in_channel // groups, filter_d, filter_h, filter_w]

    X = te.placeholder(xshape, name="X", dtype=data_dtype)
    W = te.placeholder(wshape, name="W", dtype=data_dtype)
    Y = cudnn.conv_forward(
        X,
        W,
        [pad_d, pad_h, pad_w],
        [stride_d, stride_h, stride_w],
        [dilation_d, dilation_h, dilation_w],
        conv_mode=1,
        tensor_format=tensor_format,
        algo=-1,
        conv_dtype=conv_dtype,
        groups=groups,
    )
    yshape = [x.value for x in Y.shape]
    s = te.create_schedule(Y.op)

    # validation
    dev = tvm.cuda(0)
    f = tvm.build(s, [X, W, Y], target="cuda --host=llvm", name="conv3d")
    x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype)
    w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
    y_np = np.zeros(yshape).astype(data_dtype)
    x = tvm.nd.array(x_np, dev)
    w = tvm.nd.array(w_np, dev)
    y = tvm.nd.array(y_np, dev)

    if tensor_format == 0:
        c_np = tvm.topi.testing.conv3d_ncdhw_python(x_np, w_np, 1, 1, groups)
    else:
        raise AssertionError("For now, conv3d only supports tensor_format 0 (NCDHW)")

    f(x, w, y)
    tvm.testing.assert_allclose(y.numpy(), c_np, atol=3e-5, rtol=1e-4)


@tvm.testing.requires_gpu
@requires_cudnn
def test_conv3d():
    verify_conv3d("float32", "float32", tensor_format=0)
    verify_conv3d("float32", "float32", tensor_format=0, groups=2)


def verify_softmax(shape, axis, dtype="float32", log_softmax=False):
    cudnn_op = cudnn.log_softmax if log_softmax else cudnn.softmax
    testing_op = (
        tvm.topi.testing.log_softmax_python if log_softmax else tvm.topi.testing.softmax_python
    )

    A = te.placeholder(shape, dtype=dtype, name="A")
    B = cudnn_op(A, axis)
    s = te.create_schedule([B.op])

    dev = tvm.cuda(0)
    a_np = np.random.uniform(size=shape).astype(dtype)
    b_np = testing_op(a_np)
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(b_np, dev)
    f = tvm.build(s, [A, B], target="cuda --host=llvm", name="softmax")
    f(a, b)
    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3)


def verify_softmax_4d(shape, dtype="float32", log_softmax=False):
    cudnn_op = cudnn.log_softmax if log_softmax else cudnn.softmax
    testing_op = (
        tvm.topi.testing.log_softmax_python if log_softmax else tvm.topi.testing.softmax_python
    )

    A = te.placeholder(shape, dtype=dtype, name="A")
    B = cudnn_op(A, axis=1)
    s = te.create_schedule([B.op])

    dev = tvm.cuda(0)
    n, c, h, w = shape
    a_np = np.random.uniform(size=shape).astype(dtype)
    b_np = testing_op(a_np.transpose(0, 2, 3, 1).reshape(h * w, c))
    b_np = b_np.reshape(n, h, w, c).transpose(0, 3, 1, 2)
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(b_np, dev)
    f = tvm.build(s, [A, B], target="cuda --host=llvm", name="softmax")
    f(a, b)
    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3)


@tvm.testing.requires_gpu
@requires_cudnn
def test_softmax():
    verify_softmax((32, 10), -1)
    verify_softmax((3, 4), -1)
    verify_softmax((1, 5), -1, "float64")
    verify_softmax_4d((1, 16, 256, 256))
    verify_softmax_4d((1, 16, 256, 256), "float64")

    verify_softmax((32, 10), -1, log_softmax=True)
    verify_softmax((3, 4), -1, log_softmax=True)
    verify_softmax((1, 5), -1, "float64", log_softmax=True)
    verify_softmax_4d((1, 16, 256, 256), log_softmax=True)
    verify_softmax_4d((1, 16, 256, 256), "float64", log_softmax=True)


test_kwargs_default_2d = {
    "tensor_format": 0,
    "pad": [1, 1],
    "stride": [1, 1],
    "dilation": [1, 1],
    "x_shape": [16, 4, 32, 32],
    "w_shape": [8, 4, 3, 3],
    "groups": 1,
    "conv_dtype": "float32",
    "data_dtype": "float32",
}
test_kwargs_default_3d = {
    "tensor_format": 0,
    "pad": [1, 1, 1],
    "stride": [1, 1, 1],
    "dilation": [1, 1, 1],
    "x_shape": [16, 4, 32, 32, 32],
    "w_shape": [8, 4, 3, 3, 3],
    "groups": 1,
    "conv_dtype": "float32",
    "data_dtype": "float32",
}

conv_output_shape_conditions = {
    "2d_small": test_kwargs_default_2d,
    "2d_large": {
        **test_kwargs_default_2d,
        "x_shape": [16, 32, 512, 1024],
        "w_shape": [8, 32, 5, 5],
    },
    "2d_pad": {**test_kwargs_default_2d, "pad": [2, 3]},
    "2d_stride": {**test_kwargs_default_2d, "stride": [2, 3]},
    "2d_dilation": {**test_kwargs_default_2d, "dilation": [2, 3]},
    "2d_groups": {**test_kwargs_default_2d, "groups": 4, "w_shape": [8, 1, 3, 3]},
    "2d_NHWC": {
        **test_kwargs_default_2d,
        "tensor_format": 1,
        "x_shape": [16, 32, 32, 4],
        "w_shape": [8, 3, 3, 4],
    },
    "2d_NCHW_VECT_C": {
        **test_kwargs_default_2d,
        "tensor_format": 2,
        "w_shape": [8, 16, 3, 3],
        "data_dtype": "int8x4",
    },
    "3d_small": test_kwargs_default_3d,
    "3d_large": {
        **test_kwargs_default_3d,
        "x_shape": [16, 32, 64, 128, 256],
        "w_shape": [8, 32, 5, 5, 5],
    },
    "3d_pad": {**test_kwargs_default_3d, "pad": [2, 3, 4]},
    "3d_stride": {**test_kwargs_default_3d, "stride": [2, 3, 4]},
    "3d_dilation": {**test_kwargs_default_3d, "dilation": [2, 3, 4]},
    "3d_groups": {**test_kwargs_default_3d, "groups": 4, "w_shape": [8, 1, 3, 3, 3]},
    "3d_NCHW_VECT_C": {
        **test_kwargs_default_3d,
        "tensor_format": 2,
        "w_shape": [8, 16, 3, 3, 3],
        "data_dtype": "int8x4",
    },
}


@pytest.fixture(
    params=[pytest.param(kwargs, id=name) for name, kwargs in conv_output_shape_conditions.items()]
)
def conv_output_shape_kwargs(request):
    return request.param


@tvm.testing.requires_gpu
@requires_cudnn
def test_conv_output_shape(conv_output_shape_kwargs):
    shape_from_cudnn = cudnn._conv_output_shape_from_cudnn(**conv_output_shape_kwargs)
    shape_from_python = cudnn.conv_output_shape(**conv_output_shape_kwargs)

    assert shape_from_cudnn == shape_from_python


if __name__ == "__main__":
    sys.exit(pytest.main(sys.argv))