/******************************************************************************
 *
 * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

#pragma once

#include "Exceptions.h"

#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include "ATen/cuda/ATenCUDAGeneral.h"
#include <cuda.h>

#if !defined(TORCH_CUDA_API) && defined(TORCH_CUDA_CU_API)
#define TORCH_CUDA_API TORCH_CUDA_CU_API
#endif

namespace at { namespace native { namespace nhwc {

// TODO: Add constructors for all of the descriptors

// Size, in bytes, of one element of the given cuDNN data type
// (half = 2, float = 4, double = 8).
inline int dataSize(cudnnDataType_t dataType) {
  switch (dataType) {
    case CUDNN_DATA_HALF:  return 2;
    case CUDNN_DATA_FLOAT: return 4;
    default:               return 8;
  }
}

// The stride for a size-1 dimension is not uniquely determined; in
// fact, it can be anything you want, because the fact that the
// tensor is size 1 at this dimension means that you will never actually
// try advancing your pointer by this stride.
//
// However, CuDNN has a much more stringent requirement on strides:
// if you are passing a contiguous input, it better be the case
// that the stride for dim i is the product of the sizes of dims
// i+1 to the end. This stride is indeed uniquely determined. This
// function modifies 'stride' in place so this invariant holds.
static inline void fixSizeOneDimStride(int dim, const int *size, int *stride) {
  int64_t z = 1;
  for (int d = dim - 1; d >= 0; d--) {
    if (size[d] == 1) {
      stride[d] = z;
    } else {
      z *= size[d];
    }
  }
}
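// A worked example of the invariant above (hypothetical sizes, not taken
// from any particular caller): for a 3-d tensor of sizes {2, 1, 3}, any
// stride is legal for the middle dimension as far as pointer arithmetic
// goes, but CuDNN insists on the contiguous value:
//
//   int size[]   = {2, 1, 3};
//   int stride[] = {3, 7, 1};              // stride[1] == 7 is arbitrary
//   fixSizeOneDimStride(3, size, stride);  // stride is now {3, 3, 1},
//                                          // i.e. stride[1] == size[2]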

template <typename T, cudnnStatus_t (*dtor)(T*)>
struct DescriptorDeleter {
  void operator()(T* x) {
    if (x != nullptr) {
      CUDNN_CHECK(dtor(x));
    }
  }
};

// A generic class for wrapping cuDNN descriptor types. All you need
// is to give the underlying type the Descriptor_t points to (usually,
// if it's cudnnTensorDescriptor_t it points to cudnnTensorStruct),
// the constructor and the destructor. Subclasses are responsible
// for defining a set() function to actually set the descriptor.
//
// Descriptors default construct to a nullptr, and have a descriptor
// initialized the first time you call set() or any other initializing
// function.
template <typename T, cudnnStatus_t (*ctor)(T**), cudnnStatus_t (*dtor)(T*)>
class TORCH_CUDA_API Descriptor {
 public:
  // TODO: Figure out why const-correctness doesn't work here

  // Use desc() to access the underlying descriptor pointer in
  // a read-only fashion. Most client code should use this.
  // If the descriptor was never initialized, this will return
  // nullptr.
  T* desc() const { return desc_.get(); }
  T* desc() { return desc_.get(); }

  // Use mut_desc() to access the underlying descriptor pointer
  // if you intend to modify what it points to (e.g., using
  // cudnnSetFooDescriptor). This will ensure that the descriptor
  // is initialized. Code in this file will use this function.
  T* mut_desc() { init(); return desc_.get(); }

 protected:
  void init() {
    if (desc_ == nullptr) {
      T* raw_desc;
      CUDNN_CHECK(ctor(&raw_desc));
      desc_.reset(raw_desc);
    }
  }

 private:
  std::unique_ptr<T, DescriptorDeleter<T, dtor>> desc_;
};

class TORCH_CUDA_API TensorDescriptor
    : public Descriptor<cudnnTensorStruct,
                        &cudnnCreateTensorDescriptor,
                        &cudnnDestroyTensorDescriptor> {
 public:
  TensorDescriptor() {}
  explicit TensorDescriptor(const at::Tensor& t, size_t pad = 0) {
    set(t, pad);
  }

  // Note [CuDNN broadcast padding]
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // pad specifies the minimum dimensionality of the tensor descriptor
  // we produce (it doesn't have anything to do with, e.g., convolution
  // padding). If 't' is lower-dimensional than 'pad', the remaining
  // dimensions (on the right) are padded with ones. This doesn't
  // affect the underlying data layout. This is particularly useful for
  // dealing with a peculiarity of the CuDNN API: broadcasting in CuDNN
  // is done in two steps. First, the client code is expected to pad out
  // (the dimensions of) input tensors to the same dimensionality as the
  // broadcast target; second, CuDNN takes care of actually broadcasting
  // the size-1 dimensions.
  void set(const at::Tensor& t, size_t pad = 0);
  void set(cudnnDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad = 0);

  void print();

 private:
  void set(cudnnDataType_t dataType, int dim, int* size, int* stride) {
    // Reorder the {N, H, W, C} sizes into the {N, C, H, W} order that
    // cudnnSetTensorNdDescriptorEx expects regardless of layout.
    int cudnn_dims[] = {size[0], size[3], size[1], size[2]};
    fixSizeOneDimStride(dim, size, stride);
    // modified as we're hacking in {N, H, W, C} ordering where we shouldn't be
    CUDNN_CHECK(cudnnSetTensorNdDescriptorEx(mut_desc(), CUDNN_TENSOR_NHWC, dataType, dim, cudnn_dims));
  }
};

std::ostream& operator<<(std::ostream& out, const TensorDescriptor& d);

class FilterDescriptor
    : public Descriptor<cudnnFilterStruct,
                        &cudnnCreateFilterDescriptor,
                        &cudnnDestroyFilterDescriptor> {
 public:
  void set(const at::Tensor& t, int64_t pad = 0);

 private:
  void set(cudnnDataType_t dataType, int dim, int* size) {
    // Same {N, H, W, C} -> {N, C, H, W} reordering as TensorDescriptor.
    int cudnn_size[] = {size[0], size[3], size[1], size[2]};
    CUDNN_CHECK(cudnnSetFilterNdDescriptor(mut_desc(), dataType, CUDNN_TENSOR_NHWC, dim, cudnn_size));
  }
};

struct TORCH_CUDA_API ConvolutionDescriptor
    : public Descriptor<cudnnConvolutionStruct,
                        &cudnnCreateConvolutionDescriptor,
                        &cudnnDestroyConvolutionDescriptor> {
  void set(cudnnDataType_t dataType, int dim, int* pad, int* stride,
           int* upscale /* aka dilation */, int groups) {
    // Half-precision convolutions still accumulate in float.
    cudnnDataType_t mathType = dataType;
    if (dataType == CUDNN_DATA_HALF) mathType = CUDNN_DATA_FLOAT;
    CUDNN_CHECK(cudnnSetConvolutionNdDescriptor(mut_desc(), dim, pad, stride, upscale,
                                                CUDNN_CROSS_CORRELATION, mathType));
#if CUDNN_VERSION >= 7000
    CUDNN_CHECK(cudnnSetConvolutionGroupCount(mut_desc(), groups));
    // Opt into Tensor Core math for half-precision data.
    CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH));
    if (dataType == CUDNN_DATA_HALF)
      CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH));
#endif
  }
};

// Scaling factors (e.g., alpha/beta) must be passed to CuDNN as float for
// half/float data and as double for double data; this union holds either.
union Constant {
  float f;
  double d;
  Constant(cudnnDataType_t dataType, double value) {
    if (dataType == CUDNN_DATA_HALF || dataType == CUDNN_DATA_FLOAT) {
      f = (float)value;
    } else {
      d = value;
    }
  }
};

}}} // namespace at::native::nhwc
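// A minimal usage sketch for the nhwc descriptors above (hypothetical
// tensors and parameters; the at::Tensor overloads of set() are declared
// here but defined elsewhere):
//
//   at::native::nhwc::TensorDescriptor idesc;
//   idesc.set(input);                        // input laid out as {N, H, W, C}
//   at::native::nhwc::FilterDescriptor wdesc;
//   wdesc.set(weight);
//   at::native::nhwc::ConvolutionDescriptor cdesc;
//   int pad[2] = {1, 1}, conv_stride[2] = {1, 1}, dilation[2] = {1, 1};
//   cdesc.set(CUDNN_DATA_HALF, /*dim=*/2, pad, conv_stride, dilation, /*groups=*/1);
//   at::native::nhwc::Constant one(CUDNN_DATA_HALF, 1.0);  // alpha scaling factor,
//                                                          // read by CuDNN as float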

namespace at { namespace native { namespace nchw {

// Same fix as the nhwc variant above: give size-1 dimensions the
// contiguous stride that CuDNN expects.
static inline void fixSizeOneDimStride(int dim, const int *size, int *stride) {
  int64_t z = 1;
  for (int d = dim - 1; d >= 0; d--) {
    if (size[d] == 1) {
      stride[d] = z;
    } else {
      z *= size[d];
    }
  }
}

template <typename T, cudnnStatus_t (*dtor)(T*)>
struct DescriptorDeleter {
  void operator()(T* x) {
    if (x != nullptr) {
      CUDNN_CHECK(dtor(x));
    }
  }
};

template <typename T, cudnnStatus_t (*ctor)(T**), cudnnStatus_t (*dtor)(T*)>
class TORCH_CUDA_API Descriptor {
 public:
  // TODO: Figure out why const-correctness doesn't work here

  // Use desc() to access the underlying descriptor pointer in
  // a read-only fashion. Most client code should use this.
  // If the descriptor was never initialized, this will return
  // nullptr.
  T* desc() const { return desc_.get(); }
  T* desc() { return desc_.get(); }

  // Use mut_desc() to access the underlying descriptor pointer
  // if you intend to modify what it points to (e.g., using
  // cudnnSetFooDescriptor). This will ensure that the descriptor
  // is initialized. Code in this file will use this function.
  T* mut_desc() { init(); return desc_.get(); }

 protected:
  void init() {
    if (desc_ == nullptr) {
      T* raw_desc;
      CUDNN_CHECK(ctor(&raw_desc));
      desc_.reset(raw_desc);
    }
  }

 private:
  std::unique_ptr<T, DescriptorDeleter<T, dtor>> desc_;
};

class TORCH_CUDA_API TensorDescriptor
    : public Descriptor<cudnnTensorStruct,
                        &cudnnCreateTensorDescriptor,
                        &cudnnDestroyTensorDescriptor> {
 public:
  TensorDescriptor() {}
  explicit TensorDescriptor(const at::Tensor& t, size_t pad = 0) {
    set(t, pad);
  }

  // Note [CuDNN broadcast padding]
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // pad specifies the minimum dimensionality of the tensor descriptor
  // we produce (it doesn't have anything to do with, e.g., convolution
  // padding). If 't' is lower-dimensional than 'pad', the remaining
  // dimensions (on the right) are padded with ones. This doesn't
  // affect the underlying data layout. This is particularly useful for
  // dealing with a peculiarity of the CuDNN API: broadcasting in CuDNN
  // is done in two steps. First, the client code is expected to pad out
  // (the dimensions of) input tensors to the same dimensionality as the
  // broadcast target; second, CuDNN takes care of actually broadcasting
  // the size-1 dimensions.
  void set(const at::Tensor& t, size_t pad = 0);
  void set(cudnnDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad = 0);

  void print();

 private:
  void set(cudnnDataType_t dataType, int dim, int* size, int* stride) {
    // Sizes are already in {N, C, H, W} order here, so no reordering is
    // needed (unlike the nhwc variant above).
    int cudnn_dims[] = {size[0], size[1], size[2], size[3]};
    fixSizeOneDimStride(dim, size, stride);
    CUDNN_CHECK(cudnnSetTensorNdDescriptorEx(mut_desc(), CUDNN_TENSOR_NCHW, dataType, dim, cudnn_dims));
  }
};

}}} // namespace at::native::nchw
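// A sketch of how the generic Descriptor wrapper extends to other CuDNN
// descriptor types (hypothetical example, not used by this file): the
// template only needs the underlying struct plus its create/destroy pair,
// and the subclass supplies a set() that fills the descriptor in.
//
//   struct ActivationDescriptor
//       : public Descriptor<cudnnActivationStruct,
//                           &cudnnCreateActivationDescriptor,
//                           &cudnnDestroyActivationDescriptor> {
//     void set(cudnnActivationMode_t mode) {
//       CUDNN_CHECK(cudnnSetActivationDescriptor(
//           mut_desc(), mode, CUDNN_PROPAGATE_NAN, /*coef=*/0.0));
//     }
//   };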