/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file opencl_device_api.cc
 */
#include <dmlc/thread_local.h>
#include <tvm/runtime/registry.h>

#include "opencl_common.h"

namespace tvm {
namespace runtime {
namespace cl {

std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name);
std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name);

struct ImageInfo {
  size_t origin[3] = {};
  size_t region[3] = {};
  size_t row_pitch = 0;
  size_t slice_pitch = 0;
};

/*!
 * \brief Utility to apply a memory layout specific lowering convention
 * to infer the physical shape from the provided DLTensor's logical shape.
 * \param desc Descriptor which contains the buffer and layout tag.
 * \param tensor The DLTensor used to infer the tensor's physical shape.
 */
ImageInfo GetImageInfo(const cl::BufferDescriptor* desc, const DLTensor* tensor) {
  ImageInfo info{};
  ICHECK(tensor->dtype.lanes == 1) << "Image dtype has lanes: " << tensor->dtype.lanes;

  info.origin[0] = info.origin[1] = info.origin[2] = 0;
  info.row_pitch = 0;
  info.slice_pitch = 0;

  size_t axis = DefaultTextureLayoutSeparator(
      tensor->ndim, cl::BufferDescriptor::ScopeFromMemoryLayout(desc->layout));
  auto texture_shape = ApplyTexture2DFlattening<int64_t>(tensor->shape, tensor->ndim, axis);
  info.region[0] = texture_shape.width;
  info.region[1] = texture_shape.height;
  info.region[2] = 1;
  return info;
}
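// Informal example (sketch, not part of the runtime API): with the default
// "global.texture" activation scope, a rank-5 logical shape such as
// [N, C, H, W, c] is split by DefaultTextureLayoutSeparator near the end of the
// shape, so ApplyTexture2DFlattening packs the last axis into the RGBA channels
// and collapses the remaining axes into the 2D image height and width (roughly
// [N*C*H, W] with c in the channels); GetImageInfo then reports that
// width/height pair as the copy region for clEnqueueRead/WriteImage.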
cl::BufferDescriptor::MemoryLayout cl::BufferDescriptor::MemoryLayoutFromScope(
    Optional<String> mem_scope) {
  if (!mem_scope.defined()) {
    return cl::BufferDescriptor::MemoryLayout::kBuffer1D;
  } else if (mem_scope.value() == "global.texture") {
    return cl::BufferDescriptor::MemoryLayout::kImage2DActivation;
  } else if (mem_scope.value() == "global.texture-weight") {
    return cl::BufferDescriptor::MemoryLayout::kImage2DWeight;
  }
  LOG(FATAL) << "No memory layout defined for memory of scope: " << mem_scope.value();
  return cl::BufferDescriptor::MemoryLayout::kBuffer1D;
}

String cl::BufferDescriptor::ScopeFromMemoryLayout(cl::BufferDescriptor::MemoryLayout layout) {
  switch (layout) {
    case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
      return "global";
    case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
      return "global.texture";
    case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
      return "global.texture-weight";
  }
  LOG(FATAL) << "No scope corresponding to the provided memory layout: "
             << static_cast<int>(layout);
  return "";
}

OpenCLThreadEntry* OpenCLWorkspace::GetThreadEntry() { return OpenCLThreadEntry::ThreadLocal(); }

OpenCLWorkspace* OpenCLWorkspace::Global() {
  static OpenCLWorkspace* inst = new OpenCLWorkspace();
  return inst;
}

void OpenCLWorkspace::SetDevice(Device dev) { GetThreadEntry()->device.device_id = dev.device_id; }

void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) {
  this->Init();
  size_t index = static_cast<size_t>(dev.device_id);
  if (kind == kExist) {
    *rv = static_cast<int>(index < devices.size());
    return;
  }
  ICHECK_LT(index, devices.size()) << "Invalid device id " << index;
  switch (kind) {
    case kExist:
      break;
    case kMaxThreadsPerBlock: {
      size_t value;
      OPENCL_CALL(clGetDeviceInfo(devices[index], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),
                                  &value, nullptr));
      *rv = static_cast<int64_t>(value);
      break;
    }
    case kWarpSize: {
      /* TODO: the warp size of an OpenCL device is not always 1.
         For example, Intel Graphics has a sub-group concept covering 8-32 work items,
         corresponding to the number of SIMD entries the hardware configures.
         We need to figure out a way to query this information from the hardware. */
      *rv = 1;
      break;
    }
    case kMaxSharedMemoryPerBlock: {
      cl_ulong value;
      OPENCL_CALL(clGetDeviceInfo(devices[index], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong),
                                  &value, nullptr));
      *rv = static_cast<int64_t>(value);
      break;
    }
    case kComputeVersion: {
      // String returned is "OpenCL $MAJOR.$MINOR $VENDOR_INFO".  To
      // match other implementations, we want to return "$MAJOR.$MINOR"
      std::string ret = GetDeviceInfo(devices[index], CL_DEVICE_VERSION);

      const size_t version_start = 7;  // Length of initial "OpenCL " prefix to skip
      const size_t version_end = ret.find(' ', version_start);
      *rv = ret.substr(version_start, version_end - version_start);
      break;
    }
    case kDeviceName:
      *rv = GetDeviceInfo(devices[index], CL_DEVICE_NAME);
      break;
    case kMaxClockRate: {
      cl_uint value;
      OPENCL_CALL(clGetDeviceInfo(devices[index], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint),
                                  &value, nullptr));
      // OpenCL returns the clock rate in MHz, while CUDA/ROCm return the
      // clock rate in kHz.  Converting to the same units for each.
      *rv = static_cast<int32_t>(value * 1000);
      break;
    }
    case kMultiProcessorCount: {
      cl_uint value;
      OPENCL_CALL(clGetDeviceInfo(devices[index], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
                                  &value, nullptr));
      *rv = static_cast<int32_t>(value);
      break;
    }
    case kMaxThreadDimensions: {
      size_t dims[3];
      OPENCL_CALL(clGetDeviceInfo(devices[index], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(dims), dims,
                                  nullptr));
      std::stringstream ss;  // use json string to return multiple int values;
      ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]";
      *rv = ss.str();
      break;
    }
    case kMaxRegistersPerBlock:
      return;
    case kGcnArch:
      return;
    case kApiVersion: {
      *rv = CL_TARGET_OPENCL_VERSION;
      break;
    }
    case kDriverVersion: {
      char value[128] = {0};
      OPENCL_CALL(
          clGetDeviceInfo(devices[index], CL_DRIVER_VERSION, sizeof(value) - 1, value, nullptr));
      *rv = std::string(value);
      break;
    }
  }
}
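// Usage sketch (assumes a Device `dev` with device_type == kDLOpenCL): device
// attributes are normally read through the generic DeviceAPI interface, e.g.
//   TVMRetValue rv;
//   DeviceAPI::Get(dev)->GetAttr(dev, kMaxThreadsPerBlock, &rv);
//   int64_t max_threads = rv;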
void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment,
                                      DLDataType type_hint) {
  this->Init();
  ICHECK(context != nullptr) << "No OpenCL device";
  cl_int err_code;
  cl::BufferDescriptor* desc = new cl::BufferDescriptor;
  // CL_INVALID_BUFFER_SIZE if size is 0.
  if (size == 0) {
    size = 1;
  }
  desc->buffer = clCreateBuffer(this->context, CL_MEM_READ_WRITE, size, nullptr, &err_code);
  desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D;
  OPENCL_CHECK_ERROR(err_code);
  return desc;
}

void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
                                      Optional<String> mem_scope) {
  if (!mem_scope.defined() || mem_scope.value() == "global") {
    return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope);
  }
  ICHECK(IsTextureStorage(std::string(mem_scope.value())))
      << "Device does not support allocating data space with "
      << "specified memory scope: " << mem_scope.value();
  ICHECK(ndim > 2) << "Shape for texture allocation must be at least rank 3; "
                   << "provided shape is rank " << ndim;

  cl::BufferDescriptor* desc = new cl::BufferDescriptor(mem_scope);
  size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value());
  auto texture = ApplyTexture2DFlattening<int64_t>(shape, ndim, axis);
  desc->buffer = AllocTexture(dev, texture.width, texture.height, dtype);
  return desc;
}

void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) {
  // We have to make sure that the memory object is not in the command queue
  // for some OpenCL platforms.
  OPENCL_CALL(clFinish(this->GetQueue(dev)));

  cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(ptr);
  OPENCL_CALL(clReleaseMemObject(desc->buffer));
  delete desc;
}

cl_mem OpenCLWorkspace::AllocTexture(Device dev, size_t width, size_t height,
                                     DLDataType type_hint) {
  this->Init();
  ICHECK(context != nullptr) << "No OpenCL device";
  cl_int err_code;
  cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint);
  cl_image_format format = {CL_RGBA, cl_type};
  cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0};
  cl_mem mptr =
      clCreateImage(this->context, CL_MEM_READ_WRITE, &format, &descriptor, nullptr, &err_code);
  OPENCL_CHECK_ERROR(err_code);
  return mptr;
}

void* OpenCLWorkspace::AllocTextureWorkspace(Device dev, size_t width, size_t height,
                                             DLDataType type_hint) {
  return GetThreadEntry()->texture_pool.AllocTexture(dev, width, height, type_hint);
}

void OpenCLWorkspace::FreeTextureWorkspace(Device dev, void* ptr) {
  GetThreadEntry()->texture_pool.FreeTexture(dev, ptr);
}
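// Usage sketch (hypothetical shape values, not taken from this file): a
// texture-backed allocation is requested by passing a texture memory scope to
// the generic DeviceAPI entry point, e.g.
//   int64_t shape[5] = {1, 32, 14, 14, 4};  // rank >= 3; last axis packs into RGBA
//   void* buf = DeviceAPI::Get(dev)->AllocDataSpace(dev, 5, shape, dtype,
//                                                   String("global.texture"));
// The returned handle is a cl::BufferDescriptor*, just as in the buffer path above.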
void OpenCLWorkspace::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
  size_t nbytes = GetDataSize(*from);
  ICHECK_EQ(nbytes, GetDataSize(*to));
  ICHECK(IsContiguous(*from) && IsContiguous(*to))
      << "CopyDataFromTo only supports contiguous arrays for now";
  if (IsOpenCLDevice(from->device) && IsOpenCLDevice(to->device)) {
    const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
    ICHECK(from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D)
        << "Device to device copying is currently only implemented for OpenCL buffer storage";
    auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
    OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(to->device), from_desc->buffer, to_desc->buffer,
                                    from->byte_offset, to->byte_offset, nbytes, 0, nullptr,
                                    nullptr));
  } else if (IsOpenCLDevice(from->device) && to->device.device_type == kDLCPU) {
    const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
    switch (from_desc->layout) {
      case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
        OPENCL_CALL(clEnqueueReadBuffer(
            this->GetQueue(from->device), from_desc->buffer, CL_FALSE, from->byte_offset, nbytes,
            static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
        break;
      case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
      case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
        auto image_info = GetImageInfo(from_desc, from);
        // TODO(csullivan): Support calculating row_pitch correctly in the case of reuse.
        // Note that when utilizing texture pools for memory reuse, the allocated image
        // size can be larger than the size to be read.
        OPENCL_CALL(clEnqueueReadImage(
            this->GetQueue(from->device), from_desc->buffer, CL_FALSE, image_info.origin,
            image_info.region, image_info.row_pitch, image_info.slice_pitch,
            static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
        break;
    }
    OPENCL_CALL(clFinish(this->GetQueue(from->device)));
  } else if (from->device.device_type == kDLCPU && IsOpenCLDevice(to->device)) {
    auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
    switch (to_desc->layout) {
      case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
        OPENCL_CALL(clEnqueueWriteBuffer(
            this->GetQueue(to->device), to_desc->buffer, CL_FALSE, to->byte_offset, nbytes,
            static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
        break;
      case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
      case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
        auto image_info = GetImageInfo(to_desc, to);
        OPENCL_CALL(clEnqueueWriteImage(
            this->GetQueue(to->device), to_desc->buffer, CL_FALSE, image_info.origin,
            image_info.region, image_info.row_pitch, image_info.slice_pitch,
            static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
        break;
    }
    OPENCL_CALL(clFinish(this->GetQueue(to->device)));
  } else {
    LOG(FATAL) << "Expect copy from/to OpenCL or between OpenCL";
  }
}

void OpenCLWorkspace::StreamSync(Device dev, TVMStreamHandle stream) {
  ICHECK(stream == nullptr);
  OPENCL_CALL(clFinish(this->GetQueue(dev)));
}

void* OpenCLWorkspace::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) {
  return GetThreadEntry()->pool.AllocWorkspace(dev, size);
}

void OpenCLWorkspace::FreeWorkspace(Device dev, void* data) {
  GetThreadEntry()->pool.FreeWorkspace(dev, data);
}

typedef dmlc::ThreadLocalStore<OpenCLThreadEntry> OpenCLThreadStore;

OpenCLThreadEntry* OpenCLThreadEntry::ThreadLocal() { return OpenCLThreadStore::Get(); }

std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name) {
  size_t ret_size;
  OPENCL_CALL(clGetPlatformInfo(pid, param_name, 0, nullptr, &ret_size));
  std::string ret;
  ret.resize(ret_size);
  OPENCL_CALL(clGetPlatformInfo(pid, param_name, ret_size, &ret[0], nullptr));
  return ret;
}

std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name) {
  size_t ret_size;
  OPENCL_CALL(clGetDeviceInfo(pid, param_name, 0, nullptr, &ret_size));
  std::string ret;
  ret.resize(ret_size);
  OPENCL_CALL(clGetDeviceInfo(pid, param_name, ret_size, &ret[0], nullptr));
  return ret;
}

std::vector<cl_platform_id> GetPlatformIDs() {
  cl_uint ret_size;
  cl_int code = clGetPlatformIDs(0, nullptr, &ret_size);
  std::vector<cl_platform_id> ret;
  if (code != CL_SUCCESS) return ret;
  ret.resize(ret_size);
  OPENCL_CALL(clGetPlatformIDs(ret_size, &ret[0], nullptr));
  return ret;
}

std::vector<cl_device_id> GetDeviceIDs(cl_platform_id pid, std::string device_type) {
  cl_device_type dtype = CL_DEVICE_TYPE_ALL;
  if (device_type == "cpu") dtype = CL_DEVICE_TYPE_CPU;
  if (device_type == "gpu") dtype = CL_DEVICE_TYPE_GPU;
  if (device_type == "accelerator") dtype = CL_DEVICE_TYPE_ACCELERATOR;
  cl_uint ret_size;
  cl_int code = clGetDeviceIDs(pid, dtype, 0, nullptr, &ret_size);
  std::vector<cl_device_id> ret;
  if (code != CL_SUCCESS) return ret;
  ret.resize(ret_size);
  OPENCL_CALL(clGetDeviceIDs(pid, dtype, ret_size, &ret[0], nullptr));
  return ret;
}

bool MatchPlatformInfo(cl_platform_id pid, cl_platform_info param_name, std::string value) {
  if (value.length() == 0) return true;
  std::string param_value = GetPlatformInfo(pid, param_name);
  return param_value.find(value) != std::string::npos;
}
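/*!
 * \brief Initialize the workspace: select an OpenCL platform whose CL_PLATFORM_NAME
 *        contains \p platform_name, gather its devices of the requested type
 *        (falling back to CPU devices when no GPU is found), then create a single
 *        context and one command queue per matched device.
 */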
void OpenCLWorkspace::Init(const std::string& type_key, const std::string& device_type,
                           const std::string& platform_name) {
  if (initialized_) return;
  std::lock_guard<std::mutex> lock(this->mu);
  if (initialized_) return;
  if (context != nullptr) return;
  this->type_key = type_key;
  // matched platforms
  std::vector<cl_platform_id> platform_ids = cl::GetPlatformIDs();
  if (platform_ids.size() == 0) {
    LOG(WARNING) << "No OpenCL platform matched given existing options ...";
    return;
  }
  this->platform_id = nullptr;
  for (auto platform_id : platform_ids) {
    if (!MatchPlatformInfo(platform_id, CL_PLATFORM_NAME, platform_name)) {
      continue;
    }
    std::vector<cl_device_id> devices_matched = cl::GetDeviceIDs(platform_id, device_type);
    if ((devices_matched.size() == 0) && (device_type == "gpu")) {
      LOG(WARNING) << "Using CPU OpenCL device";
      devices_matched = cl::GetDeviceIDs(platform_id, "cpu");
    }
    if (devices_matched.size() > 0) {
      this->platform_id = platform_id;
      this->platform_name = cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME);
      this->device_type = device_type;
      this->devices = devices_matched;
      break;
    }
  }
  if (this->platform_id == nullptr) {
    LOG(WARNING) << "No OpenCL device";
    return;
  }
  cl_int err_code;
  this->context = clCreateContext(nullptr, this->devices.size(), &(this->devices[0]), nullptr,
                                  nullptr, &err_code);
  OPENCL_CHECK_ERROR(err_code);
  ICHECK_EQ(this->queues.size(), 0U);
  for (size_t i = 0; i < this->devices.size(); ++i) {
    cl_device_id did = this->devices[i];
    this->queues.push_back(clCreateCommandQueue(this->context, did, 0, &err_code));
    OPENCL_CHECK_ERROR(err_code);
  }
  initialized_ = true;
}

TVM_REGISTER_GLOBAL("device_api.opencl.AllocTexture").set_body([](TVMArgs args, TVMRetValue* rv) {
  int device_type = args[0];
  int device_id = args[1];
  int width = args[2];
  int height = args[3];
  int dtype_code_hint = args[4];
  int dtype_bits_hint = args[5];
  Device dev;
  dev.device_type = static_cast<DLDeviceType>(device_type);
  dev.device_id = device_id;

  DLDataType type_hint;
  type_hint.code = static_cast<uint8_t>(dtype_code_hint);
  type_hint.bits = static_cast<uint8_t>(dtype_bits_hint);
  type_hint.lanes = 1;

  OpenCLWorkspace* ptr = OpenCLWorkspace::Global();
  *rv = ptr->AllocTextureWorkspace(dev, static_cast<size_t>(width), static_cast<size_t>(height),
                                   type_hint);
});

TVM_REGISTER_GLOBAL("device_api.opencl.FreeTexture").set_body([](TVMArgs args, TVMRetValue* rv) {
  int device_type = args[0];
  int device_id = args[1];
  void* data = args[2];
  OpenCLWorkspace* ptr = OpenCLWorkspace::Global();
  Device dev;
  dev.device_type = static_cast<DLDeviceType>(device_type);
  dev.device_id = device_id;
  ptr->FreeTextureWorkspace(dev, data);
  *rv = static_cast<int32_t>(0);
});

TVM_REGISTER_GLOBAL("device_api.opencl").set_body([](TVMArgs args, TVMRetValue* rv) {
  DeviceAPI* ptr = OpenCLWorkspace::Global();
  *rv = static_cast<void*>(ptr);
});

}  // namespace cl
}  // namespace runtime
}  // namespace tvm