/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

#include "microtvm_graph_executor.h"

#include <dlfcn.h>

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <numeric>
#include <vector>

#include "picojson.h"

namespace tvm {
namespace micro {
namespace {

int TVMSToI(const std::string& str) {
  // For platforms (e.g. older NDK versions) where std::stoi(...) is not available.
  char* end;
  return std::strtol(str.c_str(), &end, 10);
}

void ParseOutputs(const picojson::array& joutputs, DynArray<NodeEntry>* outputs) {
  outputs->resize(joutputs.size());
  for (size_t i = 0; i < joutputs.size(); ++i) {
    const auto& joutput_i = joutputs[i].get<picojson::array>();
    (*outputs)[i] = NodeEntry{static_cast<uint32_t>(joutput_i[0].get<double>()),
                              static_cast<uint32_t>(joutput_i[1].get<double>()),
                              static_cast<uint32_t>(joutput_i[2].get<double>())};
  }
}

void ParseAttrs(const picojson::object& jattr, GraphAttr* attr) {
  // Each attr is encoded as ["list_*", [values...]]; the leading tag string is skipped.
  // parse dltype
  for (const auto& jdltype_ : jattr.at("dltype").get<picojson::array>()) {
    if (jdltype_.is<std::string>()) {
      continue;
    }
    const auto& jdltype = jdltype_.get<picojson::array>();
    attr->dltype.resize(jdltype.size());
    for (size_t i = 0; i < jdltype.size(); ++i) {
      attr->dltype[i] = jdltype[i].get<std::string>();
    }
  }
  for (const auto& jstorage_id_ : jattr.at("storage_id").get<picojson::array>()) {
    if (jstorage_id_.is<std::string>()) {
      continue;
    }
    const auto& jstorage_id = jstorage_id_.get<picojson::array>();
    attr->storage_id.resize(jstorage_id.size());
    for (size_t i = 0; i < jstorage_id.size(); ++i) {
      attr->storage_id[i] = static_cast<int>(jstorage_id[i].get<double>());
    }
  }
  for (const auto& jshape_ : jattr.at("shape").get<picojson::array>()) {
    if (jshape_.is<std::string>()) {
      continue;
    }
    const auto& jshape = jshape_.get<picojson::array>();
    attr->shape.resize(jshape.size());
    for (size_t i = 0; i < jshape.size(); ++i) {
      const auto& jshape_i = jshape[i].get<picojson::array>();
      attr->shape[i].resize(jshape_i.size());
      for (size_t j = 0; j < jshape_i.size(); ++j) {
        attr->shape[i][j] = static_cast<int64_t>(jshape_i[j].get<double>());
      }
    }
  }
}

void ParseNodes(const picojson::array& jnodes, DynArray<Node>* nodes) {
  nodes->resize(jnodes.size());
  for (size_t i = 0; i < nodes->size(); ++i) {
    auto* n = &(*nodes)[i];
    const auto& jn = jnodes[i].get<picojson::object>();
    n->op_type = jn.at("op").get<std::string>();
    n->name = jn.at("name").get<std::string>();
    const auto jinputs = jn.at("inputs").get<picojson::array>();
    n->inputs.resize(jinputs.size());
    for (size_t j = 0; j < jinputs.size(); ++j) {
      const auto& jinput_j = jinputs[j].get<picojson::array>();
      n->inputs[j] = NodeEntry{static_cast<uint32_t>(jinput_j[0].get<double>()),
                               static_cast<uint32_t>(jinput_j[1].get<double>()),
                               static_cast<uint32_t>(jinput_j[2].get<double>())};
    }
    const auto& jattrs_ = jn.find("attrs");
    if (jattrs_ != jn.end()) {
      const auto& jattrs = jattrs_->second.get<picojson::object>();
      n->param.func_name = jattrs.at("func_name").get<std::string>();
      n->param.num_inputs = TVMSToI(jattrs.at("num_inputs").get<std::string>());
      n->param.num_outputs = TVMSToI(jattrs.at("num_outputs").get<std::string>());
      n->param.flatten_data = TVMSToI(jattrs.at("flatten_data").get<std::string>());
    }
  }
}

void ParseArgNodes(const picojson::array& jinput_nodes, DynArray<uint32_t>* input_nodes) {
  input_nodes->resize(jinput_nodes.size());
  for (size_t i = 0; i < jinput_nodes.size(); ++i) {
    (*input_nodes)[i] = static_cast<uint32_t>(jinput_nodes[i].get<double>());
  }
}

}  // namespace

NDArray::~NDArray() {}

NDArray NDArray::Empty(const DynArray<int64_t>& shape, DLDataType dtype, DLDevice dev) {
  NDArray r;
  int64_t nbytes = (dtype.bits * dtype.lanes + 7) / 8;
  for (const auto& s : shape) {
    nbytes *= s;
  }

  r.storage_ = std::shared_ptr<void>(
      TVMBackendAllocWorkspace(static_cast<int>(dev.device_type), static_cast<int>(dev.device_id),
                               nbytes, dtype.code, dtype.bits),
      [=](void* ptr) {
        if (ptr) {
          TVMBackendFreeWorkspace(dev.device_type, dev.device_id, ptr);
        }
      });
  r.shape_ = shape;
  r.dtype_ = dtype;
  r.device_ = dev;
  return r;
}

NDArray NDArray::CreateView(const DynArray<int64_t>& shape, DLDataType dtype) {
  NDArray r;
  r.storage_ = storage_;
  r.shape_ = shape;
  r.dtype_ = dtype;
  r.device_ = device_;
  return r;
}

DLTensor NDArray::ToDLTensor() {
  DLTensor r;
  r.data = storage_.get();
  assert(r.data != nullptr);
  r.device = device_;
  r.ndim = shape_.size();
  r.dtype = dtype_;
  r.shape = shape_.data();
  r.strides = nullptr;
  r.byte_offset = 0;
  return r;
}

size_t GetDataSize(const DLTensor& arr) {
  size_t size = 1;
  for (size_t i = 0; i < static_cast<size_t>(arr.ndim); ++i) {
    size *= static_cast<size_t>(arr.shape[i]);
  }
  size *= (arr.dtype.bits * arr.dtype.lanes + 7) / 8;
  return size;
}

void NDArray::CopyFrom(DLTensor* src) {
  std::memcpy(storage_.get(),
              reinterpret_cast<const uint8_t*>(src->data) + static_cast<size_t>(src->byte_offset),
              GetDataSize(*src));
}

void NDArray::CopyTo(DLTensor* dst) const {
  std::memcpy(reinterpret_cast<uint8_t*>(dst->data) + static_cast<size_t>(dst->byte_offset),
              storage_.get(), GetDataSize(*dst));
}

DSOModule::DSOModule(const std::string& name) {
  dlerror();
  lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
  assert(!dlerror());
  assert(lib_handle_ != nullptr);

// Patch the module's __FuncName global (if present) to point at the runtime's
// implementation of FuncName.
#define TVM_INIT_CONTEXT_FUNC(FuncName)                                               \
  if (auto* fp = reinterpret_cast<decltype(&FuncName)*>(GetSymbol("__" #FuncName))) { \
    *fp = FuncName;                                                                   \
  }
  // Initialize the functions
  TVM_INIT_CONTEXT_FUNC(TVMAPISetLastError);
  TVM_INIT_CONTEXT_FUNC(TVMBackendAllocWorkspace);
  TVM_INIT_CONTEXT_FUNC(TVMBackendFreeWorkspace);
  TVM_INIT_CONTEXT_FUNC(TVMBackendParallelLaunch);
  // TODO(tulloch): implement these functions?
  // TVM_INIT_CONTEXT_FUNC(TVMFuncCall);
  // TVM_INIT_CONTEXT_FUNC(TVMBackendGetFuncFromEnv);
  // TVM_INIT_CONTEXT_FUNC(TVMBackendParallelBarrier);
#undef TVM_INIT_CONTEXT_FUNC
}

DSOModule::~DSOModule() {
  if (lib_handle_) {
    dlclose(lib_handle_);
  }
}

BackendPackedCFunc DSOModule::GetFunction(const std::string& name) const {
  auto faddr = reinterpret_cast<BackendPackedCFunc>(GetSymbol(name.c_str()));
  assert(faddr);
  return faddr;
}

void* DSOModule::GetSymbol(const char* name) const {
  dlerror();
  auto* f = dlsym(lib_handle_, name);
  assert(!dlerror());
  return f;
}

MicroGraphExecutor::MicroGraphExecutor(const std::string& graph_json, DSOModule* module) {
  assert(module);
  module_ = module;
  picojson::value v;
  picojson::parse(v, graph_json);
  ParseNodes(v.get<picojson::object>()["nodes"].get<picojson::array>(), &nodes_);
  ParseArgNodes(v.get<picojson::object>()["arg_nodes"].get<picojson::array>(), &input_nodes_);
  ParseArgNodes(v.get<picojson::object>()["node_row_ptr"].get<picojson::array>(), &node_row_ptr_);
  ParseOutputs(v.get<picojson::object>()["heads"].get<picojson::array>(), &outputs_);
  ParseAttrs(v.get<picojson::object>()["attrs"].get<picojson::object>(), &attrs_);
  SetupStorage();
  SetupOpExecs();
}

MicroGraphExecutor::~MicroGraphExecutor() {}

void MicroGraphExecutor::Run() {
  for (size_t i = 0; i < op_execs_.size(); ++i) {
    if (op_execs_[i]) op_execs_[i]();
  }
}

void MicroGraphExecutor::SetInput(int index, DLTensor* data_in) {
  assert(static_cast<size_t>(index) < input_nodes_.size());
  uint32_t eid = this->entry_id(input_nodes_[index], 0);
  data_entry_[eid].CopyFrom(data_in);
}

void MicroGraphExecutor::CopyOutputTo(int index, DLTensor* data_out) {
  assert(static_cast<size_t>(index) < outputs_.size());
  uint32_t eid = this->entry_id(outputs_[index]);
  const NDArray& data = data_entry_[eid];
  data.CopyTo(data_out);
}

void MicroGraphExecutor::SetupStorage() {
  // Grab saved optimization plan from graph.
  DynArray<DLDataType> vtype(attrs_.dltype.size());
  for (size_t i = 0; i < attrs_.dltype.size(); ++i) {
    assert(attrs_.dltype[i] == "float32");
    DLDataType ty;
    ty.bits = 32;
    ty.lanes = 1;
    ty.code = kDLFloat;
    vtype[i] = ty;
  }

  // Size and device type of each storage pool entry.
  std::vector<PoolEntry> pool_entry;
  // Find the maximum space size.
  for (size_t i = 0; i < attrs_.shape.size(); ++i) {
    int storage_id = attrs_.storage_id[i];
    // Use the fallback device if no device index is available.
    int device_type = static_cast<int>(device_.device_type);
    size_t size = 1;
    for (int64_t sz : attrs_.shape[i]) {
      size *= static_cast<size_t>(sz);
    }
    assert(storage_id >= 0);
    DLDataType t = vtype[i];
    size_t bits = t.bits * t.lanes;
    assert(bits % 8U == 0U || bits == 1U);
    size_t bytes = ((bits + 7U) / 8U) * size;

    uint32_t sid = static_cast<uint32_t>(storage_id);
    if (sid >= pool_entry.size()) {
      pool_entry.resize(sid + 1, {0, -1});
    } else {
      assert(pool_entry[sid].device_type == -1 || pool_entry[sid].device_type == device_type);
    }
    pool_entry[sid].size = std::max(pool_entry[sid].size, bytes);
    pool_entry[sid].device_type = device_type;
  }

  // Allocate the space.
  storage_pool_.resize(pool_entry.size());
  for (size_t i = 0; i < pool_entry.size(); ++i) {
    const auto& pit = pool_entry[i];
    DynArray<int64_t> shape(1);
    shape[0] = static_cast<int64_t>(pit.size + 3) / 4;
    storage_pool_[i] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, device_);
  }

  // Assign the pooled entries. A unified memory pool is used to simplify
  // memory assignment for each node entry. The allocated memory on each device
  // is mapped to this pool.
  data_entry_.resize(num_node_entries());
  for (size_t i = 0; i < data_entry_.size(); ++i) {
    int storage_id = attrs_.storage_id[i];
    assert(static_cast<size_t>(storage_id) < storage_pool_.size());
    data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
  }
}

// Wrap a BackendPackedCFunc from the DSO into a closure that owns its
// argument block, ready to be invoked by Run().
std::function<void()> CreateTVMOp(const DSOModule& module, const TVMOpParam& param,
                                  const DynArray<DLTensor>& args) {
  typedef union {
    void* v_handle;
  } TVMValue;
  /*typedef*/ enum {
    kTVMDLTensorHandle = 7U,
  } /*TVMArgTypeCode*/;
  struct OpArgs {
    DynArray<DLTensor> args;
    DynArray<TVMValue> arg_values;
    DynArray<int> arg_tcodes;
    DynArray<int64_t> shape_data;
  };

  std::shared_ptr<OpArgs> arg_ptr = std::make_shared<OpArgs>();
  arg_ptr->args = args;
  if (param.flatten_data) {
    arg_ptr->shape_data.resize(arg_ptr->args.size());
  }
  arg_ptr->arg_values.resize(arg_ptr->args.size());
  arg_ptr->arg_tcodes.resize(arg_ptr->args.size());
  for (size_t i = 0; i < arg_ptr->args.size(); ++i) {
    TVMValue v;
    DLTensor* t = &(arg_ptr->args[i]);
    v.v_handle = t;
    arg_ptr->arg_values[i] = v;
    arg_ptr->arg_tcodes[i] = kTVMDLTensorHandle;
    if (param.flatten_data) {
      arg_ptr->shape_data[i] =
          std::accumulate(t->shape, t->shape + t->ndim, 1, std::multiplies<int64_t>());
      t->ndim = 1;
      t->shape = &(arg_ptr->shape_data[i]);
    }
  }

  if (param.func_name == "__nop") {
    return []() {};
  } else if (param.func_name == "__copy") {
    // TODO(mbs): device_copy cleanup.
    assert(false);
  }

  BackendPackedCFunc pf = module.GetFunction(param.func_name);
  assert(pf != nullptr);

  auto fexec = [arg_ptr, pf]() {
    assert(pf);
    (pf)(arg_ptr->arg_values.data(), arg_ptr->arg_tcodes.data(),
         static_cast<int>(arg_ptr->arg_values.size()));
  };
  return fexec;
}

void MicroGraphExecutor::SetupOpExecs() {
  op_execs_.resize(nodes_.size());
  // setup the array and requirements.
  for (uint32_t nid = 0; nid < nodes_.size(); ++nid) {
    const auto& inode = nodes_[nid];
    if (inode.op_type == "null") continue;
    DynArray<DLTensor> args(inode.inputs.size() + inode.param.num_outputs);
    for (size_t i = 0; i < inode.inputs.size(); ++i) {
      const auto& e = inode.inputs[i];
      args[i] = data_entry_[this->entry_id(e)].ToDLTensor();
    }
    for (size_t index = 0; index < inode.param.num_outputs; ++index) {
      uint32_t eid = this->entry_id(nid, index);
      args[index + inode.inputs.size()] = data_entry_[eid].ToDLTensor();
    }
    assert(inode.op_type == "tvm_op");
    op_execs_[nid] = CreateTVMOp(*module_, inode.param, args);
  }
}

}  // namespace micro
}  // namespace tvm
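// ---------------------------------------------------------------------------
// Example usage (a minimal sketch, not part of the runtime). It assumes a
// graph JSON file and an operator shared library produced ahead of time by
// the TVM compiler; the names "graph.json" and "model.so", and the
// 1x3x224x224 float32 input / 1x1000 output shapes, are hypothetical
// placeholders.
//
//   #include <fstream>
//   #include <sstream>
//   #include <vector>
//
//   int main() {
//     // Read the serialized graph JSON from disk.
//     std::ifstream f("graph.json");
//     std::stringstream ss;
//     ss << f.rdbuf();
//
//     // Load the compiled operators and build the executor.
//     tvm::micro::DSOModule mod("model.so");
//     tvm::micro::MicroGraphExecutor exec(ss.str(), &mod);
//
//     // Bind a float32 tensor to input 0, run, and copy out output 0.
//     std::vector<float> input(1 * 3 * 224 * 224, 0.0f);
//     int64_t in_shape[4] = {1, 3, 224, 224};
//     DLTensor in{input.data(), {kDLCPU, 0}, 4, {kDLFloat, 32, 1}, in_shape, nullptr, 0};
//     exec.SetInput(0, &in);
//     exec.Run();
//
//     std::vector<float> output(1000, 0.0f);
//     int64_t out_shape[2] = {1, 1000};
//     DLTensor out{output.data(), {kDLCPU, 0}, 2, {kDLFloat, 32, 1}, out_shape, nullptr, 0};
//     exec.CopyOutputTo(0, &out);
//     return 0;
//   }
// ---------------------------------------------------------------------------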