/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file threading_backend.cc
 * \brief Native threading backend
 */
#include <tvm/runtime/logging.h>
#include <tvm/runtime/threading_backend.h>

#include <algorithm>
#include <thread>
#if defined(__linux__) || defined(__ANDROID__)
#include <fstream>
#include <sstream>
#else
#endif
#if defined(__linux__)
#include <sched.h>
#endif
#if defined(__hexagon__)
#include <dlfcn.h>
#endif

namespace tvm {
namespace runtime {
namespace threading {

class ThreadGroup::Impl {
 public:
  Impl(int num_workers, std::function<void(int)> worker_callback, bool exclude_worker0)
      : num_workers_(num_workers) {
    ICHECK_GE(num_workers, 1) << "Requested a non-positive number of worker threads.";
    for (int i = exclude_worker0; i < num_workers_; ++i) {
      threads_.emplace_back([worker_callback, i] { worker_callback(i); });
    }
    InitSortedOrder();
  }
  ~Impl() { Join(); }

  void Join() {
    for (auto& t : threads_) {
      if (t.joinable()) t.join();
    }
  }

  int Configure(AffinityMode mode, int nthreads, bool exclude_worker0) {
    int num_workers_used = 0;
    if (mode == kLittle) {
      num_workers_used = little_count_;
    } else if (mode == kBig) {
      num_workers_used = big_count_;
    } else {
      // use default
      num_workers_used = threading::MaxConcurrency();
    }
    // if a specific number was given, use that
    if (nthreads) {
      num_workers_used = nthreads;
    }
    // if MaxConcurrency restricted the number of workers (e.g., due to
    // hyperthreading), respect the restriction. On CPUs with N logical cores
    // and N/2 physical cores this will set affinity to the first N/2 logical
    // ones.
    num_workers_used = std::min(num_workers_, num_workers_used);

    const char* val = getenv("TVM_BIND_THREADS");
    if (val == nullptr || atoi(val) == 1) {
      // Do not set affinity if there are more workers than found cores.
      if (sorted_order_.size() >= static_cast<unsigned int>(num_workers_)) {
        SetAffinity(exclude_worker0, mode == kLittle);
      } else {
        LOG(WARNING) << "The thread affinity cannot be set when the number of workers "
                     << "is larger than the number of available cores in the system.";
      }
    }
    return num_workers_used;
  }

 private:
  // Bind worker threads to disjoint cores.
  // If worker 0 is offloaded to the main thread, i.e. exclude_worker0 is true,
  // the main thread is bound to core 0.
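  // If `reverse` is true (kLittle mode), cores are assigned starting from the
  // low-frequency end of sorted_order_ instead of the high-frequency end.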
  void SetAffinity(bool exclude_worker0, bool reverse = false) {
#if defined(__ANDROID__)
#ifndef CPU_SET
#define CPU_SETSIZE 1024
#define __NCPUBITS (8 * sizeof(uint64_t))
    typedef struct {
      uint64_t __bits[CPU_SETSIZE / __NCPUBITS];
    } cpu_set_t;

#define CPU_SET(cpu, cpusetp) \
  ((cpusetp)->__bits[(cpu) / __NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS)))
#define CPU_ZERO(cpusetp) memset((cpusetp), 0, sizeof(cpu_set_t))
#endif
#endif
#if defined(__linux__) || defined(__ANDROID__)
    ICHECK_GE(sorted_order_.size(), num_workers_);

    for (unsigned i = 0; i < threads_.size(); ++i) {
      unsigned core_id;
      if (reverse) {
        core_id = sorted_order_[sorted_order_.size() - (i + exclude_worker0) - 1];
      } else {
        core_id = sorted_order_[i + exclude_worker0];
      }
      cpu_set_t cpuset;
      CPU_ZERO(&cpuset);
      CPU_SET(core_id, &cpuset);
#if defined(__ANDROID__)
      sched_setaffinity(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset);
#else
      pthread_setaffinity_np(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset);
#endif
    }
    if (exclude_worker0) {  // the main thread runs tasks as worker 0
      // The master thread is free to migrate among the cores in use.
      // Typically, the OS will schedule the main thread to run on core 0,
      // which is idle while the other workers are running.
      // See the comment inside SetMasterThreadFullCpuAffinity for more detail.
      SetMasterThreadFullCpuAffinity(reverse);
    }
#endif
  }

  void SetMasterThreadFullCpuAffinity(bool reverse) {
#if defined(__linux__) || defined(__ANDROID__)
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    // For example, on 2xA72 + 4xA53 (core ids 0-5, where 4 and 5 are the A72 big cores),
    // when the config_threadpool API restricts execution to the 4xA53,
    // sorted_order_ will be [4, 5, 0, 1, 2, 3].
    // By the time this is called, SetAffinity has already spawned the other worker
    // threads on the little cores, so the TVM main thread should also run on the
    // little cores, not the big cores (4, 5).
    // Note: this works well on x86 too. Because x86 doesn't have BIG.LITTLE,
    // the implementation uses kBig mode by default and lets the main thread
    // run on the intended cores.
    if (reverse) {
      for (int i = 0; i < little_count_; ++i) {
        CPU_SET(sorted_order_[sorted_order_.size() - i - 1], &cpuset);
      }
    } else {
      int num_cpu_workers = std::min(MaxConcurrency(), big_count_);
      for (int i = 0; i < num_cpu_workers; ++i) {
        CPU_SET(sorted_order_[i], &cpuset);
      }
    }
#if defined(__ANDROID__)
    sched_setaffinity(pthread_self(), sizeof(cpu_set_t), &cpuset);
#else
    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
#endif
#endif
  }
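  // Sort core ids by maximum frequency in descending order (on Linux/Android by
  // reading /sys/devices/system/cpu/cpu<i>/cpufreq/scaling_max_freq) and count
  // how many cores report the highest (big) and the lowest (little) frequency.
  // On platforms without that sysfs entry every core reports 0 and is counted
  // as a big core.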
  void InitSortedOrder() {
    unsigned int threads = std::thread::hardware_concurrency();
#if defined(__hexagon__)
    // With unsigned PDs, getting the number of available hardware threads
    // is not supported in earlier versions of QuRT. In such cases assume 4.
    if (threads == 0) threads = 4;
#endif
    std::vector<std::pair<unsigned int, int64_t>> max_freqs;

    for (unsigned int i = 0; i < threads; ++i) {
      int64_t cur_freq = 0;
#if defined(__linux__) || defined(__ANDROID__)
      std::ostringstream filepath;
      filepath << "/sys/devices/system/cpu/cpu" << i << "/cpufreq/scaling_max_freq";
      std::ifstream ifs(filepath.str());
      if (!ifs.fail()) {
        if (!(ifs >> cur_freq)) {
          cur_freq = -1;
        }
        ifs.close();
      }
#endif
      max_freqs.push_back(std::make_pair(i, cur_freq));
    }

    auto fcmpbyfreq = [](const std::pair<unsigned int, int64_t>& a,
                         const std::pair<unsigned int, int64_t>& b) {
      return a.second == b.second ? a.first < b.first : a.second > b.second;
    };
    std::sort(max_freqs.begin(), max_freqs.end(), fcmpbyfreq);
    int64_t big_freq = max_freqs.begin()->second;
    int64_t little_freq = max_freqs.rbegin()->second;
    for (auto it = max_freqs.begin(); it != max_freqs.end(); it++) {
      sorted_order_.push_back(it->first);
      if (big_freq == it->second) {
        big_count_++;
      }
      if (big_freq != little_freq && little_freq == it->second) {
        little_count_++;
      }
    }
    if (big_count_ + little_count_ != static_cast<int>(sorted_order_.size())) {
      LOG(WARNING) << "more than two frequencies detected!";
    }
  }

  int num_workers_;
  std::vector<std::thread> threads_;
  std::vector<unsigned int> sorted_order_;
  int big_count_ = 0;
  int little_count_ = 0;
};

ThreadGroup::ThreadGroup(int num_workers, std::function<void(int)> worker_callback,
                         bool exclude_worker0)
    : impl_(new ThreadGroup::Impl(num_workers, worker_callback, exclude_worker0)) {}

ThreadGroup::~ThreadGroup() { delete impl_; }

void ThreadGroup::Join() { impl_->Join(); }

int ThreadGroup::Configure(AffinityMode mode, int nthreads, bool exclude_worker0) {
  return impl_->Configure(mode, nthreads, exclude_worker0);
}

void Yield() { std::this_thread::yield(); }

int MaxConcurrency() {
  int max_concurrency = 1;
  const char* val = getenv("TVM_NUM_THREADS");
  if (val == nullptr) {
    val = getenv("OMP_NUM_THREADS");
  }
  if (val != nullptr) {
    max_concurrency = atoi(val);
  } else {
    max_concurrency = std::thread::hardware_concurrency();
#if defined(_M_X64) || defined(__x86_64__)
    max_concurrency /= 2;  // ignore hyper-threading
#elif defined(__hexagon__)
    // With unsigned PDs, getting the number of available hardware threads
    // is not supported in earlier versions of QuRT. In such cases assume 4.
    // If running on the simulator, set max_concurrency to 1.
    if (max_concurrency == 0) {
      if (dlsym(RTLD_DEFAULT, "running_in_sim_dev_17bc90206f6cf5a7")) {
        max_concurrency = 1;
      } else {
        max_concurrency = 4;
      }
    }
#endif
  }
  return std::max(max_concurrency, 1);
}

}  // namespace threading
}  // namespace runtime
}  // namespace tvm
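// Illustrative usage sketch (comment only, not compiled with this file): how a
// caller such as TVM's runtime thread pool might drive this backend. The worker
// lambda body is a stand-in, and the qualified kBig constant is assumed to come
// from threading_backend.h.
//
//   using tvm::runtime::threading::MaxConcurrency;
//   using tvm::runtime::threading::ThreadGroup;
//
//   ThreadGroup group(MaxConcurrency(),
//                     [](int worker_id) { /* pop and run tasks for worker_id */ },
//                     /*exclude_worker0=*/true);
//   group.Configure(ThreadGroup::kBig, /*nthreads=*/0, /*exclude_worker0=*/true);
//   // ... the main thread then acts as worker 0 ...
//   group.Join();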