""" - Parse jit compile info - Compute warp occupancy histogram """ from __future__ import division, absolute_import, print_function import math import re SMEM0K = 0 SMEM8K = 8 * 2 ** 10 SMEM16K = 16 * 2 ** 10 SMEM48K = 48 * 2 ** 10 SMEM64K = 64 * 2 ** 10 SMEM80K = 80 * 2 ** 10 SMEM96K = 96 * 2 ** 10 SMEM112K = 112 * 2 ** 10 #------------------------------------------------------------------------------ # autotuning class OccupancyThreadKey(object): def __init__(self, item): self.occupancy, self.threads = item self.comparison = self.occupancy, 1 / self.threads def __lt__(self, other): return self.comparison < other.comparison def __eq__(self, other): return self.comparison == other.comparison def __ne__(self, other): return self.comparison != other.comparison def __gt__(self, other): return self.comparison > other.comparison def __le__(self, other): return self.comparison <= other.comparison def __ge__(self, other): return self.comparison >= other.comparison class AutoTuner(object): """Autotune a kernel based upon the theoretical occupancy. """ def __init__(self, cc, info, smem_config=None, dynsmem=0): self.cc = cc self.dynsmem = dynsmem self._table = warp_occupancy(info=info, cc=cc) self._by_occupancy = list(reversed(sorted(((occup, tpb) for tpb, (occup, factor) in self.table.items()), key=OccupancyThreadKey))) @property def table(self): """A dict with thread-per-block as keys and tuple-2 of (occupency, limiting factor) as values. """ return self._table @property def by_occupancy(self): """A list of tuple-2 of (occupancy, thread-per-block) sorted in descending. The first item has the highest occupancy and the lowest number of thread-per-block. """ return self._by_occupancy def best(self): return self.max_occupancy_min_blocks() def max_occupancy_min_blocks(self): """Returns the thread-per-block that optimizes for maximum occupancy and minimum blocks. Maximum blocks allows for the best utilization of parallel execution because each block can be executed concurrently on different SM. """ return self.by_occupancy[0][1] def closest(self, tpb): """Find the occupancy of the closest tpb """ # round to the nearest multiple of warpsize warpsize = PHYSICAL_LIMITS[self.cc]['thread_per_warp'] tpb = ceil(tpb, warpsize) # search return self.table.get(tpb, [0])[0] def best_within(self, mintpb, maxtpb): """Returns the best tpb in the given range inclusively. """ warpsize = PHYSICAL_LIMITS[self.cc]['thread_per_warp'] mintpb = int(ceil(mintpb, warpsize)) maxtpb = int(floor(maxtpb, warpsize)) return self.prefer(*range(mintpb, maxtpb + 1, warpsize)) def prefer(self, *tpblist): """Prefer the thread-per-block with the highest warp occupancy and the lowest thread-per-block. May return None if all threads-per-blocks are invalid """ bin = [] for tpb in tpblist: occ = self.closest(tpb) if occ > 0: bin.append((occ, tpb)) if bin: return sorted(bin, key=OccupancyThreadKey)[-1][1] #------------------------------------------------------------------------------ # warp occupancy calculator # Reference: NVIDIA CUDA Toolkit v10.2.89 Programming Guide, Appendix H. # URL: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities LIMITS_CC_20 = { 'thread_per_warp': 32, 'warp_per_sm': 48, 'thread_per_sm': 1536, 'block_per_sm': 8, 'registers': 32768, 'reg_alloc_unit': 64, 'reg_alloc_gran': 'warp', 'reg_per_thread': 63, 'smem_per_sm': SMEM48K, 'smem_alloc_unit': 128, 'warp_alloc_gran': 2, 'max_block_size': 1024, 'default_smem_config': SMEM16K, } LIMITS_CC_21 = LIMITS_CC_20 LIMITS_CC_30 = { 'thread_per_warp': 32, 'warp_per_sm': 64, 'thread_per_sm': 2048, 'block_per_sm': 16, 'registers': 65536, 'reg_alloc_unit': 256, 'reg_alloc_gran': 'warp', 'reg_per_thread': 63, 'smem_per_sm': SMEM48K, 'smem_alloc_unit': 256, 'warp_alloc_gran': 4, 'max_block_size': 1024, 'default_smem_config': SMEM48K, } LIMITS_CC_35 = LIMITS_CC_30.copy() LIMITS_CC_35.update({ 'reg_per_thread': 255, }) LIMITS_CC_37 = LIMITS_CC_35.copy() LIMITS_CC_37.update({ 'registers': 131072, 'default_smem_config': SMEM112K, }) LIMITS_CC_50 = { 'thread_per_warp': 32, 'warp_per_sm': 64, 'thread_per_sm': 2048, 'block_per_sm': 32, 'registers': 65536, 'reg_alloc_unit': 256, 'reg_alloc_gran': 'warp', 'reg_per_thread': 255, 'smem_per_sm': SMEM64K, 'smem_per_block': SMEM48K, 'smem_alloc_unit': 256, 'warp_alloc_gran': 4, 'max_block_size': 1024, 'default_smem_config': SMEM64K, } LIMITS_CC_52 = LIMITS_CC_50.copy() LIMITS_CC_52.update({ 'smem_per_sm': SMEM96K, 'default_smem_config': SMEM96K, }) LIMITS_CC_53 = LIMITS_CC_50.copy() LIMITS_CC_53.update({ 'registers': 32768, }) LIMITS_CC_60 = LIMITS_CC_50.copy() LIMITS_CC_60.update({ 'warp_alloc_gran': 2, }) LIMITS_CC_61 = LIMITS_CC_60.copy() LIMITS_CC_61.update({ 'smem_per_sm': SMEM96K, 'default_smem_config': SMEM96K, 'warp_alloc_gran': 4, }) LIMITS_CC_62 = LIMITS_CC_60.copy() LIMITS_CC_62.update({ 'thread_per_sm': 4096, 'warp_per_sm': 128, 'warp_alloc_gran': 4, }) LIMITS_CC_70 = LIMITS_CC_62.copy() LIMITS_CC_70.update({ 'smem_per_sm': SMEM96K, 'smem_per_block': SMEM96K, 'default_smem_config': SMEM96K, }) LIMITS_CC_75 = LIMITS_CC_70.copy() LIMITS_CC_75.update({ 'warp_per_sm': 32, 'thread_per_sm': 1024, 'block_per_sm': 16, 'smem_per_sm': SMEM64K, 'smem_per_block': SMEM64K, 'default_smem_config': SMEM64K, }) PHYSICAL_LIMITS = { (2, 0): LIMITS_CC_20, (2, 1): LIMITS_CC_21, (3, 0): LIMITS_CC_30, (3, 5): LIMITS_CC_35, (3, 7): LIMITS_CC_35, (5, 0): LIMITS_CC_50, (5, 2): LIMITS_CC_52, (5, 3): LIMITS_CC_53, (6, 0): LIMITS_CC_50, (6, 1): LIMITS_CC_61, (6, 2): LIMITS_CC_62, (7, 0): LIMITS_CC_70, (7, 5): LIMITS_CC_75, } def ceil(x, s=1): return s * math.ceil(x / s) def floor(x, s=1): return s * math.floor(x / s) def warp_occupancy(info, cc, smem_config=None): """Returns a dictionary of {threadperblock: occupancy, factor} Only threadperblock of multiple of warpsize is used. Only threadperblock of non-zero occupancy is returned. """ ret = {} try: limits = PHYSICAL_LIMITS[cc] except KeyError: raise ValueError("%s is not a supported compute capability" % ".".join(str(c) for c in cc)) if smem_config is None: smem_config = limits['default_smem_config'] warpsize = limits['thread_per_warp'] max_thread = info.maxthreads for tpb in range(warpsize, max_thread + 1, warpsize): result = compute_warp_occupancy(tpb=tpb, reg=info.regs, smem=info.shared, smem_config=smem_config, limits=limits) if result[0]: ret[tpb] = result return ret def compute_warp_occupancy(tpb, reg, smem, smem_config, limits): assert limits['reg_alloc_gran'] == 'warp', \ "assume warp register allocation granularity" limit_block_per_sm = limits['block_per_sm'] limit_warp_per_sm = limits['warp_per_sm'] limit_thread_per_warp = limits['thread_per_warp'] limit_reg_per_thread = limits['reg_per_thread'] limit_total_regs = limits['registers'] limit_total_smem = min(limits['smem_per_sm'], smem_config) my_smem_alloc_unit = limits['smem_alloc_unit'] reg_alloc_unit = limits['reg_alloc_unit'] warp_alloc_gran = limits['warp_alloc_gran'] my_warp_per_block = ceil(tpb / limit_thread_per_warp) my_reg_count = reg my_reg_per_block = my_warp_per_block my_smem = smem my_smem_per_block = ceil(my_smem, my_smem_alloc_unit) # allocated resource limit_blocks_due_to_warps = min(limit_block_per_sm, floor( limit_warp_per_sm / my_warp_per_block)) c39 = floor(limit_total_regs / ceil(my_reg_count * limit_thread_per_warp, reg_alloc_unit), warp_alloc_gran) limit_blocks_due_to_regs = (0 if my_reg_count > limit_reg_per_thread else (floor(c39 / my_reg_per_block) if my_reg_count > 0 else limit_block_per_sm)) limit_blocks_due_to_smem = (floor(limit_total_smem / my_smem_per_block) if my_smem_per_block > 0 else limit_block_per_sm) # occupancy active_block_per_sm = min(limit_blocks_due_to_smem, limit_blocks_due_to_warps, limit_blocks_due_to_regs) if active_block_per_sm == limit_blocks_due_to_warps: factor = 'warps' elif active_block_per_sm == limit_blocks_due_to_regs: factor = 'regs' else: factor = 'smem' active_warps_per_sm = active_block_per_sm * my_warp_per_block #active_threads_per_sm = active_warps_per_sm * limit_thread_per_warp occupancy = active_warps_per_sm / limit_warp_per_sm return occupancy, factor