#!/usr/bin/env python # -*- coding: utf-8 -*- # Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Utility functions for bounding box processing.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import tensorflow as tf EPSILON = 1e-8 BBOX_XFORM_CLIP = np.log(1000. / 16.) def jitter_boxes(boxes, noise_scale=0.025): """Jitter the box coordinates by some noise distribution. Args: boxes: a tensor whose last dimension is 4 representing the coordinates of boxes in ymin, xmin, ymax, xmax order. noise_scale: a python float which specifies the magnitude of noise. The rule of thumb is to set this between (0, 0.1]. The default value is found to mimic the noisy detections best empirically. Returns: jittered_boxes: a tensor whose shape is the same as `boxes` representing the jittered boxes. Raises: ValueError: If the last dimension of boxes is not 4. """ if boxes.shape[-1] != 4: raise ValueError( 'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1])) with tf.name_scope('jitter_boxes'): bbox_jitters = tf.random_normal(boxes.get_shape(), stddev=noise_scale) ymin = boxes[..., 0:1] xmin = boxes[..., 1:2] ymax = boxes[..., 2:3] xmax = boxes[..., 3:4] width = xmax - xmin height = ymax - ymin new_center_x = (xmin + xmax) / 2.0 + bbox_jitters[..., 0:1] * width new_center_y = (ymin + ymax) / 2.0 + bbox_jitters[..., 1:2] * height new_width = width * tf.exp(bbox_jitters[..., 2:3]) new_height = height * tf.exp(bbox_jitters[..., 3:4]) jittered_boxes = tf.concat([ new_center_y - new_height * 0.5, new_center_x - new_width * 0.5, new_center_y + new_height * 0.5, new_center_x + new_width * 0.5], axis=-1) return jittered_boxes def normalize_boxes(boxes, image_shape): """Converts boxes to the normalized coordinates. Args: boxes: a tensor whose last dimension is 4 representing the coordinates of boxes in ymin, xmin, ymax, xmax order. image_shape: a list of two integers, a two-element vector or a tensor such that all but the last dimensions are `broadcastable` to `boxes`. The last dimension is 2, which represents [height, width]. Returns: normalized_boxes: a tensor whose shape is the same as `boxes` representing the normalized boxes. Raises: ValueError: If the last dimension of boxes is not 4. """ if boxes.shape[-1] != 4: raise ValueError( 'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1])) with tf.name_scope('normalize_boxes'): if isinstance(image_shape, list) or isinstance(image_shape, tuple): height, width = image_shape else: image_shape = tf.cast(image_shape, dtype=boxes.dtype) height = image_shape[..., 0:1] width = image_shape[..., 1:2] ymin = boxes[..., 0:1] / height xmin = boxes[..., 1:2] / width ymax = boxes[..., 2:3] / height xmax = boxes[..., 3:4] / width normalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1) return normalized_boxes def denormalize_boxes(boxes, image_shape): """Converts boxes normalized by [height, width] to pixel coordinates. Args: boxes: a tensor whose last dimension is 4 representing the coordinates of boxes in ymin, xmin, ymax, xmax order. image_shape: a list of two integers, a two-element vector or a tensor such that all but the last dimensions are `broadcastable` to `boxes`. The last dimension is 2, which represents [height, width]. Returns: denormalized_boxes: a tensor whose shape is the same as `boxes` representing the denormalized boxes. Raises: ValueError: If the last dimension of boxes is not 4. """ with tf.name_scope('denormalize_boxes'): if isinstance(image_shape, list) or isinstance(image_shape, tuple): height, width = image_shape else: image_shape = tf.cast(image_shape, dtype=boxes.dtype) height, width = tf.split(image_shape, 2, axis=-1) ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=-1) ymin = ymin * height xmin = xmin * width ymax = ymax * height xmax = xmax * width denormalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1) return denormalized_boxes def clip_boxes(boxes, image_shape): """Clips boxes to image boundaries. Args: boxes: a tensor whose last dimension is 4 representing the coordinates of boxes in ymin, xmin, ymax, xmax order. image_shape: a list of two integers, a two-element vector or a tensor such that all but the last dimensions are `broadcastable` to `boxes`. The last dimension is 2, which represents [height, width]. Returns: clipped_boxes: a tensor whose shape is the same as `boxes` representing the clipped boxes. Raises: ValueError: If the last dimension of boxes is not 4. """ if boxes.shape[-1] != 4: raise ValueError( 'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1])) with tf.name_scope('clip_boxes'): if isinstance(image_shape, list) or isinstance(image_shape, tuple): height, width = image_shape else: image_shape = tf.cast(image_shape, dtype=boxes.dtype) height = image_shape[..., 0:1] width = image_shape[..., 1:2] ymin = boxes[..., 0:1] xmin = boxes[..., 1:2] ymax = boxes[..., 2:3] xmax = boxes[..., 3:4] clipped_ymin = tf.maximum(tf.minimum(ymin, height - 1.0), 0.0) clipped_ymax = tf.maximum(tf.minimum(ymax, height - 1.0), 0.0) clipped_xmin = tf.maximum(tf.minimum(xmin, width - 1.0), 0.0) clipped_xmax = tf.maximum(tf.minimum(xmax, width - 1.0), 0.0) clipped_boxes = tf.concat( [clipped_ymin, clipped_xmin, clipped_ymax, clipped_xmax], axis=-1) return clipped_boxes def compute_outer_boxes(boxes, image_shape, scale=1.0): """Compute outer box encloses an object with a margin. Args: boxes: a tensor whose last dimension is 4 representing the coordinates of boxes in ymin, xmin, ymax, xmax order. image_shape: a list of two integers, a two-element vector or a tensor such that all but the last dimensions are `broadcastable` to `boxes`. The last dimension is 2, which represents [height, width]. scale: a float number specifying the scale of output outer boxes to input `boxes`. Returns: outer_boxes: a tensor whose shape is the same as `boxes` representing the outer boxes. """ if scale < 1.0: raise ValueError( 'scale is {}, but outer box scale must be greater than 1.0.'.format( scale)) centers_y = (boxes[..., 0] + boxes[..., 2]) / 2.0 centers_x = (boxes[..., 1] + boxes[..., 3]) / 2.0 box_height = (boxes[..., 2] - boxes[..., 0]) * scale box_width = (boxes[..., 3] - boxes[..., 1]) * scale outer_boxes = tf.stack([centers_y - box_height / 2.0, centers_x - box_width / 2.0, centers_y + box_height / 2.0, centers_x + box_width / 2.0], axis=1) outer_boxes = clip_boxes(outer_boxes, image_shape) return outer_boxes def encode_boxes(boxes, anchors, weights=None): """Encode boxes to targets. Args: boxes: a tensor whose last dimension is 4 representing the coordinates of boxes in ymin, xmin, ymax, xmax order. anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`, representing the coordinates of anchors in ymin, xmin, ymax, xmax order. weights: None or a list of four float numbers used to scale coordinates. Returns: encoded_boxes: a tensor whose shape is the same as `boxes` representing the encoded box targets. Raises: ValueError: If the last dimension of boxes is not 4. """ if boxes.shape[-1] != 4: raise ValueError( 'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1])) with tf.name_scope('encode_boxes'): boxes = tf.cast(boxes, dtype=anchors.dtype) y_min, x_min, y_max, x_max = tf.split(boxes, 4, axis=-1) # y_min = boxes[..., 0:1] # x_min = boxes[..., 1:2] # y_max = boxes[..., 2:3] # x_max = boxes[..., 3:4] box_h = y_max - y_min + 1.0 box_w = x_max - x_min + 1.0 box_yc = y_min + 0.5 * box_h box_xc = x_min + 0.5 * box_w anchor_ymin, anchor_xmin, anchor_ymax, anchor_xmax = tf.split(anchors, 4, axis=-1) # anchor_ymin = anchors[..., 0:1] # anchor_xmin = anchors[..., 1:2] # anchor_ymax = anchors[..., 2:3] # anchor_xmax = anchors[..., 3:4] anchor_h = anchor_ymax - anchor_ymin + 1.0 anchor_w = anchor_xmax - anchor_xmin + 1.0 anchor_yc = anchor_ymin + 0.5 * anchor_h anchor_xc = anchor_xmin + 0.5 * anchor_w encoded_dy = (box_yc - anchor_yc) / anchor_h encoded_dx = (box_xc - anchor_xc) / anchor_w encoded_dh = tf.math.log(box_h / anchor_h) encoded_dw = tf.math.log(box_w / anchor_w) if weights: encoded_dy *= weights[0] encoded_dx *= weights[1] encoded_dh *= weights[2] encoded_dw *= weights[3] encoded_boxes = tf.concat([encoded_dy, encoded_dx, encoded_dh, encoded_dw], axis=-1) return encoded_boxes def decode_boxes(encoded_boxes, anchors, weights=None): """Decode boxes. Args: encoded_boxes: a tensor whose last dimension is 4 representing the coordinates of encoded boxes in ymin, xmin, ymax, xmax order. anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`, representing the coordinates of anchors in ymin, xmin, ymax, xmax order. weights: None or a list of four float numbers used to scale coordinates. Returns: encoded_boxes: a tensor whose shape is the same as `boxes` representing the decoded box targets. """ if encoded_boxes.shape[-1] != 4: raise ValueError( 'encoded_boxes.shape[-1] is {:d}, but must be 4.' .format(encoded_boxes.shape[-1])) with tf.name_scope('decode_boxes'): encoded_boxes = tf.cast(encoded_boxes, dtype=anchors.dtype) dy = encoded_boxes[..., 0:1] dx = encoded_boxes[..., 1:2] dh = encoded_boxes[..., 2:3] dw = encoded_boxes[..., 3:4] if weights: dy /= weights[0] dx /= weights[1] dh /= weights[2] dw /= weights[3] dh = tf.minimum(dh, BBOX_XFORM_CLIP) dw = tf.minimum(dw, BBOX_XFORM_CLIP) anchor_ymin = anchors[..., 0:1] anchor_xmin = anchors[..., 1:2] anchor_ymax = anchors[..., 2:3] anchor_xmax = anchors[..., 3:4] anchor_h = anchor_ymax - anchor_ymin + 1.0 anchor_w = anchor_xmax - anchor_xmin + 1.0 anchor_yc = anchor_ymin + 0.5 * anchor_h anchor_xc = anchor_xmin + 0.5 * anchor_w decoded_boxes_yc = dy * anchor_h + anchor_yc decoded_boxes_xc = dx * anchor_w + anchor_xc decoded_boxes_h = tf.exp(dh) * anchor_h decoded_boxes_w = tf.exp(dw) * anchor_w decoded_boxes_ymin = decoded_boxes_yc - 0.5 * decoded_boxes_h decoded_boxes_xmin = decoded_boxes_xc - 0.5 * decoded_boxes_w decoded_boxes_ymax = decoded_boxes_ymin + decoded_boxes_h - 1.0 decoded_boxes_xmax = decoded_boxes_xmin + decoded_boxes_w - 1.0 decoded_boxes = tf.concat( [decoded_boxes_ymin, decoded_boxes_xmin, decoded_boxes_ymax, decoded_boxes_xmax], axis=-1) return decoded_boxes def filter_boxes(boxes, scores, image_shape, min_size_threshold): """Filter and remove boxes that are too small or fall outside the image. Args: boxes: a tensor whose last dimension is 4 representing the coordinates of boxes in ymin, xmin, ymax, xmax order. scores: a tensor whose shape is the same as tf.shape(boxes)[:-1] representing the original scores of the boxes. image_shape: a tensor whose shape is the same as, or `broadcastable` to `boxes` except the last dimension, which is 2, representing [height, width] of the scaled image. min_size_threshold: a float representing the minimal box size in each side (w.r.t. the scaled image). Boxes whose sides are smaller than it will be filtered out. Returns: filtered_boxes: a tensor whose shape is the same as `boxes` but with the position of the filtered boxes are filled with 0. filtered_scores: a tensor whose shape is the same as 'scores' but with the positinon of the filtered boxes filled with 0. """ if boxes.shape[-1] != 4: raise ValueError( 'boxes.shape[1] is {:d}, but must be 4.'.format(boxes.shape[-1])) with tf.name_scope('filter_boxes'): if isinstance(image_shape, list) or isinstance(image_shape, tuple): height, width = image_shape else: image_shape = tf.cast(image_shape, dtype=boxes.dtype) height = image_shape[..., 0] width = image_shape[..., 1] ymin = boxes[..., 0] xmin = boxes[..., 1] ymax = boxes[..., 2] xmax = boxes[..., 3] h = ymax - ymin + 1.0 w = xmax - xmin + 1.0 yc = ymin + 0.5 * h xc = xmin + 0.5 * w min_size = tf.cast(tf.maximum(min_size_threshold, 1.0), dtype=boxes.dtype) filtered_size_mask = tf.logical_and( tf.greater(h, min_size), tf.greater(w, min_size)) filtered_center_mask = tf.logical_and( tf.logical_and(tf.greater(yc, 0.0), tf.less(yc, height)), tf.logical_and(tf.greater(xc, 0.0), tf.less(xc, width))) filtered_mask = tf.logical_and(filtered_size_mask, filtered_center_mask) filtered_scores = tf.where(filtered_mask, scores, tf.zeros_like(scores)) filtered_boxes = tf.cast( tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes return filtered_boxes, filtered_scores # TODO: can this be merged into above? def filter_boxes_2(boxes, scores, min_size, height, width, scale): """Filter out boxes that are too small. Args: boxes: a tensor whose last dimension is 4 representing the coordinates of boxes in ymin, xmin, ymax, xmax order. scores: a tensor such as all but the last dimensions are the same as `boxes`. The last dimension is 1. It represents the scores. min_size: an integer specifying the minimal size. height: an integer, a scalar or a tensor such as all but the last dimensions are the same as `boxes`. The last dimension is 1. It represents the height of the image. width: an integer, a scalar or a tensor such as all but the last dimensions are the same as `boxes`. The last dimension is 1. It represents the width of the image. scale: an integer, a scalar or a tensor such as all but the last dimensions are the same as `boxes`. The last dimension is 1. It represents the scale of the image. Returns: filtered_boxes: a tensor whose shape is the same as `boxes` representing the filtered boxes. filtered_scores: a tensor whose shape is the same as `scores` representing the filtered scores. """ with tf.name_scope('filter_box'): y_min, x_min, y_max, x_max = tf.split(boxes, 4, axis=-1) # y_min = boxes[..., 0:1] # x_min = boxes[..., 1:2] # y_max = boxes[..., 2:3] # x_max = boxes[..., 3:4] h = y_max - y_min + 1.0 w = x_max - x_min + 1.0 yc = y_min + h / 2.0 xc = x_min + w / 2.0 height = tf.cast(height, dtype=boxes.dtype) width = tf.cast(width, dtype=boxes.dtype) scale = tf.cast(scale, dtype=boxes.dtype) min_size = tf.cast(tf.maximum(min_size, 1), dtype=boxes.dtype) size_mask = tf.logical_and( tf.greater_equal(h, min_size * scale), tf.greater_equal(w, min_size * scale) ) center_mask = tf.logical_and(tf.less(yc, height), tf.less(xc, width)) selected_mask = tf.logical_and(size_mask, center_mask) filtered_scores = tf.where(selected_mask, scores, tf.zeros_like(scores)) filtered_boxes = tf.cast(selected_mask, dtype=boxes.dtype) * boxes return filtered_boxes, filtered_scores def filter_boxes_by_scores(boxes, scores, min_score_threshold): """Filter and remove boxes whose scores are smaller than the threshold. Args: boxes: a tensor whose last dimension is 4 representing the coordinates of boxes in ymin, xmin, ymax, xmax order. scores: a tensor whose shape is the same as tf.shape(boxes)[:-1] representing the original scores of the boxes. min_score_threshold: a float representing the minimal box score threshold. Boxes whose score are smaller than it will be filtered out. Returns: filtered_boxes: a tensor whose shape is the same as `boxes` but with the position of the filtered boxes are filled with 0. filtered_scores: a tensor whose shape is the same as 'scores' but with the """ if boxes.shape[-1] != 4: raise ValueError( 'boxes.shape[1] is {:d}, but must be 4.'.format(boxes.shape[-1])) with tf.name_scope('filter_boxes_by_scores'): filtered_mask = tf.greater(scores, min_score_threshold) filtered_scores = tf.where(filtered_mask, scores, tf.zeros_like(scores)) filtered_boxes = tf.cast( tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes return filtered_boxes, filtered_scores # TODO: Do we need both of these? def top_k_boxes(boxes, scores, k): """Sort and select top k boxes according to the scores. Args: boxes: a tensor of shape [batch_size, N, 4] representing the coordiante of the boxes. N is the number of boxes per image. scores: a tensor of shsape [batch_size, N] representing the socre of the boxes. k: an integer or a tensor indicating the top k number. Returns: selected_boxes: a tensor of shape [batch_size, k, 4] representing the selected top k box coordinates. selected_scores: a tensor of shape [batch_size, k] representing the selected top k box scores. """ with tf.name_scope('top_k_boxes'): selected_scores, top_k_indices = tf.nn.top_k(scores, k=k, sorted=True) batch_size, _ = scores.get_shape().as_list() if batch_size == 1: selected_boxes = tf.squeeze( tf.gather(boxes, top_k_indices, axis=1), axis=1) else: top_k_indices_shape = tf.shape(top_k_indices) batch_indices = ( tf.expand_dims(tf.range(top_k_indices_shape[0]), axis=-1) * tf.ones([1, top_k_indices_shape[-1]], dtype=tf.int32)) gather_nd_indices = tf.stack([batch_indices, top_k_indices], axis=-1) selected_boxes = tf.gather_nd(boxes, gather_nd_indices) return selected_boxes, selected_scores def top_k(scores, k, boxes_list): """A wrapper that returns top-k scores and correponding boxes. This functions selects the top-k scores and boxes as follows. indices = argsort(scores)[:k] scores = scores[indices] outputs = [] for boxes in boxes_list: outputs.append(boxes[indices, :]) return scores, outputs Args: scores: a tensor with a shape of [batch_size, N]. N is the number of scores. k: an integer for selecting the top-k elements. boxes_list: a list containing at least one element. Each element has a shape of [batch_size, N, 4]. Returns: scores: the selected top-k scores with a shape of [batch_size, k]. outputs: the list containing the corresponding boxes in the order of the input `boxes_list`. """ assert isinstance(boxes_list, list) assert boxes_list # not empty list batch_size, _ = scores.get_shape().as_list() scores, top_k_indices = tf.nn.top_k(scores, k=k) outputs = [] for boxes in boxes_list: if batch_size == 1: boxes = tf.squeeze(tf.gather(boxes, top_k_indices, axis=1), axis=1) else: boxes_index_offsets = tf.range(batch_size) * tf.shape(input=boxes)[1] boxes_indices = tf.reshape( top_k_indices + tf.expand_dims(boxes_index_offsets, 1), [-1]) boxes = tf.reshape( tf.gather(tf.reshape(boxes, [-1, 4]), boxes_indices), [batch_size, -1, 4]) outputs.append(boxes) return scores, outputs def bbox_overlap(boxes, gt_boxes): """Calculates the overlap between proposal and ground truth boxes. Some `gt_boxes` may have been padded. The returned `iou` tensor for these boxes will be -1. Args: boxes: a tensor with a shape of [batch_size, N, 4]. N is the number of proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form. gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4]. This tensor might have paddings with a negative value. Returns: iou: a tensor with as a shape of [batch_size, N, MAX_NUM_INSTANCES]. """ with tf.name_scope('bbox_overlap'): bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split( value=boxes, num_or_size_splits=4, axis=2) gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split( value=gt_boxes, num_or_size_splits=4, axis=2) # Calculates the intersection area. i_xmin = tf.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1])) i_xmax = tf.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1])) i_ymin = tf.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1])) i_ymax = tf.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1])) i_area = tf.maximum((i_xmax - i_xmin), 0) * tf.maximum((i_ymax - i_ymin), 0) # Calculates the union area. bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min) gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min) # Adds a small epsilon to avoid divide-by-zero. u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8 # Calculates IoU. iou = i_area / u_area # Fills -1 for padded ground truth boxes. padding_mask = tf.less(i_xmin, tf.zeros_like(i_xmin)) iou = tf.where(padding_mask, -tf.ones_like(iou), iou) return iou def to_normalized_coordinates(boxes, height, width): """Converted absolute box coordinates to normalized ones. Args: boxes: a tensor whose last dimension is 4 representing the coordinates of boxes in ymin, xmin, ymax, xmax order. height: an integer, a scalar or a tensor such as all but the last dimensions are the same as `boxes`. The last dimension is 1. It represents the height of the image. width: an integer, a scalar or a tensor such as all but the last dimensions are the same as `boxes`. The last dimension is 1. It represents the width of the image. Returns: normalized_boxes: a tensor whose shape is the same as `boxes` representing the boxes in normalized coordinates. """ with tf.name_scope('normalize_box'): height = tf.cast(height, dtype=boxes.dtype) width = tf.cast(width, dtype=boxes.dtype) y_min, x_min, y_max, x_max = tf.split(boxes, 4, axis=-1) y_min = y_min / height x_min = x_min / width y_max = y_max / height x_max = x_max / width # y_min = boxes[..., 0:1] / height # x_min = boxes[..., 1:2] / width # y_max = boxes[..., 2:3] / height # x_max = boxes[..., 3:4] / width normalized_boxes = tf.concat([y_min, x_min, y_max, x_max], axis=-1) return normalized_boxes BBOX_XFORM_CLIP = np.log(1000. / 16.) NMS_TILE_SIZE = 512 def _self_suppression(iou, _, iou_sum): batch_size = tf.shape(input=iou)[0] can_suppress_others = tf.cast( tf.reshape(tf.reduce_max(input_tensor=iou, axis=1) <= 0.5, [batch_size, -1, 1]), iou.dtype) iou_suppressed = tf.reshape( tf.cast(tf.reduce_max(input_tensor=can_suppress_others * iou, axis=1) <= 0.5, iou.dtype), [batch_size, -1, 1]) * iou iou_sum_new = tf.reduce_sum(input_tensor=iou_suppressed, axis=[1, 2]) return [ iou_suppressed, tf.reduce_any(input_tensor=iou_sum - iou_sum_new > 0.5), iou_sum_new ] def _cross_suppression(boxes, box_slice, iou_threshold, inner_idx): batch_size = tf.shape(input=boxes)[0] new_slice = tf.slice(boxes, [0, inner_idx * NMS_TILE_SIZE, 0], [batch_size, NMS_TILE_SIZE, 4]) iou = bbox_overlap(new_slice, box_slice) ret_slice = tf.expand_dims( tf.cast(tf.reduce_all(input_tensor=iou < iou_threshold, axis=[1]), box_slice.dtype), 2) * box_slice return boxes, ret_slice, iou_threshold, inner_idx + 1 def _suppression_loop_body(boxes, iou_threshold, output_size, idx): """Process boxes in the range [idx*NMS_TILE_SIZE, (idx+1)*NMS_TILE_SIZE). Args: boxes: a tensor with a shape of [batch_size, anchors, 4]. iou_threshold: a float representing the threshold for deciding whether boxes overlap too much with respect to IOU. output_size: an int32 tensor of size [batch_size]. Representing the number of selected boxes for each batch. idx: an integer scalar representing induction variable. Returns: boxes: updated boxes. iou_threshold: pass down iou_threshold to the next iteration. output_size: the updated output_size. idx: the updated induction variable. """ num_tiles = tf.shape(input=boxes)[1] // NMS_TILE_SIZE batch_size = tf.shape(input=boxes)[0] # Iterates over tiles that can possibly suppress the current tile. box_slice = tf.slice(boxes, [0, idx * NMS_TILE_SIZE, 0], [batch_size, NMS_TILE_SIZE, 4]) _, box_slice, _, _ = tf.while_loop( cond=lambda _boxes, _box_slice, _threshold, inner_idx: inner_idx < idx, body=_cross_suppression, loop_vars=[boxes, box_slice, iou_threshold, tf.constant(0)]) # Iterates over the current tile to compute self-suppression. iou = bbox_overlap(box_slice, box_slice) mask = tf.expand_dims( tf.reshape(tf.range(NMS_TILE_SIZE), [1, -1]) > tf.reshape( tf.range(NMS_TILE_SIZE), [-1, 1]), 0) iou *= tf.cast(tf.logical_and(mask, iou >= iou_threshold), iou.dtype) suppressed_iou, _, _ = tf.while_loop( cond=lambda _iou, loop_condition, _iou_sum: loop_condition, body=_self_suppression, loop_vars=[iou, tf.constant(True), tf.reduce_sum(input_tensor=iou, axis=[1, 2])]) suppressed_box = tf.reduce_sum(input_tensor=suppressed_iou, axis=1) > 0 box_slice *= tf.expand_dims(1.0 - tf.cast(suppressed_box, box_slice.dtype), 2) # Uses box_slice to update the input boxes. mask = tf.reshape( tf.cast(tf.equal(tf.range(num_tiles), idx), boxes.dtype), [1, -1, 1, 1]) boxes = tf.tile(tf.expand_dims( box_slice, [1]), [1, num_tiles, 1, 1]) * mask + tf.reshape( boxes, [batch_size, num_tiles, NMS_TILE_SIZE, 4]) * (1 - mask) boxes = tf.reshape(boxes, [batch_size, -1, 4]) # Updates output_size. output_size += tf.reduce_sum( input_tensor=tf.cast(tf.reduce_any(input_tensor=box_slice > 0, axis=[2]), tf.int32), axis=[1]) return boxes, iou_threshold, output_size, idx + 1 def to_absolute_coordinates(boxes, height, width): """Converted normalized box coordinates to absolute ones. Args: boxes: a tensor whose last dimension is 4 representing the coordinates of boxes in ymin, xmin, ymax, xmax order. height: an integer, a scalar or a tensor such as all but the last dimensions are the same as `boxes`. The last dimension is 1. It represents the height of the image. width: an integer, a scalar or a tensor such as all but the last dimensions are the same as `boxes`. The last dimension is 1. It represents the width of the image. Returns: absolute_boxes: a tensor whose shape is the same as `boxes` representing the boxes in absolute coordinates. """ with tf.name_scope('denormalize_box'): height = tf.cast(height, dtype=boxes.dtype) width = tf.cast(width, dtype=boxes.dtype) y_min, x_min, y_max, x_max = tf.split(boxes, 4, axis=-1) y_min = y_min * height x_min = x_min * width y_max = y_max * height x_max = x_max * width # y_min = boxes[..., 0:1] * height # x_min = boxes[..., 1:2] * width # y_max = boxes[..., 2:3] * height # x_max = boxes[..., 3:4] * width absolute_boxes = tf.concat([y_min, x_min, y_max, x_max], axis=-1) return absolute_boxes def sorted_non_max_suppression_padded(scores, boxes, max_output_size, iou_threshold): """A wrapper that handles non-maximum suppression. Assumption: * The boxes are sorted by scores unless the box is a dot (all coordinates are zero). * Boxes with higher scores can be used to suppress boxes with lower scores. The overal design of the algorithm is to handle boxes tile-by-tile: boxes = boxes.pad_to_multiply_of(tile_size) num_tiles = len(boxes) // tile_size output_boxes = [] for i in range(num_tiles): box_tile = boxes[i*tile_size : (i+1)*tile_size] for j in range(i - 1): suppressing_tile = boxes[j*tile_size : (j+1)*tile_size] iou = bbox_overlap(box_tile, suppressing_tile) # if the box is suppressed in iou, clear it to a dot box_tile *= _update_boxes(iou) # Iteratively handle the diagnal tile. iou = _box_overlap(box_tile, box_tile) iou_changed = True while iou_changed: # boxes that are not suppressed by anything else suppressing_boxes = _get_suppressing_boxes(iou) # boxes that are suppressed by suppressing_boxes suppressed_boxes = _get_suppressed_boxes(iou, suppressing_boxes) # clear iou to 0 for boxes that are suppressed, as they cannot be used # to suppress other boxes any more new_iou = _clear_iou(iou, suppressed_boxes) iou_changed = (new_iou != iou) iou = new_iou # remaining boxes that can still suppress others, are selected boxes. output_boxes.append(_get_suppressing_boxes(iou)) if len(output_boxes) >= max_output_size: break Args: scores: a tensor with a shape of [batch_size, anchors]. boxes: a tensor with a shape of [batch_size, anchors, 4]. max_output_size: a scalar integer `Tensor` representing the maximum number of boxes to be selected by non max suppression. iou_threshold: a float representing the threshold for deciding whether boxes overlap too much with respect to IOU. Returns: nms_scores: a tensor with a shape of [batch_size, anchors]. It has same dtype as input scores. nms_proposals: a tensor with a shape of [batch_size, anchors, 4]. It has same dtype as input boxes. """ batch_size = tf.shape(input=boxes)[0] num_boxes = tf.shape(input=boxes)[1] pad = tf.cast( tf.math.ceil(tf.cast(num_boxes, tf.float32) / NMS_TILE_SIZE), tf.int32) * NMS_TILE_SIZE - num_boxes boxes = tf.pad(tensor=tf.cast(boxes, tf.float32), paddings=[[0, 0], [0, pad], [0, 0]]) scores = tf.pad(tensor=tf.cast(scores, tf.float32), paddings=[[0, 0], [0, pad]]) num_boxes += pad def _loop_cond(unused_boxes, unused_threshold, output_size, idx): return tf.logical_and( tf.reduce_min(input_tensor=output_size) < max_output_size, idx < num_boxes // NMS_TILE_SIZE) selected_boxes, _, output_size, _ = tf.while_loop( cond=_loop_cond, body=_suppression_loop_body, loop_vars=[ boxes, iou_threshold, tf.zeros([batch_size], tf.int32), tf.constant(0) ]) idx = num_boxes - tf.cast( tf.nn.top_k( tf.cast(tf.reduce_any(input_tensor=selected_boxes > 0, axis=[2]), tf.int32) * tf.expand_dims(tf.range(num_boxes, 0, -1), 0), max_output_size)[0], tf.int32) idx = tf.minimum(idx, num_boxes - 1) idx = tf.reshape( idx + tf.reshape(tf.range(batch_size) * num_boxes, [-1, 1]), [-1]) boxes = tf.reshape( tf.gather(tf.reshape(boxes, [-1, 4]), idx), [batch_size, max_output_size, 4]) boxes = boxes * tf.cast( tf.reshape(tf.range(max_output_size), [1, -1, 1]) < tf.reshape( output_size, [-1, 1, 1]), boxes.dtype) scores = tf.reshape( tf.gather(tf.reshape(scores, [-1, 1]), idx), [batch_size, max_output_size]) scores = scores * tf.cast( tf.reshape(tf.range(max_output_size), [1, -1]) < tf.reshape( output_size, [-1, 1]), scores.dtype) return scores, boxes