#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Ops used to post-process raw detections."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from sagemakercv.core import box_utils


def generate_detections_per_image_tpu(cls_outputs,
                                      box_outputs,
                                      anchor_boxes,
                                      image_info,
                                      pre_nms_num_detections=1000,
                                      post_nms_num_detections=100,
                                      nms_threshold=0.3,
                                      bbox_reg_weights=(10., 10., 5., 5.)):
    """Generate the final detections per image given the model outputs.

    The highest-scoring (anchor, class) pairs are selected, their boxes are
    decoded and clipped, NMS is run once per foreground class, and the
    per-class survivors are merged and sorted by score.

    Args:
      cls_outputs: a tensor with shape [N, num_classes], which stacks class
        outputs on all feature levels. The N is the number of total anchors
        on all levels. The num_classes is the number of classes predicted by
        the model; column 0 is treated as background and dropped. Note that
        the cls_outputs should be the output of softmax(). Both dimensions
        must be statically known.
      box_outputs: a tensor with shape [N, num_classes*4], which stacks box
        regression outputs on all feature levels. The N is the number of total
        anchors on all levels.
      anchor_boxes: a tensor with shape [N, 4], which stacks anchors on all
        feature levels. The N is the number of total anchors on all levels.
      image_info: a tensor of shape [5] which encodes the input image's
        [height, width, scale, original_height, original_width]. Only the
        first two entries are read here, as the clipping window.
      pre_nms_num_detections: an integer that specifies the number of
        candidates before NMS. It is used as `k` for `tf.nn.top_k`, so it
        must not exceed the N * (num_classes - 1) available scores.
      post_nms_num_detections: an integer that specifies the number of
        candidates after NMS.
      nms_threshold: a float number to specify the IOU threshold of NMS.
      bbox_reg_weights: a list of 4 float scalars, which are default weights
        on (dx, dy, dw, dh) for normalizing bbox regression targets.

    Returns:
      A tuple `(num_valid_boxes, boxes, classes, scores)`:
        num_valid_boxes: float32 count of returned entries whose score is
          greater than zero.
        boxes: box coordinates of the `post_nms_num_detections` selected
          entries.
        classes: float32 class ids of those entries; foreground classes are
          numbered from 1 because background occupies id 0.
        scores: float32 scores sorted in descending order; padding entries
          have score 0.
    """
    num_boxes, num_classes = cls_outputs.get_shape().as_list()
    num_foreground_classes = num_classes - 1

    # Remove background class scores.
    cls_outputs = cls_outputs[:, 1:num_classes]
    top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
        tf.reshape(cls_outputs, [-1]),
        k=pre_nms_num_detections,
        sorted=False
    )

    # The scores were flattened row-major, so a flat index equals
    # anchor_index * num_foreground_classes + class_index.
    classes = tf.math.mod(top_k_indices_with_classes, num_foreground_classes)
    top_k_indices = tf.math.floordiv(top_k_indices_with_classes, num_foreground_classes)

    anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
    box_outputs = tf.reshape(box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]

    # Pick, for every candidate, the regression output of its own class.
    box_outputs = tf.gather_nd(box_outputs, tf.stack([top_k_indices, classes], axis=1))

    # apply bounding box regression to anchors
    boxes = box_utils.decode_boxes(box_outputs, anchor_boxes, bbox_reg_weights)
    boxes = box_utils.clip_boxes(boxes, (image_info[0], image_info[1]))

    list_of_all_boxes = []
    list_of_all_scores = []
    list_of_all_classes = []

    # Background was already removed, so `classes` only holds values in
    # [0, num_foreground_classes); iterating further would build an NMS op
    # for a class id that no candidate can have.
    for class_i in range(num_foreground_classes):
        # Compute bitmask for the given classes.
        class_i_bitmask = tf.cast(tf.equal(classes, class_i), top_k_scores.dtype)
        # This works because score is in [0, 1]: candidates of other classes
        # get score 0 and are rejected by the NMS score threshold.
        class_i_scores = top_k_scores * class_i_bitmask

        # The TPU and CPU have different behaviors for
        # tf.image.non_max_suppression_padded (b/116754376).
        class_i_post_nms_indices, class_i_nms_num_valid = tf.image.non_max_suppression_padded(
            tf.cast(boxes, dtype=tf.float32),
            tf.cast(class_i_scores, dtype=tf.float32),
            post_nms_num_detections,
            iou_threshold=nms_threshold,
            score_threshold=0.05,
            pad_to_max_output_size=True,
            name='nms_detections_' + str(class_i)
        )

        class_i_post_nms_boxes = tf.gather(boxes, class_i_post_nms_indices)
        class_i_post_nms_scores = tf.gather(class_i_scores, class_i_post_nms_indices)

        # Only the first `class_i_nms_num_valid` outputs are real selections;
        # zero the scores of the padded tail so it sorts last.
        mask = tf.less(tf.range(post_nms_num_detections), [class_i_nms_num_valid])

        class_i_post_nms_scores = tf.where(
            mask, class_i_post_nms_scores, tf.zeros_like(class_i_post_nms_scores)
        )

        # +1 restores the original class numbering (background is id 0).
        class_i_classes = tf.fill(tf.shape(input=class_i_post_nms_scores), class_i + 1)
        list_of_all_boxes.append(class_i_post_nms_boxes)
        list_of_all_scores.append(class_i_post_nms_scores)
        list_of_all_classes.append(class_i_classes)

    post_nms_boxes = tf.concat(list_of_all_boxes, axis=0)
    post_nms_scores = tf.concat(list_of_all_scores, axis=0)
    post_nms_classes = tf.concat(list_of_all_classes, axis=0)

    # sort all results.
    post_nms_scores, sorted_indices = tf.nn.top_k(
        tf.cast(post_nms_scores, dtype=tf.float32),
        k=post_nms_num_detections,
        sorted=True
    )

    post_nms_boxes = tf.gather(post_nms_boxes, sorted_indices)
    post_nms_classes = tf.gather(post_nms_classes, sorted_indices)

    # Padding entries carry score 0, so counting positive scores counts the
    # real detections.
    valid_mask = tf.cast(tf.greater(post_nms_scores, 0), post_nms_scores.dtype)

    num_valid_boxes = tf.reduce_sum(input_tensor=valid_mask, axis=-1)
    box_classes = tf.cast(post_nms_classes, dtype=tf.float32)

    return num_valid_boxes, post_nms_boxes, box_classes, post_nms_scores


def generate_detections_tpu(class_outputs,
                            box_outputs,
                            anchor_boxes,
                            image_info,
                            pre_nms_num_detections=1000,
                            post_nms_num_detections=100,
                            nms_threshold=0.3,
                            bbox_reg_weights=(10., 10., 5., 5.)):
    """Generate the final detections given the model outputs (TPU version).

    Softmax is applied to the raw class scores, then every image of the batch
    is post-processed on its own by `generate_detections_per_image_tpu` and
    the per-image results are stacked along a new leading batch axis.

    Args:
      class_outputs: a tensor with shape [batch_size, N, num_classes], which
        stacks class logit outputs on all feature levels. The N is the number
        of total anchors on all levels. The num_classes is the number of
        classes predicted by the model. Note that the class_outputs here is
        the raw score. The batch size must be statically known because the
        batch is unrolled in Python.
      box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
        stacks box regression outputs on all feature levels. The N is the
        number of total anchors on all levels.
      anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks
        anchors on all feature levels. The N is the number of total anchors on
        all levels.
      image_info: a tensor of shape [batch_size, 5] which encodes each image's
        [height, width, scale, original_height, original_width].
      pre_nms_num_detections: an integer that specifies the number of
        candidates before NMS.
      post_nms_num_detections: an integer that specifies the number of
        candidates after NMS.
      nms_threshold: a float number to specify the IOU threshold of NMS.
      bbox_reg_weights: a list of 4 float scalars, which are default weights
        on (dx, dy, dw, dh) for normalizing bbox regression targets.

    Returns:
      a tuple of tensors corresponding to number of valid boxes,
      box coordinates, object categories for each boxes, and box scores
      stacked in batch_size.
    """
    with tf.name_scope('generate_detections'):
        batch_size, _, _ = class_outputs.get_shape().as_list()
        class_probabilities = tf.nn.softmax(class_outputs)

        # One (num_valid, boxes, classes, scores) tuple per image.
        per_image = [
            generate_detections_per_image_tpu(
                class_probabilities[idx],
                box_outputs[idx],
                anchor_boxes[idx],
                image_info[idx],
                pre_nms_num_detections,
                post_nms_num_detections,
                nms_threshold,
                bbox_reg_weights,
            )
            for idx in range(batch_size)
        ]

        # Regroup by output kind and add the batch axis back.
        num_valid_boxes = tf.stack([detection[0] for detection in per_image])
        box_coordinates = tf.stack([detection[1] for detection in per_image])
        box_classes = tf.stack([detection[2] for detection in per_image])
        box_scores = tf.stack([detection[3] for detection in per_image])

    return num_valid_boxes, box_coordinates, box_classes, box_scores


def generate_detections_gpu(class_outputs,
                            box_outputs,
                            anchor_boxes,
                            image_info,
                            pre_nms_num_detections=1000,
                            post_nms_num_detections=100,
                            nms_threshold=0.3,
                            class_agnostic_box=False,
                            bbox_reg_weights=(10., 10., 5., 5.)
                            ):
    """Generate the final detections given the model outputs (GPU version).

    The whole batch is processed at once: boxes are decoded for every
    (anchor, foreground class) pair, clipped to the image, normalized and
    passed to `tf.image.combined_non_max_suppression`.

    Args:
      class_outputs: a tensor with shape [batch_size, N, num_classes], which
        stacks class logit outputs on all feature levels. The N is the number
        of total anchors on all levels. The num_classes is the number of
        classes predicted by the model. Note that the class_outputs here is
        the raw score; softmax is applied inside this function. All three
        dimensions must be statically known.
      box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
        stacks box regression outputs on all feature levels. The N is the
        number of total anchors on all levels. When `class_agnostic_box` is
        True the expected shape is [batch_size, N, 2*4] instead.
      anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks
        anchors on all feature levels. The N is the number of total anchors on
        all levels.
      image_info: a tensor of shape [batch_size, 5] which encodes each image's
        [height, width, scale, original_height, original_width].
      pre_nms_num_detections: an integer passed to the combined NMS as the
        maximum number of boxes kept per class.
      post_nms_num_detections: an integer that specifies the number of
        candidates after NMS (over all classes).
      nms_threshold: a float number to specify the IOU threshold of NMS.
      class_agnostic_box: if True, `box_outputs` holds a single foreground
        regression per anchor (slot 1 of 2) that is shared by every
        foreground class; if False, it holds one regression per class.
      bbox_reg_weights: a list of 4 float scalars, which are default weights
        on (dx, dy, dw, dh) for normalizing bbox regression targets.

    Returns:
      a tuple of tensors corresponding to number of valid boxes (as returned
      by `tf.image.combined_non_max_suppression`), box coordinates (converted
      back with `box_utils.to_absolute_coordinates`), object categories for
      each boxes (float32, foreground classes numbered from 1), and box
      scores, stacked in batch_size.
    """
    with tf.name_scope('generate_detections'):

        batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
        num_foreground_classes = num_classes - 1
        softmax_class_outputs = tf.nn.softmax(class_outputs)

        # Remove background
        scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])

        # Class-agnostic heads predict only [background, foreground] boxes.
        box_classes = 2 if class_agnostic_box else num_classes

        # Drop the background slot (index 0 on the class axis).
        boxes = tf.slice(
            tf.reshape(box_outputs, [batch_size, num_boxes, box_classes, 4]),
            [0, 0, 1, 0], [-1, -1, -1, -1]
        )

        if class_agnostic_box:
            # Share the single foreground regression across all classes.
            boxes = tf.tile(boxes, [1, 1, num_foreground_classes, 1])

        # Replicate every anchor once per foreground class. tf.tile copies the
        # values as they are, so it works for any anchor dtype and avoids a
        # multiplication by a float32 ones tensor.
        anchor_boxes = tf.tile(
            tf.expand_dims(anchor_boxes, axis=2),
            [1, 1, num_foreground_classes, 1]
        )

        num_detections = num_boxes * num_foreground_classes

        boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
        scores = tf.reshape(scores, [batch_size, num_detections, 1])
        anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])

        # Decode
        boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)

        # Clip boxes; height/width get shape [batch_size, 1, 1] so they
        # broadcast against the per-image boxes.
        height = tf.expand_dims(image_info[:, 0:1], axis=-1)
        width = tf.expand_dims(image_info[:, 1:2], axis=-1)
        boxes = box_utils.clip_boxes(boxes, (height, width))

        # NMS
        pre_nms_boxes = box_utils.to_normalized_coordinates(boxes, height, width)
        pre_nms_boxes = tf.reshape(
            pre_nms_boxes, [batch_size, num_boxes, num_foreground_classes, 4]
        )
        pre_nms_scores = tf.reshape(scores, [batch_size, num_boxes, num_foreground_classes])

        post_nms_boxes, post_nms_scores, post_nms_classes, \
        post_nms_num_valid_boxes = tf.image.combined_non_max_suppression(
            pre_nms_boxes,
            pre_nms_scores,
            max_output_size_per_class=pre_nms_num_detections,
            max_total_size=post_nms_num_detections,
            iou_threshold=nms_threshold,
            score_threshold=0.0,
            pad_per_class=False
        )

        # Shift class ids by one because background (id 0) was removed above.
        post_nms_classes = post_nms_classes + 1

        post_nms_boxes = box_utils.to_absolute_coordinates(post_nms_boxes, height, width)

    return post_nms_num_valid_boxes, post_nms_boxes, tf.cast(post_nms_classes, dtype=tf.float32), post_nms_scores


class BoxDetector(object):
    """Final detection post-processing used at inference time.

    While running inference, we do a final NMS operation in which we
    eliminate overlapping boxes, and select the top n boxes most likely
    to contain an object.

    Args:
      use_batched_nms: if True, detections are produced by
        `generate_detections_gpu`; otherwise by `generate_detections_tpu`.
      rpn_post_nms_topn: forwarded as `pre_nms_num_detections`.
      detections_per_image: forwarded as `post_nms_num_detections`.
      test_nms: forwarded as `nms_threshold` (IOU threshold of the NMS).
      class_agnostic_box: whether the box head predicts a single foreground
        box per ROI; forwarded as `class_agnostic_box` only when True.
      bbox_reg_weights: 4 float scalars, forwarded as `bbox_reg_weights`.
    """

    def __init__(self,
                 use_batched_nms=False,
                 rpn_post_nms_topn=1000,
                 detections_per_image=100,
                 test_nms=0.5,
                 class_agnostic_box=False,
                 bbox_reg_weights=(10., 10., 5., 5.),):
        self.use_batched_nms = use_batched_nms
        self.rpn_post_nms_topn = rpn_post_nms_topn
        self.detections_per_image = detections_per_image
        self.test_nms = test_nms
        self.bbox_reg_weights = bbox_reg_weights
        self.class_agnostic_box = class_agnostic_box

    def _generator_kwargs(self, class_outputs, box_outputs, rpn_box_rois, img_info):
        """Assemble the keyword arguments for the detection generator."""
        kwargs = {
            'class_outputs': class_outputs,
            'box_outputs': box_outputs,
            'anchor_boxes': rpn_box_rois,
            'image_info': img_info,
            'pre_nms_num_detections': self.rpn_post_nms_topn,
            'post_nms_num_detections': self.detections_per_image,
            'nms_threshold': self.test_nms,
            'bbox_reg_weights': self.bbox_reg_weights,
        }
        # The flag is forwarded only when it is enabled. Omitting it is
        # equivalent to passing the default (False), and it keeps the call
        # valid for a generator whose signature has no `class_agnostic_box`
        # parameter.
        if self.class_agnostic_box:
            kwargs['class_agnostic_box'] = self.class_agnostic_box
        return kwargs

    def __call__(self, class_outputs, box_outputs, rpn_box_rois, img_info):
        """Run detection post-processing on the box head outputs.

        Args:
          class_outputs: class scores, forwarded as `class_outputs`.
          box_outputs: box regression outputs, forwarded as `box_outputs`.
          rpn_box_rois: ROIs the regressions are relative to, forwarded as
            `anchor_boxes`.
          img_info: per-image info tensor, forwarded as `image_info`.

        Returns:
          A dict with keys 'num_detections', 'detection_boxes',
          'detection_classes' and 'detection_scores', holding the four
          outputs of the selected generator in that order.
        """
        if self.use_batched_nms:
            generate_detections_fn = generate_detections_gpu
        else:
            generate_detections_fn = generate_detections_tpu

        detections = generate_detections_fn(
            **self._generator_kwargs(class_outputs, box_outputs, rpn_box_rois, img_info)
        )

        return {
            'num_detections': detections[0],
            'detection_boxes': detections[1],
            'detection_classes': detections[2],
            'detection_scores': detections[3],
        }