#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Data loader and processing.

Defines the input_fn of Mask-RCNN for TF Estimator. The input_fn includes
training data for category classification, bounding box regression, and the
number of positive examples to normalize the loss during training.
"""

import tensorflow as tf

from sagemakercv.core import anchors, preprocess_ops

from .coco_utils import POLYGON_PAD_VALUE
from .tf_example_decoder import TfExampleDecoder

MAX_NUM_INSTANCES = 100
MAX_NUM_VERTICES_PER_INSTANCE = 1500
MAX_NUM_POLYGON_LIST_LEN = 2 * MAX_NUM_VERTICES_PER_INSTANCE * MAX_NUM_INSTANCES

__all__ = [
    # dataset parser
    "dataset_parser",
    # common functions
    "preprocess_image",
    "process_groundtruth_is_crowd",
    "process_source_id",
    # eval
    "prepare_labels_for_eval",
    # training
    "augment_image",
    "process_boxes_classes_indices_for_training",
    "process_gt_masks_for_training",
    "process_labels_for_training",
    "process_targets_for_training"
]
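# Illustrative only: `dataset_parser` and the helpers below read the following
# keys from `params`. The values here are assumptions for the sketch (typical
# COCO Mask-RCNN settings), not defaults shipped with this module; real values
# come from the training configuration.
_EXAMPLE_PARSER_PARAMS = {
    'image_size': (1024, 1024),   # target (height, width) after resize_and_pad
    'min_level': 2,               # finest FPN level that receives anchors
    'max_level': 6,               # coarsest FPN level; also sets the padding stride
    'num_scales': 1,              # anchor scales per level
    'aspect_ratios': [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)],
    'anchor_scale': 8.0,
    'num_classes': 91,
    'rpn_positive_overlap': 0.7,  # IoU above which an anchor is labeled positive
    'rpn_negative_overlap': 0.3,  # IoU below which an anchor is labeled negative
    'rpn_batch_size_per_im': 256,
    'rpn_fg_fraction': 0.5,
    'gt_mask_size': 112,          # cropped masks come out (112 + 4) x (112 + 4)
    'skip_crowd_during_training': True,
    'use_category': True,
    'augment_input_data': True,
    'visualize_images_summary': False,
    'include_groundtruth_in_features': False,
}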
###############################################################################################################


def dataset_parser(value, mode, params, use_instance_mask, seed=None, regenerate_source_id=False):
    """Parse data to a fixed-dimension input image and learning targets.

    Args:
        value: A dictionary that contains an image and groundtruth annotations.
        mode: One of `tf.estimator.ModeKeys.{TRAIN, EVAL, PREDICT}`.
        params: A dictionary of input-pipeline parameters; see the keys read
            below (e.g. `image_size`, `min_level`, `max_level`).
        use_instance_mask: `bool`, whether to decode and process instance masks.
        seed: Optional random seed passed to the augmentations.
        regenerate_source_id: `bool`, if True the TfExampleDecoder will use a
            hashed value of `image/encoded` for `image/source_id`.

    Returns:
        features: a dictionary that contains the image and auxiliary
            information. The following describes {key: value} pairs in the
            dictionary.
            image: an image tensor that is preprocessed to have normalized
                value and fixed dimension [image_size, image_size, 3].
            image_info: image information that includes the original height
                and width, the scale of the processed image to the original
                image, and the scaled height and width.
            source_ids: source image id. Default value -1 if the source id is
                empty in the groundtruth annotation.
        labels: a dictionary that contains auxiliary information plus
            (optional) labels. The following describes {key: value} pairs in
            the dictionary. `labels` is only for training.
            score_targets_dict: an ordered dictionary with keys [min_level,
                min_level+1, ..., max_level]. The values are tensors with
                shape [height_l, width_l, num_anchors]. height_l and width_l
                represent the dimension of the objectness score at the l-th
                level.
            box_targets_dict: an ordered dictionary with keys [min_level,
                min_level+1, ..., max_level]. The values are tensors with
                shape [height_l, width_l, num_anchors * 4]. height_l and
                width_l represent the dimension of the bounding box regression
                output at the l-th level.
            gt_boxes: groundtruth bounding box annotations. Each box is in
                [y1, x1, y2, x2] format. The tensor is padded with -1 to the
                fixed dimension [MAX_NUM_INSTANCES, 4].
            gt_classes: groundtruth class annotations. The tensor is padded
                with -1 to the fixed dimension [MAX_NUM_INSTANCES].
            cropped_gt_masks: groundtruth masks cropped by their bounding box
                and resized to a fixed size determined by
                params['gt_mask_size'].
    """
    if mode not in [tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.PREDICT, tf.estimator.ModeKeys.EVAL]:
        raise ValueError("Unknown execution mode received: %s" % mode)

    def create_example_decoder():
        return TfExampleDecoder(
            use_instance_mask=use_instance_mask,
            regenerate_source_id=regenerate_source_id
        )

    example_decoder = create_example_decoder()

    with tf.xla.experimental.jit_scope(compile_ops=True):

        with tf.name_scope('parser'):

            data = example_decoder.decode(value)

            data['groundtruth_is_crowd'] = process_groundtruth_is_crowd(data)

            image = tf.image.convert_image_dtype(data['image'], dtype=tf.float32)

            source_id = process_source_id(data['source_id'])

            if mode in [tf.estimator.ModeKeys.PREDICT, tf.estimator.ModeKeys.EVAL]:

                features = {
                    'source_ids': source_id,
                }

                if params['visualize_images_summary']:
                    features['orig_images'] = tf.image.resize(image, params['image_size'])

                features["images"], features["image_info"], _, _ = preprocess_image(
                    image,
                    boxes=None,
                    instance_masks=None,
                    image_size=params['image_size'],
                    max_level=params['max_level'],
                    augment_input_data=False,
                    seed=seed
                )

                if params['include_groundtruth_in_features']:
                    labels = prepare_labels_for_eval(
                        data,
                        target_num_instances=MAX_NUM_INSTANCES,
                        target_polygon_list_len=MAX_NUM_POLYGON_LIST_LEN,
                        use_instance_mask=use_instance_mask
                    )
                    return {'features': features, 'labels': labels}

                else:
                    return {'features': features}

            elif mode == tf.estimator.ModeKeys.TRAIN:

                labels = {}
                features = {
                    'source_ids': source_id
                }

                boxes, classes, indices, instance_masks = process_boxes_classes_indices_for_training(
                    data,
                    skip_crowd_during_training=params['skip_crowd_during_training'],
                    use_category=params['use_category'],
                    use_instance_mask=use_instance_mask,
                )

                orig_image_size = tf.shape(image)[:2]

                image, image_info, boxes, instance_masks = preprocess_image(
                    image,
                    boxes=boxes,
                    instance_masks=instance_masks,
                    image_size=params['image_size'],
                    max_level=params['max_level'],
                    augment_input_data=params['augment_input_data'],
                    seed=seed
                )

                features.update({
                    'images': image,
                    'image_info': image_info,
                })

                padded_image_size = image.get_shape().as_list()[:2]

                # Pads cropped_gt_masks.
                if use_instance_mask:
                    instance_masks = tf.expand_dims(instance_masks, -1)
                    orig_boxes = boxes * image_info[2]
                    labels['cropped_gt_masks'] = process_gt_masks_for_training(
                        instance_masks,
                        orig_boxes,
                        gt_mask_size=params['gt_mask_size'],
                        padded_image_size=orig_image_size,
                        max_num_instances=MAX_NUM_INSTANCES
                    )

                with tf.xla.experimental.jit_scope(compile_ops=False):
                    # Assign anchors.
                    (score_targets, box_targets), input_anchor = process_targets_for_training(
                        padded_image_size=padded_image_size,
                        boxes=boxes,
                        classes=classes,
                        params=params
                    )

                additional_labels = process_labels_for_training(
                    image_info, boxes, classes, score_targets, box_targets,
                    max_num_instances=MAX_NUM_INSTANCES,
                    min_level=params["min_level"],
                    max_level=params["max_level"]
                )
                labels.update(additional_labels)

                # labels["input_anchor"] = input_anchor

                # Debug-only scaffolding: when enabled, replaces the real
                # features with constant tensors of the expected shapes.
                # Features
                # {
                #     'source_ids': ,
                #     'images': ,
                #     'image_info':
                # }
                FAKE_FEATURES = False

                if FAKE_FEATURES:
                    features["source_ids"] = tf.ones(shape=(), dtype=tf.float32)
                    features["images"] = tf.ones(shape=(1024, 1024, 3), dtype=tf.float32)
                    features["image_info"] = tf.ones(shape=(5,), dtype=tf.float32)

                # Debug-only scaffolding: when enabled, replaces the real
                # labels with constant tensors of the expected shapes.
                # Labels
                # {
                #     'cropped_gt_masks': ,
                #     'score_targets_2': , 'box_targets_2': ,
                #     'score_targets_3': , 'box_targets_3': ,
                #     'score_targets_4': , 'box_targets_4': ,
                #     'score_targets_5': , 'box_targets_5': ,
                #     'score_targets_6': , 'box_targets_6': ,
                #     'gt_boxes': ,
                #     'gt_classes':
                # }
                FAKE_LABELS = False

                if FAKE_LABELS:
                    labels["cropped_gt_masks"] = tf.ones(shape=(100, 116, 116), dtype=tf.float32)
                    labels["gt_boxes"] = tf.ones(shape=(100, 4), dtype=tf.float32)
                    labels["gt_classes"] = tf.ones(shape=(100, 1), dtype=tf.float32)

                    idx = 1
                    for dim in [256, 128, 64, 32, 16]:
                        idx += 1  # Starts at 2
                        labels["score_targets_%d" % idx] = tf.ones(shape=(dim, dim, 3), dtype=tf.float32)
                        labels["box_targets_%d" % idx] = tf.ones(shape=(dim, dim, 12), dtype=tf.float32)

                return features, labels
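# A minimal sketch (not part of the original module) of how `dataset_parser`
# is typically attached to a tf.data pipeline for training. The function name,
# `file_pattern`, and `batch_size` are illustrative assumptions.
def _example_train_input_fn(file_pattern, params, batch_size=4, seed=None):
    """Illustrative only: builds a batched training dataset around dataset_parser."""
    dataset = tf.data.Dataset.list_files(file_pattern, shuffle=True, seed=seed)
    dataset = dataset.interleave(tf.data.TFRecordDataset, cycle_length=8)
    dataset = dataset.map(
        lambda value: dataset_parser(
            value,
            mode=tf.estimator.ModeKeys.TRAIN,
            params=params,
            use_instance_mask=True,
            seed=seed),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Every tensor is padded to a fixed shape above, so plain batching works.
    return dataset.batch(batch_size, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)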
###############################################################################################################

# common functions


def preprocess_image(image, boxes, instance_masks, image_size, max_level, augment_input_data=False, seed=None):

    image = preprocess_ops.normalize_image(image)

    if augment_input_data:
        image, boxes, instance_masks = augment_image(image=image, boxes=boxes, instance_masks=instance_masks, seed=seed)

    # Scaling and padding.
    image, image_info, boxes, _ = preprocess_ops.resize_and_pad(
        image=image,
        target_size=image_size,
        stride=2 ** max_level,
        boxes=boxes,
        masks=None
    )

    return image, image_info, boxes, instance_masks


def process_groundtruth_is_crowd(data):
    return tf.cond(
        pred=tf.greater(tf.size(input=data['groundtruth_is_crowd']), 0),
        true_fn=lambda: data['groundtruth_is_crowd'],
        false_fn=lambda: tf.zeros_like(data['groundtruth_classes'], dtype=tf.bool)
    )


# Legacy variant, kept for reference; note it reads `source_id` before
# assignment and has been superseded by `process_source_id` below.
# def process_source_id(data):
#     source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1', source_id)
#     source_id = tf.strings.to_number(source_id)
#     return source_id


def process_source_id(source_id):
    """Processes source_id to the right format."""
    if source_id.dtype == tf.string:
        source_id = tf.cast(tf.strings.to_number(source_id), tf.int64)

    with tf.control_dependencies([source_id]):
        source_id = tf.cond(
            tf.equal(tf.size(source_id), 0),
            lambda: tf.cast(tf.constant(-1), tf.int64),
            lambda: tf.identity(source_id)
        )

    return source_id
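# Quick illustration (assumed inputs) of `process_source_id` above: a numeric
# string id is cast to int64, and an empty id falls back to -1.
def _example_process_source_id():
    """Illustrative only."""
    present = process_source_id(tf.constant('12345'))        # -> int64 tensor 12345
    missing = process_source_id(tf.constant([], tf.string))  # -> int64 tensor -1
    return present, missing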
# eval

def prepare_labels_for_eval(
        data,
        target_num_instances=MAX_NUM_INSTANCES,
        target_polygon_list_len=MAX_NUM_POLYGON_LIST_LEN,
        use_instance_mask=False
):
    """Create labels dict for infeed from data of tf.Example."""
    image = data['image']

    height = tf.shape(input=image)[0]
    width = tf.shape(input=image)[1]

    boxes = data['groundtruth_boxes']

    classes = tf.cast(data['groundtruth_classes'], dtype=tf.float32)

    num_labels = tf.shape(input=classes)[0]

    boxes = preprocess_ops.pad_to_fixed_size(boxes, -1, [target_num_instances, 4])
    classes = preprocess_ops.pad_to_fixed_size(classes, -1, [target_num_instances, 1])

    is_crowd = tf.cast(data['groundtruth_is_crowd'], dtype=tf.float32)
    is_crowd = preprocess_ops.pad_to_fixed_size(is_crowd, 0, [target_num_instances, 1])

    labels = dict()

    labels['width'] = width
    labels['height'] = height
    labels['groundtruth_boxes'] = boxes
    labels['groundtruth_classes'] = classes
    labels['num_groundtruth_labels'] = num_labels
    labels['groundtruth_is_crowd'] = is_crowd

    # TODO: This needs to be updated for the new mask format.
    # if use_instance_mask:
    #     data['groundtruth_masks'] = preprocess_ops.pad_to_fixed_size(
    #         data=data['groundtruth_polygons'],
    #         pad_value=POLYGON_PAD_VALUE,
    #         output_shape=[target_polygon_list_len, 1]
    #     )

    return labels


# training

def augment_image(image, boxes, instance_masks, seed):
    flipped_results = preprocess_ops.random_horizontal_flip(
        image,
        boxes=boxes,
        masks=instance_masks,
        seed=seed
    )

    if instance_masks is not None:
        image, boxes, instance_masks = flipped_results
    else:
        image, boxes = flipped_results

    # Multiplicative gaussian noise (speckle), added half of the time.
    # The std scale is taken from https://openreview.net/pdf?id=SkeKtyHYPS
    image = preprocess_ops.add_noise(image, std=0.2, seed=seed)

    # image = tf.image.random_brightness(image, max_delta=0.1, seed=seed)
    # image = tf.image.random_contrast(image, lower=0.9, upper=1.1, seed=seed)
    # image = tf.image.random_saturation(image, lower=0.9, upper=1.1, seed=seed)
    # image = tf.image.random_jpeg_quality(image, min_jpeg_quality=80, max_jpeg_quality=100, seed=seed)

    return image, boxes, instance_masks
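# For reference, a common formulation of the multiplicative (speckle) noise
# that `preprocess_ops.add_noise` applies above. This is a sketch under the
# assumption that the op multiplies by (1 + N(0, std)); the real
# implementation, including applying the noise only half of the time, lives in
# sagemakercv.core.preprocess_ops.
def _example_speckle_noise(image, std=0.2, seed=None):
    """Illustrative only: image * (1 + Gaussian noise)."""
    noise = tf.random.normal(tf.shape(image), mean=0.0, stddev=std, seed=seed)
    return image * (1.0 + noise)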
def process_boxes_classes_indices_for_training(data, skip_crowd_during_training, use_category, use_instance_mask):
    boxes = data['groundtruth_boxes']
    classes = data['groundtruth_classes']
    classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])

    indices = None
    instance_masks = None

    if not use_category:
        classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

    if skip_crowd_during_training:
        indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
        classes = tf.gather_nd(classes, indices)
        boxes = tf.gather_nd(boxes, indices)

        if use_instance_mask:
            instance_masks = tf.gather_nd(data['groundtruth_instance_masks'], indices)

    return boxes, classes, indices, instance_masks


def process_gt_masks_for_training(instance_masks, boxes, gt_mask_size, padded_image_size, max_num_instances):
    cropped_gt_masks = preprocess_ops.crop_gt_masks(
        instance_masks=instance_masks,
        boxes=boxes,
        gt_mask_size=gt_mask_size,
        image_size=padded_image_size
    )

    # cropped_gt_masks = tf.reshape(cropped_gt_masks, [max_num_instances, -1])
    cropped_gt_masks = preprocess_ops.pad_to_fixed_size(
        data=cropped_gt_masks,
        pad_value=-1,
        output_shape=[max_num_instances, (gt_mask_size + 4) ** 2]
    )

    return tf.reshape(cropped_gt_masks, [max_num_instances, gt_mask_size + 4, gt_mask_size + 4])


def process_labels_for_training(
        image_info, boxes, classes, score_targets, box_targets,
        max_num_instances, min_level, max_level
):
    labels = {}

    # Pad groundtruth data.
    # boxes *= image_info[2]
    boxes = preprocess_ops.pad_to_fixed_size(boxes, -1, [max_num_instances, 4])
    classes = preprocess_ops.pad_to_fixed_size(classes, -1, [max_num_instances, 1])

    for level in range(min_level, max_level + 1):
        labels['score_targets_%d' % level] = score_targets[level]
        labels['box_targets_%d' % level] = box_targets[level]

    labels['gt_boxes'] = boxes
    labels['gt_classes'] = classes

    return labels


def process_targets_for_training(padded_image_size, boxes, classes, params):
    input_anchors = anchors.AnchorGenerator(
        params['min_level'],
        params['max_level'],
        params['num_scales'],
        params['aspect_ratios'],
        params['anchor_scale'],
        padded_image_size
    )

    anchor_labeler = anchors.AnchorLabeler(
        input_anchors,
        params['num_classes'],
        params['rpn_positive_overlap'],
        params['rpn_negative_overlap'],
        params['rpn_batch_size_per_im'],
        params['rpn_fg_fraction']
    )

    return anchor_labeler.label_anchors(boxes, classes), input_anchors
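# A minimal sketch (illustrative shapes) of how the anchor targets are
# produced for a padded 1024x1024 image. `boxes` and `classes` are assumed to
# be the outputs of `process_boxes_classes_indices_for_training` after
# `preprocess_image`.
def _example_label_anchors(boxes, classes, params):
    """Illustrative only; see dataset_parser for the real call site."""
    (score_targets, box_targets), input_anchors = process_targets_for_training(
        padded_image_size=[1024, 1024],
        boxes=boxes,        # [num_boxes, 4] in padded-image coordinates
        classes=classes,    # [num_boxes, 1]
        params=params       # expects the anchor/RPN keys listed near the top
    )
    # With min_level=2 and max_level=6, score_targets[l] has spatial shape
    # [1024 / 2**l, 1024 / 2**l] and box_targets[l] carries 4 values per anchor.
    return score_targets, box_targets, input_anchors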