# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# -*- coding: utf-8 -*-
# File: data.py

import copy
import math

import cv2
import numpy as np
from tabulate import tabulate
from termcolor import colored

from tensorpack.dataflow import (
    DataFromList, MapData, MapDataComponent, MultiProcessMapDataZMQ,
    MultiThreadMapData, RNGDataFlow, TestDataSpeed, imgaug)
from tensorpack.utils import logger
from tensorpack.utils.argtools import log_once, memoized

from common import (
    CustomResize, DataFromListOfDict, box_to_point8,
    filter_boxes_inside_shape, point8_to_box, segmentation_to_mask, np_iou)
from config import config as cfg
from dataset import DetectionDataset
from utils.generate_anchors import generate_anchors
from utils.np_box_ops import area as np_area, ioa as np_ioa

# import tensorpack.utils.viz as tpviz

"""
Predefined padding:
    Pad every image to one of the four predefined padding shapes in
    cfg.PREPROC.PADDING_SHAPES, and try to put images with the same padding
    shape into the same batch. This speeds up the data pipeline.

Functions:
    get_padding_shape & _get_padding_shape:
        Find the padding shape closest to the aspect ratio of the current image.
    get_next_roidb:
        Try to get the next image that has the same padding shape.
"""


def _get_padding_shape(aspect_ratio):
    for shape in cfg.PREPROC.PADDING_SHAPES:
        if aspect_ratio >= float(shape[0]) / float(shape[1]):
            return shape
    return cfg.PREPROC.PADDING_SHAPES[-1]


def get_padding_shape(h, w):
    aspect_ratio = float(h) / float(w)
    if aspect_ratio > 1.0:
        inv = 1. / aspect_ratio
    else:
        inv = aspect_ratio
    shp = _get_padding_shape(inv)
    if aspect_ratio > 1.0:
        return (shp[1], shp[0])
    return shp


def get_next_roidb(roidbs, i, shp, taken):
    if i == len(roidbs) - 1:
        return None
    for k in range(i + 1, len(roidbs)):
        if get_padding_shape(roidbs[k]['height'], roidbs[k]['width']) == shp and not taken[k]:
            return k
        if k - i > 40:  # don't try too hard
            break
    # at least try to get one dimension the same
    for k in range(i, len(roidbs)):
        padding_shape = get_padding_shape(roidbs[k]['height'], roidbs[k]['width'])
        if (padding_shape[0] == shp[0] or padding_shape[1] == shp[1]) and not taken[k]:
            return k
        if k - i > 40:  # don't try too hard
            break
    for k in range(i, len(roidbs)):
        if not taken[k]:
            return k
    return None


class DataFromListOfDictBatched(RNGDataFlow):
    def __init__(self, lst, keys, batchsize, shuffle=False):
        self._lst = lst
        self._keys = keys
        self._shuffle = shuffle
        self._size = len(lst)
        self._bs = batchsize

    def __len__(self):
        return int(math.ceil(len(self._lst) / self._bs))

    def __iter__(self):
        if self._shuffle:
            self.rng.shuffle(self._lst)
        num_batches = int(math.ceil(len(self._lst) / self._bs))
        for batch in range(num_batches):
            last = min(len(self._lst), self._bs * (batch + 1))
            dp = [[dic[k] for k in self._keys] for dic in self._lst[batch * self._bs:last]]
            yield dp


class MalformedData(BaseException):
    pass
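
# A minimal usage sketch for the padding helpers above (never called from the
# pipeline; the concrete shapes returned depend on cfg.PREPROC.PADDING_SHAPES,
# which is assumed to be configured).
def _padding_shape_example():
    # A landscape image is matched on its inverted aspect ratio ...
    shp = get_padding_shape(600, 1000)
    # ... so the transposed image maps to the transposed padding shape,
    # because get_padding_shape() flips the matched shape when h / w > 1.
    assert get_padding_shape(1000, 600) == (shp[1], shp[0])
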
def print_class_histogram(roidbs):
    """
    Args:
        roidbs (list[dict]): the same format as the output of `load_training_roidbs`.
    """
    dataset = DetectionDataset()
    hist_bins = np.arange(dataset.num_classes + 1)

    # Histogram of ground-truth objects
    gt_hist = np.zeros((dataset.num_classes,), dtype=np.int64)
    for entry in roidbs:
        # filter crowd?
        gt_inds = np.where((entry['class'] > 0) & (entry['is_crowd'] == 0))[0]
        gt_classes = entry['class'][gt_inds]
        gt_hist += np.histogram(gt_classes, bins=hist_bins)[0]
    data = [[dataset.class_names[i], v] for i, v in enumerate(gt_hist)]
    data.append(['total', sum(x[1] for x in data)])
    table = tabulate(data, headers=['class', '#box'], tablefmt='pipe')
    logger.info("Ground-Truth Boxes:\n" + colored(table, 'cyan'))


@memoized
def get_all_anchors(stride=None, sizes=None, tile=True):
    """
    Get all anchors in the largest possible image, shifted, floatbox

    Args:
        stride (int): the stride of anchors.
        sizes (tuple[int]): the sizes (sqrt area) of anchors

    Returns:
        anchors: SxSxNUM_ANCHORx4, where S == ceil(MAX_SIZE/STRIDE), floatbox
            The layout in the NUM_ANCHOR dim is NUM_RATIO x NUM_SIZE.
    """
    if stride is None:
        stride = cfg.RPN.ANCHOR_STRIDE
    if sizes is None:
        sizes = cfg.RPN.ANCHOR_SIZES
    # Generates a NAx4 matrix of anchor boxes in (x1, y1, x2, y2) format. Anchors
    # are centered on stride / 2, have (approximate) sqrt areas of the specified
    # sizes, and aspect ratios as given.
    if not cfg.RPN.UNQUANTIZED_ANCHOR:
        cell_anchors = generate_anchors(
            stride,
            scales=np.array(sizes, dtype=np.float64) / stride,
            ratios=np.array(cfg.RPN.ANCHOR_RATIOS, dtype=np.float64))
    else:
        anchors = []
        ratios = np.array(cfg.RPN.ANCHOR_RATIOS, dtype=np.float64)
        for sz in sizes:
            for ratio in ratios:
                w = np.sqrt(sz * sz / ratio)
                h = ratio * w
                anchors.append([-w, -h, w, h])
        cell_anchors = np.asarray(anchors) * 0.5
    # anchors are intbox here.
    # anchors at featuremap [0,0] are centered at fpcoor (8,8) (half of stride)

    if tile:
        max_size = cfg.PREPROC.MAX_SIZE
        field_size = int(np.ceil(max_size / stride))
        if not cfg.RPN.UNQUANTIZED_ANCHOR:
            shifts = np.arange(0, field_size) * stride
        else:
            shifts = (np.arange(0, field_size) * stride).astype("float32")
        shift_x, shift_y = np.meshgrid(shifts, shifts)
        shift_x = shift_x.flatten()
        shift_y = shift_y.flatten()
        shifts = np.vstack((shift_x, shift_y, shift_x, shift_y)).transpose()
        # Kx4, K = field_size * field_size
        K = shifts.shape[0]

        A = cell_anchors.shape[0]
        field_of_anchors = (
            cell_anchors.reshape((1, A, 4)) +
            shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
        field_of_anchors = field_of_anchors.reshape((field_size, field_size, A, 4))
        # FSxFSxAx4
        # A lot of rounding happens inside the anchor code anyway
        # assert np.all(field_of_anchors == field_of_anchors.astype('int32'))
        field_of_anchors = field_of_anchors.astype('float32')
        if not cfg.RPN.UNQUANTIZED_ANCHOR:
            field_of_anchors[:, :, :, [2, 3]] += 1
        return field_of_anchors
    else:
        cell_anchors = cell_anchors.astype('float32')
        cell_anchors[:, [2, 3]] += 1
        return cell_anchors


@memoized
def get_all_anchors_fpn(strides=None, sizes=None, tile=True):
    """
    Returns:
        foas: anchors for FPN layers p2-p6, each with size
            S x S x NUM_ANCHOR_RATIOS x 4, where S == ceil(MAX_SIZE/STRIDE)
    """
    if strides is None:
        strides = cfg.FPN.ANCHOR_STRIDES
    if sizes is None:
        sizes = cfg.RPN.ANCHOR_SIZES
    assert len(strides) == len(sizes)
    foas = []
    for stride, size in zip(strides, sizes):
        foa = get_all_anchors(stride=stride, sizes=(size,), tile=tile)
        foas.append(foa)
    return foas
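
# A shape sketch for get_all_anchors() (never called from the pipeline; stride
# and size below are illustrative, the ratio count comes from the config).
def _anchor_shape_example():
    foa = get_all_anchors(stride=16, sizes=(32,))
    # The anchor field covers the largest possible image:
    # ceil(cfg.PREPROC.MAX_SIZE / 16) cells on each side, one anchor per ratio.
    field_size = int(np.ceil(cfg.PREPROC.MAX_SIZE / 16))
    assert foa.shape == (field_size, field_size, len(cfg.RPN.ANCHOR_RATIOS), 4)
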
def get_anchor_labels(anchors, gt_boxes, crowd_boxes):
    """
    Label each anchor as fg/bg/ignore.

    Args:
        anchors: A x 4 float
        gt_boxes: B x 4 float, non-crowd
        crowd_boxes: C x 4 float

    Returns:
        anchor_labels: (A,) int. Each element is {-1, 0, 1}
        anchor_boxes: A x 4. Contains the target gt_box for each anchor
            when the anchor is fg.
    """
    # This function will modify labels and return the filtered inds
    def filter_box_label(labels, value, max_num):
        curr_inds = np.where(labels == value)[0]
        if len(curr_inds) > max_num:
            np.random.seed(cfg.TRAIN.SEED)
            disable_inds = np.random.choice(
                curr_inds, size=(len(curr_inds) - max_num), replace=False)
            labels[disable_inds] = -1  # ignore them
            curr_inds = np.where(labels == value)[0]
        return curr_inds

    NA, NB = len(anchors), len(gt_boxes)
    assert NB > 0  # empty images should have been filtered already

    box_ious = np_iou(anchors, gt_boxes)  # NA x NB
    ious_argmax_per_anchor = box_ious.argmax(axis=1)  # NA,
    ious_max_per_anchor = box_ious.max(axis=1)
    ious_max_per_gt = np.amax(box_ious, axis=0, keepdims=True)  # 1xNB
    # for each gt, find all those anchors (including ties) that have the max iou with it
    anchors_with_max_iou_per_gt = np.where(box_ious == ious_max_per_gt)[0]

    # Setting NA labels: 1--fg 0--bg -1--ignore
    anchor_labels = -np.ones((NA,), dtype='int32')  # NA,

    # the order of setting neg/pos labels matters
    anchor_labels[anchors_with_max_iou_per_gt] = 1
    anchor_labels[ious_max_per_anchor >= cfg.RPN.POSITIVE_ANCHOR_THRESH] = 1
    anchor_labels[ious_max_per_anchor < cfg.RPN.NEGATIVE_ANCHOR_THRESH] = 0

    # label all non-ignore candidate boxes which overlap crowd as ignore
    if crowd_boxes.size > 0:
        cand_inds = np.where(anchor_labels >= 0)[0]
        cand_anchors = anchors[cand_inds]
        ioas = np_ioa(crowd_boxes, cand_anchors)
        overlap_with_crowd = cand_inds[ioas.max(axis=0) > cfg.RPN.CROWD_OVERLAP_THRESH]
        anchor_labels[overlap_with_crowd] = -1

    # Subsample fg labels: ignore some fg if fg is too many
    target_num_fg = int(cfg.RPN.BATCH_PER_IM * cfg.RPN.FG_RATIO)
    fg_inds = filter_box_label(anchor_labels, 1, target_num_fg)
    # Keep an image even if there is no foreground anchors
    # if len(fg_inds) == 0:
    #     raise MalformedData("No valid foreground for RPN!")

    # Subsample bg labels. num_bg is not allowed to be too many
    old_num_bg = np.sum(anchor_labels == 0)
    if old_num_bg == 0:
        # No valid bg in this image, skip.
        raise MalformedData("No valid background for RPN!")
    target_num_bg = cfg.RPN.BATCH_PER_IM - len(fg_inds)
    filter_box_label(anchor_labels, 0, target_num_bg)  # ignore return values

    # Set anchor boxes: the best gt_box for each fg anchor
    anchor_boxes = np.zeros((NA, 4), dtype='float32')
    fg_boxes = gt_boxes[ious_argmax_per_anchor[fg_inds], :]
    anchor_boxes[fg_inds, :] = fg_boxes
    # assert len(fg_inds) + np.sum(anchor_labels == 0) == cfg.RPN.BATCH_PER_IM
    return anchor_labels, anchor_boxes
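
# A toy labeling sketch (never called from the pipeline; thresholds come from
# cfg.RPN and the usual 0.7/0.3 values make the assertions below hold): an
# anchor that coincides with a ground-truth box is labeled fg and gets that
# box as its regression target, while a far-away anchor is labeled bg.
def _anchor_label_example():
    anchors = np.array([[0., 0., 10., 10.],
                        [100., 100., 110., 110.]], dtype='float32')
    gt = np.array([[0., 0., 10., 10.]], dtype='float32')
    crowd = np.zeros((0, 4), dtype='float32')
    labels, boxes = get_anchor_labels(anchors, gt, crowd)
    assert labels[0] == 1 and np.all(boxes[0] == gt[0])  # perfect-match anchor is fg
    assert labels[1] == 0                                # zero-IoU anchor is bg
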
def get_rpn_anchor_input(im, boxes, is_crowd):
    """
    Args:
        im: an image
        boxes: nx4, floatbox, gt. shouldn't be changed
        is_crowd: n,

    Returns:
        The anchor labels and target boxes for each pixel in the featuremap.
        fm_labels: fHxfWxNA
        fm_boxes: fHxfWxNAx4
        NA will be NUM_ANCHOR_SIZES x NUM_ANCHOR_RATIOS
    """
    boxes = boxes.copy()
    all_anchors = np.copy(get_all_anchors())
    # fHxfWxAx4 -> (-1, 4)
    featuremap_anchors_flatten = all_anchors.reshape((-1, 4))

    # only use anchors inside the image
    inside_ind, inside_anchors = filter_boxes_inside_shape(featuremap_anchors_flatten, im.shape[:2])
    # obtain anchor labels and their corresponding gt boxes
    anchor_labels, anchor_gt_boxes = get_anchor_labels(inside_anchors, boxes[is_crowd == 0], boxes[is_crowd == 1])

    # Fill them back to original size: fHxfWx1, fHxfWx4
    anchorH, anchorW = all_anchors.shape[:2]
    featuremap_labels = -np.ones((anchorH * anchorW * cfg.RPN.NUM_ANCHOR, ), dtype='int32')
    featuremap_labels[inside_ind] = anchor_labels
    featuremap_labels = featuremap_labels.reshape((anchorH, anchorW, cfg.RPN.NUM_ANCHOR))
    featuremap_boxes = np.zeros((anchorH * anchorW * cfg.RPN.NUM_ANCHOR, 4), dtype='float32')
    featuremap_boxes[inside_ind, :] = anchor_gt_boxes
    featuremap_boxes = featuremap_boxes.reshape((anchorH, anchorW, cfg.RPN.NUM_ANCHOR, 4))
    return featuremap_labels, featuremap_boxes


def get_multilevel_rpn_anchor_input(im, boxes, is_crowd):
    """
    Args:
        im: a single image, H_image x W_image x NumChannel
        boxes: n x 4, floatbox, gt. shouldn't be changed
        is_crowd: (n,), for each box, is it crowd

    Returns:
        [(fm_labels, fm_boxes)]: Returns a tuple for each FPN level.
        Each tuple contains the anchor labels and target boxes for each pixel
        in the featuremap.
        fm_labels: H_feature x W_feature x NUM_ANCHOR_RATIOS
        fm_boxes: H_feature x W_feature x NUM_ANCHOR_RATIOS x 4
    """
    boxes = boxes.copy()
    anchors_per_level = get_all_anchors_fpn()
    flatten_anchors_per_level = [k.reshape((-1, 4)) for k in anchors_per_level]
    all_anchors_flatten = np.concatenate(flatten_anchors_per_level, axis=0)

    inside_ind, inside_anchors = filter_boxes_inside_shape(all_anchors_flatten, im.shape[:2])
    anchor_labels, anchor_gt_boxes = get_anchor_labels(inside_anchors, boxes[is_crowd == 0], boxes[is_crowd == 1])

    # map back to all_anchors, then split to each level
    num_all_anchors = all_anchors_flatten.shape[0]
    all_labels = -np.ones((num_all_anchors, ), dtype='int32')
    all_labels[inside_ind] = anchor_labels
    all_boxes = np.zeros((num_all_anchors, 4), dtype='float32')
    all_boxes[inside_ind] = anchor_gt_boxes

    start = 0
    multilevel_inputs = []
    for level_anchor in anchors_per_level:
        assert level_anchor.shape[2] == len(cfg.RPN.ANCHOR_RATIOS)
        anchor_shape = level_anchor.shape[:3]  # fHxfWxNUM_ANCHOR_RATIOS
        num_anchor_this_level = np.prod(anchor_shape)
        end = start + num_anchor_this_level
        multilevel_inputs.append(
            (all_labels[start: end].reshape(anchor_shape),
             all_boxes[start: end, :].reshape(anchor_shape + (4,))))
        start = end
    assert end == num_all_anchors, "{} != {}".format(end, num_all_anchors)
    return multilevel_inputs
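
# A bookkeeping sketch for the per-level split above (never called from the
# pipeline): the flattened per-level anchor counts must sum to the length of
# the concatenated anchor list, which is what the final assert checks.
def _fpn_anchor_count_example():
    anchors_per_level = get_all_anchors_fpn()
    counts = [int(np.prod(a.shape[:3])) for a in anchors_per_level]
    flat = np.concatenate([a.reshape((-1, 4)) for a in anchors_per_level], axis=0)
    assert sum(counts) == flat.shape[0]
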
def get_train_dataflow():
    """
    Return a training dataflow. Each datapoint consists of the following:

    An image: (h, w, 3),

    1 or more pairs of (anchor_labels, anchor_boxes):
        anchor_labels: (h', w', NA)
        anchor_boxes: (h', w', NA, 4)

    gt_boxes: (N, 4)
    gt_labels: (N,)

    If MODE_MASK, gt_masks: (N, h, w)
    """
    roidbs = DetectionDataset().load_training_roidbs(cfg.DATA.TRAIN)
    print_class_histogram(roidbs)

    # Valid training images should have at least one fg box.
    # But this filter shall not be applied for testing.
    num = len(roidbs)
    roidbs = list(filter(lambda img: len(img['boxes'][img['is_crowd'] == 0]) > 0, roidbs))
    logger.info("Filtered {} images which contain no non-crowd groundtruth boxes. Total #images for training: {}".format(
        num - len(roidbs), len(roidbs)))

    ds = DataFromList(roidbs, shuffle=True)

    aug = imgaug.AugmentorList(
        [CustomResize(cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE),
         imgaug.Flip(horiz=True)])

    def preprocess(roidb):
        fname, boxes, klass, is_crowd = roidb['file_name'], roidb['boxes'], roidb['class'], roidb['is_crowd']
        boxes = np.copy(boxes)
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        assert im is not None, fname
        im = im.astype('float32')
        # assume floatbox as input
        assert boxes.dtype == np.float32, "Loader has to return floating point boxes!"

        # augmentation:
        params = aug.get_transform(im)
        im = params.apply_image(im)
        points = box_to_point8(boxes)
        points = params.apply_coords(points)
        boxes = point8_to_box(points)
        assert np.min(np_area(boxes)) > 0, "Some boxes have zero area!"

        ret = {'images': im}
        # rpn anchor:
        try:
            if cfg.MODE_FPN:
                multilevel_anchor_inputs = get_multilevel_rpn_anchor_input(im, boxes, is_crowd)
                for i, (anchor_labels, anchor_boxes) in enumerate(multilevel_anchor_inputs):
                    ret['anchor_labels_lvl{}'.format(i + 2)] = anchor_labels
                    ret['anchor_boxes_lvl{}'.format(i + 2)] = anchor_boxes
            else:
                # anchor_labels, anchor_boxes
                ret['anchor_labels'], ret['anchor_boxes'] = get_rpn_anchor_input(im, boxes, is_crowd)

            boxes = boxes[is_crowd == 0]  # skip crowd boxes in training target
            klass = klass[is_crowd == 0]
            ret['gt_boxes'] = boxes
            ret['gt_labels'] = klass
            if not len(boxes):
                raise MalformedData("No valid gt_boxes!")
        except MalformedData as e:
            log_once("Input {} is filtered for training: {}".format(fname, str(e)), 'warn')
            return None

        if cfg.MODE_MASK:
            # augmentation will modify the polys in-place
            segmentation = copy.deepcopy(roidb['segmentation'])
            segmentation = [segmentation[k] for k in range(len(segmentation)) if not is_crowd[k]]
            assert len(segmentation) == len(boxes)

            # Apply augmentation on polygon coordinates.
            # And produce one image-sized binary mask per box.
            masks = []
            for polys in segmentation:
                polys = [params.apply_coords(p) for p in polys]
                masks.append(segmentation_to_mask(polys, im.shape[0], im.shape[1]))
            masks = np.asarray(masks, dtype='uint8')  # values in {0, 1}
            ret['gt_masks'] = masks

            # from viz import draw_annotation, draw_mask
            # viz = draw_annotation(im, boxes, klass)
            # for mask in masks:
            #     viz = draw_mask(viz, mask)
            # tpviz.interactive_imshow(viz)
        return ret

    if cfg.TRAINER == 'horovod':
        # ds = MapData(ds, preprocess)
        ds = MultiThreadMapData(ds, 5, preprocess)  # MPI does not like fork()
    else:
        ds = MultiProcessMapDataZMQ(ds, 10, preprocess)
    return ds
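
# A usage sketch for the training dataflow (never called from here; assumes
# cfg.DATA.* points at a real dataset and the trainer config has been loaded):
def _train_dataflow_example():
    ds = get_train_dataflow()
    ds.reset_state()
    for datapoint in ds:
        # datapoint is the dict documented above: 'images', per-level
        # 'anchor_labels_lvl*'/'anchor_boxes_lvl*' (or 'anchor_labels'/
        # 'anchor_boxes'), 'gt_boxes', 'gt_labels', optionally 'gt_masks'.
        break
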
def get_viz_dataflow(name):
    """
    Return a dataflow for visualization. Each datapoint consists of the following:

    An image: (h, w, 3),

    1 or more pairs of (anchor_labels, anchor_boxes):
        anchor_labels: (h', w', NA)
        anchor_boxes: (h', w', NA, 4)

    gt_boxes: (N, 4)
    gt_labels: (N,)

    If MODE_MASK, gt_masks: (N, h, w)
    """
    roidbs = DetectionDataset().load_viz_roidbs(name)
    print_class_histogram(roidbs)

    # Valid training images should have at least one fg box.
    # But this filter shall not be applied for testing.
    num = len(roidbs)
    roidbs = list(filter(lambda img: len(img['boxes'][img['is_crowd'] == 0]) > 0, roidbs))
    logger.info("Filtered {} images which contain no non-crowd groundtruth boxes. Total #images for training: {}".format(
        num - len(roidbs), len(roidbs)))

    ds = DataFromList(roidbs, shuffle=True)

    aug = imgaug.AugmentorList(
        [CustomResize(cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE),
         imgaug.Flip(horiz=True)])

    def preprocess(roidb):
        fname, boxes, klass, is_crowd = roidb['file_name'], roidb['boxes'], roidb['class'], roidb['is_crowd']
        boxes = np.copy(boxes)
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        assert im is not None, fname
        im = im.astype('float32')
        # assume floatbox as input
        assert boxes.dtype == np.float32, "Loader has to return floating point boxes!"

        # augmentation:
        params = aug.get_transform(im)
        im = params.apply_image(im)
        points = box_to_point8(boxes)
        points = params.apply_coords(points)
        boxes = point8_to_box(points)
        assert np.min(np_area(boxes)) > 0, "Some boxes have zero area!"

        ret = {'images': im}
        # rpn anchor:
        try:
            if cfg.MODE_FPN:
                multilevel_anchor_inputs = get_multilevel_rpn_anchor_input(im, boxes, is_crowd)
                for i, (anchor_labels, anchor_boxes) in enumerate(multilevel_anchor_inputs):
                    ret['anchor_labels_lvl{}'.format(i + 2)] = anchor_labels
                    ret['anchor_boxes_lvl{}'.format(i + 2)] = anchor_boxes
            else:
                # anchor_labels, anchor_boxes
                ret['anchor_labels'], ret['anchor_boxes'] = get_rpn_anchor_input(im, boxes, is_crowd)

            boxes = boxes[is_crowd == 0]  # skip crowd boxes in training target
            klass = klass[is_crowd == 0]
            ret['gt_boxes'] = boxes
            ret['gt_labels'] = klass
            if not len(boxes):
                raise MalformedData("No valid gt_boxes!")
        except MalformedData as e:
            log_once("Input {} is filtered for training: {}".format(fname, str(e)), 'warn')
            return None

        if cfg.MODE_MASK:
            # augmentation will modify the polys in-place
            segmentation = copy.deepcopy(roidb['segmentation'])
            segmentation = [segmentation[k] for k in range(len(segmentation)) if not is_crowd[k]]
            assert len(segmentation) == len(boxes)

            # Apply augmentation on polygon coordinates.
            # And produce one image-sized binary mask per box.
            masks = []
            for polys in segmentation:
                polys = [params.apply_coords(p) for p in polys]
                masks.append(segmentation_to_mask(polys, im.shape[0], im.shape[1]))
            masks = np.asarray(masks, dtype='uint8')  # values in {0, 1}
            ret['gt_masks'] = masks

            # from viz import draw_annotation, draw_mask
            # viz = draw_annotation(im, boxes, klass)
            # for mask in masks:
            #     viz = draw_mask(viz, mask)
            # tpviz.interactive_imshow(viz)
        return ret

    if cfg.TRAINER == 'horovod':
        # ds = MapData(ds, preprocess)
        ds = MultiThreadMapData(ds, 5, preprocess)  # MPI does not like fork()
    else:
        ds = MultiProcessMapDataZMQ(ds, 10, preprocess)
    return ds
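
# A grouping sketch for the predefined-padding batcher below (never called
# from the pipeline; the roidbs are hypothetical): two images with the same
# aspect ratio fall into the same padding bucket, so get_next_roidb() picks
# the second one as a batch mate for the first.
def _batch_grouping_example():
    roidbs = [{'height': 600, 'width': 900},
              {'height': 620, 'width': 930},   # same 2:3 aspect ratio as above
              {'height': 900, 'width': 600}]   # transposed aspect ratio
    taken = [False] * len(roidbs)
    shp = get_padding_shape(600, 900)
    assert get_next_roidb(roidbs, 0, shp, taken) == 1
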
def get_batch_train_dataflow(batch_size):
    """
    Return a training dataflow. Each datapoint consists of the following:

    A batch of images: (BS, h, w, 3),

    For each image, 1 or more pairs of (anchor_labels, anchor_boxes):
        anchor_labels: (BS, h', w', maxNumAnchors)
        anchor_boxes: (BS, h', w', maxNumAnchors, 4)

    gt_boxes: (BS, maxNumAnchors, 4)
    gt_labels: (BS, maxNumAnchors)

    If MODE_MASK, gt_masks: (BS, maxNumAnchors, h, w)
    """
    print("In train dataflow")
    roidbs = DetectionDataset().load_training_roidbs(cfg.DATA.TRAIN)
    print("Done loading roidbs")

    # print_class_histogram(roidbs)

    # Valid training images should have at least one fg box.
    # But this filter shall not be applied for testing.
    num = len(roidbs)
    roidbs = list(filter(lambda img: len(img['boxes'][img['is_crowd'] == 0]) > 0, roidbs))
    logger.info("Filtered {} images which contain no non-crowd groundtruth boxes. Total #images for training: {}".format(
        num - len(roidbs), len(roidbs)))

    roidbs = sorted(roidbs, key=lambda x: float(x['width']) / float(x['height']), reverse=True)  # will shuffle it later at every rank

    print("Batching roidbs")
    batched_roidbs = []

    if cfg.PREPROC.PREDEFINED_PADDING:
        taken = [False for _ in roidbs]
        done = False

        for i, d in enumerate(roidbs):
            batch = []
            if not taken[i]:
                batch.append(d)
                padding_shape = get_padding_shape(d['height'], d['width'])
                while len(batch) < batch_size:
                    k = get_next_roidb(roidbs, i, padding_shape, taken)
                    if k is None:
                        done = True
                        break
                    batch.append(roidbs[k])
                    taken[i], taken[k] = True, True
                if not done:
                    batched_roidbs.append(batch)
    else:
        batch = []
        for i, d in enumerate(roidbs):
            if i % batch_size == 0:
                if len(batch) == batch_size:
                    batched_roidbs.append(batch)
                batch = []
            batch.append(d)

    # batched_roidbs = sort_by_aspect_ratio(roidbs, batch_size)
    # batched_roidbs = group_by_aspect_ratio(roidbs, batch_size)
    print("Done batching roidbs")

    # Notes:
    #   - discard any leftover images
    #   - The batches will be shuffled, but the contents of each batch will always be the same
    #   - TODO: Fix lack of batch contents shuffling

    aug = imgaug.AugmentorList(
        [CustomResize(cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE),
         imgaug.Flip(horiz=True)])
    # aug = imgaug.AugmentorList([CustomResize(cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)])

    def preprocess(roidb_batch):
        datapoint_list = []
        for roidb in roidb_batch:
            fname, boxes, klass, is_crowd = roidb['file_name'], roidb['boxes'], roidb['class'], roidb['is_crowd']
            boxes = np.copy(boxes)
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            assert im is not None, fname
            im = im.astype('float32')
            # assume floatbox as input
            assert boxes.dtype == np.float32, "Loader has to return floating point boxes!"

            # augmentation:
            params = aug.get_transform(im)
            im = params.apply_image(im)
            points = box_to_point8(boxes)
            points = params.apply_coords(points)
            boxes = point8_to_box(points)
            assert np.min(np_area(boxes)) > 0, "Some boxes have zero area!"

            ret = {'images': im}
            # rpn anchor:
            try:
                if cfg.MODE_FPN:
                    multilevel_anchor_inputs = get_multilevel_rpn_anchor_input(im, boxes, is_crowd)
                    for i, (anchor_labels, anchor_boxes) in enumerate(multilevel_anchor_inputs):
                        ret['anchor_labels_lvl{}'.format(i + 2)] = anchor_labels
                        ret['anchor_boxes_lvl{}'.format(i + 2)] = anchor_boxes
                else:
                    raise NotImplementedError("Batch mode only available for FPN")

                boxes = boxes[is_crowd == 0]  # skip crowd boxes in training target
                klass = klass[is_crowd == 0]
                ret['gt_boxes'] = boxes
                ret['gt_labels'] = klass
                ret['filename'] = fname
                if not len(boxes):
                    raise MalformedData("No valid gt_boxes!")
            except MalformedData as e:
                log_once("Input {} is filtered for training: {}".format(fname, str(e)), 'warn')
                return None

            if cfg.MODE_MASK:
                # augmentation will modify the polys in-place
                segmentation = copy.deepcopy(roidb['segmentation'])
                segmentation = [segmentation[k] for k in range(len(segmentation)) if not is_crowd[k]]
                assert len(segmentation) == len(boxes)

                # Apply augmentation on polygon coordinates.
                # And produce one image-sized binary mask per box.
                masks = []
                for polys in segmentation:
                    polys = [params.apply_coords(p) for p in polys]
                    masks.append(segmentation_to_mask(polys, im.shape[0], im.shape[1]))
                masks = np.asarray(masks, dtype='uint8')  # values in {0, 1}
                ret['gt_masks'] = masks

            datapoint_list.append(ret)
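
        # Shape sketch for the batching below (illustrative numbers, assuming
        # batch_size 4): with a widest image of (800, 1333), 'images' becomes
        # (4, 800, 1333, 3); with at most 20 GT boxes in any image of the
        # batch, 'gt_boxes' becomes (4, 20, 4) and 'gt_labels' (4, 20), padded
        # beyond each image's own count recorded in 'orig_gt_counts'.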
        #################################################################################################################
        # Batchify the output
        #################################################################################################################

        # Now we need to batch the various fields

        # Easily stackable:
        # - anchor_labels_lvl2
        # - anchor_boxes_lvl2
        # - anchor_labels_lvl3
        # - anchor_boxes_lvl3
        # - anchor_labels_lvl4
        # - anchor_boxes_lvl4
        # - anchor_labels_lvl5
        # - anchor_boxes_lvl5
        # - anchor_labels_lvl6
        # - anchor_boxes_lvl6

        batched_datapoint = {}
        for stackable_field in ["anchor_labels_lvl2",
                                "anchor_boxes_lvl2",
                                "anchor_labels_lvl3",
                                "anchor_boxes_lvl3",
                                "anchor_labels_lvl4",
                                "anchor_boxes_lvl4",
                                "anchor_labels_lvl5",
                                "anchor_boxes_lvl5",
                                "anchor_labels_lvl6",
                                "anchor_boxes_lvl6"]:
            batched_datapoint[stackable_field] = np.stack([d[stackable_field] for d in datapoint_list])

        # Require padding and original dimension storage
        # - image (HxWx3)
        # - gt_boxes (?x4)
        # - gt_labels (?)
        # - gt_masks (?xHxW)

        """
        Find the minimum container size for images (maxW x maxH)
        Find the maximum number of ground truth boxes
        For each image, save original dimension and pad
        """

        if cfg.PREPROC.PREDEFINED_PADDING:
            padding_shapes = [get_padding_shape(*(d["images"].shape[:2])) for d in datapoint_list]
            max_height = max([shp[0] for shp in padding_shapes])
            max_width = max([shp[1] for shp in padding_shapes])
        else:
            image_dims = [d["images"].shape for d in datapoint_list]
            heights = [dim[0] for dim in image_dims]
            widths = [dim[1] for dim in image_dims]
            max_height = max(heights)
            max_width = max(widths)

        # image
        padded_images = []
        original_image_dims = []
        for datapoint in datapoint_list:
            image = datapoint["images"]
            original_image_dims.append(image.shape)

            h_padding = max_height - image.shape[0]
            w_padding = max_width - image.shape[1]

            padded_image = np.pad(image,
                                  [[0, h_padding],
                                   [0, w_padding],
                                   [0, 0]],
                                  'constant')
            padded_images.append(padded_image)

        batched_datapoint["images"] = np.stack(padded_images)
        # print(batched_datapoint["images"].shape)
        batched_datapoint["orig_image_dims"] = np.stack(original_image_dims)

        # gt_boxes and gt_labels
        max_num_gts = max([d["gt_labels"].size for d in datapoint_list])

        gt_counts = []
        padded_gt_labels = []
        padded_gt_boxes = []
        padded_gt_masks = []
        for datapoint in datapoint_list:
            gt_count_for_image = datapoint["gt_labels"].size
            gt_counts.append(gt_count_for_image)

            gt_padding = max_num_gts - gt_count_for_image

            padded_gt_labels_for_img = np.pad(datapoint["gt_labels"], [0, gt_padding], 'constant', constant_values=-1)
            padded_gt_labels.append(padded_gt_labels_for_img)

            padded_gt_boxes_for_img = np.pad(datapoint["gt_boxes"],
                                             [[0, gt_padding],
                                              [0, 0]],
                                             'constant')
            padded_gt_boxes.append(padded_gt_boxes_for_img)

            h_padding = max_height - datapoint["images"].shape[0]
            w_padding = max_width - datapoint["images"].shape[1]

            if cfg.MODE_MASK:
                padded_gt_masks_for_img = np.pad(datapoint["gt_masks"],
                                                 [[0, gt_padding],
                                                  [0, h_padding],
                                                  [0, w_padding]],
                                                 'constant')
                padded_gt_masks.append(padded_gt_masks_for_img)

        batched_datapoint["orig_gt_counts"] = np.stack(gt_counts)
        batched_datapoint["gt_labels"] = np.stack(padded_gt_labels)
        batched_datapoint["gt_boxes"] = np.stack(padded_gt_boxes)
        batched_datapoint["filenames"] = [d["filename"] for d in datapoint_list]

        if cfg.MODE_MASK:
            batched_datapoint["gt_masks"] = np.stack(padded_gt_masks)

        return batched_datapoint

    ds = DataFromList(batched_roidbs, shuffle=True)

    if cfg.TRAINER == 'horovod':
        # ds = MapData(ds, preprocess)
        ds = MultiThreadMapData(ds, 5, preprocess)  # MPI does not like fork()
    else:
        ds = MultiProcessMapDataZMQ(ds, 10, preprocess)
    return ds
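
# A sharding sketch for the evaluation dataflows below (never called from the
# pipeline; the dataset name is hypothetical): each of num_shards workers gets
# a contiguous slice of the images, and the last shard absorbs the remainder,
# e.g. shard 2 of 3 over 10 images covers images 6..9.
def _eval_shard_example():
    ds = get_eval_dataflow('coco_val', shard=2, num_shards=3)
    ds.reset_state()
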
def get_eval_dataflow(name, shard=0, num_shards=1):
    """
    Args:
        name (str): name of the dataset to evaluate
        shard, num_shards: to get subset of evaluation data
    """
    roidbs = DetectionDataset().load_inference_roidbs(name)

    num_imgs = len(roidbs)
    img_per_shard = num_imgs // num_shards
    img_range = (shard * img_per_shard, (shard + 1) * img_per_shard if shard + 1 < num_shards else num_imgs)

    # no filter for training
    ds = DataFromListOfDict(roidbs[img_range[0]: img_range[1]], ['file_name', 'image_id'])

    def f(fname):
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        assert im is not None, fname
        return im

    ds = MapDataComponent(ds, f, 0)
    # Evaluation itself may be multi-threaded, therefore don't add prefetch here.
    return ds


def get_batched_eval_dataflow(name, shard=0, num_shards=1, batch_size=1):
    """
    Args:
        name (str): name of the dataset to evaluate
        shard, num_shards: to get subset of evaluation data
    """
    roidbs = DetectionDataset().load_inference_roidbs(name)

    num_imgs = len(roidbs)
    img_per_shard = num_imgs // num_shards
    img_range = (shard * img_per_shard, (shard + 1) * img_per_shard if shard + 1 < num_shards else num_imgs)

    # no filter for training
    ds = DataFromListOfDictBatched(roidbs[img_range[0]: img_range[1]], ['file_name', 'image_id'], batch_size)

    def decode_images(inputs):
        return [[cv2.imread(inp[0], cv2.IMREAD_COLOR), inp[1]] for inp in inputs]

    def resize_images(inputs):
        resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)
        resized_imgs = [resizer.augment(inp[0]) for inp in inputs]
        org_shapes = [inp[0].shape for inp in inputs]
        scales = [
            np.sqrt(rimg.shape[0] * 1.0 / org_shape[0] * rimg.shape[1] / org_shape[1])
            for rimg, org_shape in zip(resized_imgs, org_shapes)]
        return [[resized_imgs[i], inp[1], scales[i], org_shapes[i][:2]] for i, inp in enumerate(inputs)]

    def pad_and_batch(inputs):
        heights, widths, _ = zip(*[inp[0].shape for inp in inputs])
        max_h, max_w = max(heights), max(widths)
        padded_images = np.stack([
            np.pad(inp[0],
                   [[0, max_h - inp[0].shape[0]], [0, max_w - inp[0].shape[1]], [0, 0]],
                   'constant')
            for inp in inputs])
        return [padded_images, [inp[1] for inp in inputs], list(zip(heights, widths)),
                [inp[2] for inp in inputs], [inp[3] for inp in inputs]]

    ds = MapData(ds, decode_images)
    ds = MapData(ds, resize_images)
    ds = MapData(ds, pad_and_batch)
    return ds


if __name__ == '__main__':
    import os
    from tensorpack.dataflow import PrintData

    cfg.DATA.BASEDIR = os.path.expanduser('~/data')
    ds = get_batched_eval_dataflow('train_reduced')
    ds.reset_state()
    cnt = 0
    for k in ds:
        print(k)
        cnt += 1