import json
import os
import shutil
from pathlib import Path

import numpy as np

# pycocotools and matplotlib are not referenced by the class below, so a
# missing install should not make this module unimportable. When they are
# installed the names are bound exactly as before.
try:
    from pycocotools.coco import COCO
except ImportError:
    COCO = None

try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None


class COCOSubsetter(object):
    """
    Tools for subsetting COCO data.

    Can be used to create a smaller subset of data either randomly, or by
    using in conjunction with pycocotools to subset by category.
    Can also be used to duplicate a dataset. For example, if the user wants
    to train on a single image for testing, that image can be duplicated
    multiple times.
    """

    def __init__(self, data_dir):
        """
        Index the training images and load the instance annotations.

        Parameters
        ----------
        data_dir : str
            Filepath location of COCO data. Expects to find
            subdirectories for train2017 and annotations

        Raises
        ------
        ValueError
            If a ``*.jpg`` file in train2017 has a non-integer file stem.
        """
        self.data_dir = Path(data_dir)
        self.instance_file = self.data_dir.joinpath(
            'annotations/instances_train2017.json')
        self.train_dir = self.data_dir.joinpath('train2017')
        # Map integer image id (the file stem) -> path of the image file.
        self.images = {int(path.stem): path
                       for path in self.train_dir.glob('*.jpg')}
        self.load_annotations()

    @staticmethod
    def _copy_suffix(num, count):
        """
        Build the suffix appended to ids and file stems for copy ``num``.

        The suffix is zero-padded to the width of the largest copy index so
        that ``<id><suffix>`` stays unique for every (id, copy) pair. For
        ``count <= 10`` the width is 1 and the suffix is simply ``str(num)``.

        Parameters
        ----------
        num : int
            Index of the copy, in ``range(count)``
        count : int
            Total number of copies being generated

        Returns
        -------
        str
            The fixed-width suffix
        """
        width = len(str(max(count - 1, 0)))
        return str(num).zfill(width)

    def random_subset(self, count):
        """
        Pick random images without repetition.

        Parameters
        ----------
        count : int
            the number of random images to select. If it exceeds the number
            of available images, every image is returned.

        Returns
        -------
        dict
            dictionary of {image_id: Path(image)}
        """
        image_ids = list(self.images)
        size = min(count, len(image_ids))
        if size <= 0:
            return {}
        # replace=False: sampling with replacement could draw the same id
        # twice and silently return fewer than ``count`` images.
        chosen = np.random.choice(image_ids, size=size, replace=False)
        return {int(image_id): self.images[int(image_id)]
                for image_id in chosen}

    def load_annotations(self):
        """
        Load annotations for COCO data into ``self.instances``.

        Returns
        -------
        None
        """
        with open(self.instance_file, encoding='utf-8') as infile:
            self.instances = json.load(infile)

    def _create_new_annotations(self, annotations, images):
        """
        Used for generating a new set of annotations

        info, licenses and categories are the same for the subset, so just
        copy them. For annotations and images, take a new set to combine.

        Parameters
        ----------
        annotations : list[dict]
            a list of dictionaries of annotations
        images : list[dict]
            a list of dictionaries of image information

        Returns
        -------
        dict
            A dictionary mirroring the annotations format
        """
        return {
            'info': self.instances['info'],
            'licenses': self.instances['licenses'],
            'categories': self.instances['categories'],
            'annotations': annotations,
            'images': images,
        }

    def filter_annotations(self, images):
        """
        Given a set of image ids, subset the annotations and images
        and combine with other fields of the annotations file

        Parameters
        ----------
        images : list[int]
            a list (or any iterable) of image ids

        Returns
        -------
        dict
            A dictionary of new annotations
        """
        # Build the lookup once; membership tests against a list would make
        # the two scans below quadratic.
        wanted = set(images)
        kept_annotations = [anno for anno in self.instances['annotations']
                            if anno['image_id'] in wanted]
        kept_images = [image for image in self.instances['images']
                       if image['id'] in wanted]
        return self._create_new_annotations(kept_annotations, kept_images)

    def duplicate_annotations(self, count):
        """
        Create duplicates of the annotations by suffixing ids and filenames.

        For every copy index in range(count), a fixed-width suffix is
        appended to each image id, annotation id and image file stem, so
        every copy has its own unique ids.

        Parameters
        ----------
        count : int
            The number of times to duplicate the annotations

        Returns
        -------
        dict
            A dictionary of new annotations
        """
        new_annotations = []
        new_images = []
        for num in range(count):
            suffix = self._copy_suffix(num, count)
            for anno in self.instances['annotations']:
                anno_copy = anno.copy()
                anno_copy['image_id'] = int(
                    "{}{}".format(anno['image_id'], suffix))
                # Annotation ids must stay unique across copies as well.
                if 'id' in anno_copy:
                    anno_copy['id'] = int("{}{}".format(anno['id'], suffix))
                new_annotations.append(anno_copy)
            for image in self.instances['images']:
                image_copy = image.copy()
                stem, ext = os.path.splitext(image_copy['file_name'])
                image_copy['file_name'] = "{}{}{}".format(stem, suffix, ext)
                image_copy['id'] = int("{}{}".format(image_copy['id'], suffix))
                new_images.append(image_copy)
        return self._create_new_annotations(new_annotations, new_images)

    def create_subset_dir(self, dir):
        """
        Create the output directory layout, refusing to reuse an existing one

        Creates ``dir`` together with its ``annotations`` and ``train2017``
        subdirectories. Missing parent directories are created too.

        Parameters
        ----------
        dir : str or Path
            filepath for output

        Returns
        -------
        None

        Raises
        ------
        AssertionError
            If directory already exists, return error
        """
        dir = Path(dir)
        # Raised explicitly (not via ``assert``) so the check survives
        # ``python -O``; the exception type is unchanged for callers.
        if dir.exists():
            raise AssertionError("directory {} exists".format(dir.as_posix()))
        dir.mkdir(parents=True)
        dir.joinpath('annotations').mkdir()
        dir.joinpath('train2017').mkdir()

    def create_subset(self, images, dir):
        """
        Create a new dataset based on a list of images

        Copies the selected image files and writes a filtered
        instances_train2017.json next to them.

        Parameters
        ----------
        images : list[int]
            A list of image ids
        dir : str
            path for output

        Returns
        -------
        None

        Raises
        ------
        AssertionError
            If the output directory already exists
        KeyError
            If an id in ``images`` has no image file in train2017
        """
        dir = Path(dir)
        self.create_subset_dir(dir)
        image_dir = dir.joinpath('train2017')
        for image_id in images:
            source = self.images[image_id]
            shutil.copy(source, image_dir.joinpath(source.name))
        anno_path = dir.joinpath('annotations').joinpath(
            'instances_train2017.json')
        with open(anno_path, 'w') as anno_file:
            json.dump(self.filter_annotations(images), anno_file)

    def duplicate_dataset(self, count, dir):
        """
        Create a new dataset with duplicated images

        Every image file is copied ``count`` times using the same suffix
        scheme as ``duplicate_annotations``, so file names on disk match the
        ``file_name`` entries of the written annotations.

        Parameters
        ----------
        count : int
            Number of duplicates to generate
        dir : str
            output directory

        Returns
        -------
        None

        Raises
        ------
        AssertionError
            If the output directory already exists
        """
        dir = Path(dir)
        self.create_subset_dir(dir)
        new_annotations = self.duplicate_annotations(count)
        image_dir = dir.joinpath('train2017')
        for image in self.images.values():
            for num in range(count):
                new_file = "{}{}{}".format(
                    image.stem, self._copy_suffix(num, count), image.suffix)
                shutil.copy(image, image_dir.joinpath(new_file))
        anno_path = dir.joinpath('annotations').joinpath(
            'instances_train2017.json')
        with open(anno_path, 'w') as outfile:
            json.dump(new_annotations, outfile)