# Use this script to convert annotation xmls to a single annotations.json file
# that will be taken by Jumpstart OD model.
# Reference: XML2JSON.py https://linuxtut.com/en/e391e5e6924945b8a852/
import copy
import glob
import json
import os
import random
from collections import defaultdict

import xmltodict

# Defect classes; "id" is the category id written into every annotation.
categories = [
    {"id": 1, "name": "crazing"},
    {"id": 2, "name": "inclusion"},
    {"id": 3, "name": "pitted_surface"},
    {"id": 4, "name": "patches"},
    {"id": 5, "name": "rolled-in_scale"},
    {"id": 6, "name": "scratches"},
]


def _write_annotations(path, images, annotations):
    """Serialize the images/annotations pair as JSON and write it to *path*."""
    payload = {"images": images, "annotations": annotations}
    with open(path, "w") as f:
        f.write(json.dumps(payload))


def XML2JSON(xmlFiles, test_ratio=None, rnd_seed=100):
    """
    Convert all xmls to annotations.json.

    If the test_ratio is not None, convert to two annotations.json files,
    one for train+val ("annotations.json"), another one for test
    ("test_annotations.json"). Both files are written to the current
    working directory.

    Args:
        xmlFiles: iterable of paths to annotation XML files. Image ids are
            assigned sequentially (starting at 1) in iteration order.
        test_ratio: fraction of each group of images to put in the test
            split, or None to write a single file. Must satisfy
            0 <= test_ratio < 1.
        rnd_seed: seed passed to ``random.seed`` before shuffling, so the
            split is reproducible.

    Raises:
        ValueError: if test_ratio is given but outside [0, 1).
    """
    # Validate up front with a real exception: an ``assert`` is stripped when
    # Python runs with -O, and a negative ratio would silently produce a
    # nonsensical split.
    if test_ratio is not None and not 0.0 <= test_ratio < 1.0:
        raise ValueError(
            "test_ratio must be in the range [0, 1), got {!r}".format(test_ratio)
        )

    category_id_by_name = {entry["name"]: entry["id"] for entry in categories}

    images = []
    annotations = []
    annotation_id = 1
    for image_id, annotation_path in enumerate(xmlFiles, start=1):
        with open(annotation_path) as fd:
            # force_list must be a tuple: a bare string makes xmltodict do a
            # substring test against the tag name instead of an exact match.
            doc = xmltodict.parse(fd.read(), force_list=("object",))
        root = doc["annotation"]

        filename = str(root["filename"])
        images.append(
            {
                "file_name": filename if filename.endswith(".jpg") else filename + ".jpg",
                "height": int(root["size"]["height"]),
                "width": int(root["size"]["width"]),
                "id": image_id,
            }
        )

        if "object" not in root:
            print("File: {} doesn't have any object".format(annotation_path))
            continue

        for obj in root["object"]:
            category_id = category_id_by_name.get(str(obj["name"]))
            if category_id is None:
                # Objects whose name is not in `categories` are skipped.
                continue
            box = obj["bndbox"]
            annotations.append(
                {
                    "image_id": image_id,
                    # Corner format [xmin, ymin, xmax, ymax].
                    "bbox": [
                        int(box["xmin"]),
                        int(box["ymin"]),
                        int(box["xmax"]),
                        int(box["ymax"]),
                    ],
                    "category_id": category_id,
                    "id": annotation_id,
                }
            )
            annotation_id += 1

    if test_ratio is None:
        _write_annotations("annotations.json", images, annotations)
        return

    # Group image ids by the file-name prefix before the first underscore so
    # that the split is stratified per group.
    category_ids = defaultdict(list)
    for img in images:
        category = img["file_name"].split("_")[0]
        category_ids[category].append(img["id"])

    print('\ncategory\tnum of images')
    print('-' * 20)

    random.seed(rnd_seed)

    train_val_images = []
    test_images = []
    train_val_annotations = []
    test_annotations = []

    for category, ids in category_ids.items():
        print(f"{category}:\t{len(ids)}")

        random.shuffle(ids)
        sep = int(len(ids) * test_ratio)
        # Sets give O(1) membership tests; the comprehensions below still
        # walk `images` / `annotations` in their original order.
        test_ids = set(ids[:sep])
        train_val_ids = set(ids[sep:])

        test_images.extend(img for img in images if img["id"] in test_ids)
        train_val_images.extend(img for img in images if img["id"] in train_val_ids)
        test_annotations.extend(
            ann for ann in annotations if ann["image_id"] in test_ids
        )
        train_val_annotations.extend(
            ann for ann in annotations if ann["image_id"] in train_val_ids
        )

    print('-' * 20)

    print(f"\ntrain_val:\t{len(train_val_images)}")
    _write_annotations("annotations.json", train_val_images, train_val_annotations)

    print(f"test:\t{len(test_images)}")
    _write_annotations("test_annotations.json", test_images, test_annotations)


def convert_to_pycocotools_ground_truth(annotations_file):
    """
    Given the annotation json file for the test data generated during
    initial data preparation, convert it to the input format pycocotools
    can consume.

    Each bbox is converted from [xmin, ymin, xmax, ymax] to [x, y, W, H],
    and "area" and "iscrowd" fields are added to every annotation. The
    module-level `categories` list is included in the output.

    Args:
        annotations_file: path to a JSON file with "images" and
            "annotations" keys, as written by XML2JSON.

    Returns:
        The path of the written file,
        "results/ground_truth_annotations.json".
    """
    with open(annotations_file) as f:
        images_annotations = json.loads(f.read())

    attrDict = dict()
    attrDict["images"] = images_annotations["images"]
    attrDict["categories"] = categories

    annotations = []
    for entry in images_annotations["annotations"]:
        ann = copy.deepcopy(entry)
        xmin, ymin, xmax, ymax = ann["bbox"]
        ann["bbox"] = [xmin, ymin, xmax - xmin, ymax - ymin]  # convert to [x, y, W, H]
        ann["area"] = (xmax - xmin) * (ymax - ymin)
        ann["iscrowd"] = 0
        annotations.append(ann)

    attrDict["annotations"] = annotations

    ground_truth_annotations = "results/ground_truth_annotations.json"
    # Create the output directory if needed; otherwise open() raises
    # FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(ground_truth_annotations), exist_ok=True)
    with open(ground_truth_annotations, "w") as f:
        f.write(json.dumps(attrDict))

    return ground_truth_annotations


if __name__ == "__main__":
    data_path = '../NEU-DET/ANNOTATIONS'
    xmlfiles = glob.glob(os.path.join(data_path, '*.xml'))
    xmlfiles.sort()

    XML2JSON(xmlfiles, test_ratio=0.2)