from io import StringIO
import json

import s3fs
import boto3
import pandas as pd


def parse_gt_output(manifest_path, job_name):
    """
    Captures the JSON Ground Truth bounding box annotations into a pandas DataFrame.

    Input:
    manifest_path: S3 path to the annotation file
    job_name: name of the Ground Truth job

    Returns:
    df_bbox: pandas DataFrame with the bounding box coordinates
             for each item in every image
    """
    filesys = s3fs.S3FileSystem()
    with filesys.open(manifest_path) as fin:
        annot_list = []
        for line in fin.readlines():
            record = json.loads(line)
            # Skip manifest lines that carry no annotations from this job
            # (e.g., images that failed labeling or belong to a different job).
            if job_name in record.keys():
                image_file_path = record["source-ref"]
                image_file_name = image_file_path.split("/")[-1]
                class_maps = record[f"{job_name}-metadata"]["class-map"]

                imsize_list = record[job_name]["image_size"]
                assert len(imsize_list) == 1
                image_width = imsize_list[0]["width"]
                image_height = imsize_list[0]["height"]

                for annot in record[job_name]["annotations"]:
                    left = annot["left"]
                    top = annot["top"]
                    height = annot["height"]
                    width = annot["width"]
                    class_name = class_maps[f'{annot["class_id"]}']

                    annot_list.append(
                        [
                            image_file_name,
                            class_name,
                            left,
                            top,
                            height,
                            width,
                            image_width,
                            image_height,
                        ]
                    )

    df_bbox = pd.DataFrame(
        annot_list,
        columns=[
            "img_file",
            "category",
            "box_left",
            "box_top",
            "box_height",
            "box_width",
            "img_width",
            "img_height",
        ],
    )
    return df_bbox


def save_df_to_s3(df_local, s3_bucket, destination):
    """
    Saves a pandas DataFrame to S3 as a CSV.

    Input:
    df_local: DataFrame to save
    s3_bucket: Bucket name
    destination: Object key (prefix plus file name) for the CSV
    """
    csv_buffer = StringIO()
    s3_resource = boto3.resource("s3")

    df_local.to_csv(csv_buffer, index=False)
    s3_resource.Object(s3_bucket, destination).put(Body=csv_buffer.getvalue())


def main():
    """
    Performs the following tasks:
    1. Reads the input parameters from 'input.json'
    2. Parses the Ground Truth annotations into a DataFrame
    3. Saves the DataFrame to S3 as a CSV
    """
    with open("input.json") as fjson:
        input_dict = json.load(fjson)

    s3_bucket = input_dict["s3_bucket"]
    job_id = input_dict["job_id"]
    gt_job_name = input_dict["ground_truth_job_name"]

    mani_path = f"s3://{s3_bucket}/{job_id}/ground_truth_annots/{gt_job_name}/manifests/output/output.manifest"

    df_annot = parse_gt_output(mani_path, gt_job_name)
    dest = f"{job_id}/ground_truth_annots/{gt_job_name}/annot.csv"
    save_df_to_s3(df_annot, s3_bucket, dest)


if __name__ == "__main__":
    main()
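
# For reference, a minimal sketch of the inputs this script expects. The field
# names come from the parsing code above; every concrete value (bucket, job_id,
# job name, image path, class label, coordinates) is a placeholder assumption.
#
# input.json, read by main():
#
#   {
#       "s3_bucket": "my-bucket",
#       "job_id": "run-001",
#       "ground_truth_job_name": "my-bbox-job"
#   }
#
# One line of output.manifest, parsed by parse_gt_output(); other metadata
# fields that Ground Truth emits are omitted here:
#
#   {
#       "source-ref": "s3://my-bucket/images/img_0001.jpg",
#       "my-bbox-job": {
#           "image_size": [{"width": 1280, "height": 720}],
#           "annotations": [
#               {"class_id": 0, "left": 100, "top": 40, "width": 250, "height": 180}
#           ]
#       },
#       "my-bbox-job-metadata": {
#           "class-map": {"0": "bee"}
#       }
#   }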