# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: MIT-0 import boto3 import json import pandas as pd import s3fs import time import utils from datetime import datetime def create_manifest_from_bucket(bucket, prefix, folder, labels, output_bucket): """ Based on a bucket / prefix location on S3, this function will crawl this location for images and generate a JSON manifest file compatible with Rekognition Custom Labels. PARAMS ====== bucket (string) - bucket name prefix (string) - S3 prefix where to look for the images folder (string) - either train or test labels (list) - list of labels to look for (normal, anomaly) output_bucket (string) - where to upload the JSON manifest file to """ # Get a creation date: creation_date = str(pd.to_datetime(datetime.now()))[:23].replace(' ','T') # Assign a distinct identifier for each label: auto_label = {} for index, label in enumerate(labels): auto_label.update({label: index + 1}) # Get a handle on an S3 filesystem object: fs = s3fs.S3FileSystem() # Create a manifest file in the output directory passed as argument: with fs.open(output_bucket + f'/{folder}.manifest', 'w') as f: # We expect one subfolder for each label: for label in labels: # Loops through each file present at this point: for file in fs.ls(path=f'{bucket}/{prefix}/{folder}/{label}/', detail=True): # We only care for files, not directories: if file['Size'] > 0: key = file['Key'] # Build a Ground Truth format manifest row: manifest_row = { 'source-ref': f's3://{key}', 'auto-label': auto_label[label], 'auto-label-metadata': { 'confidence': 1, 'job-name': 'labeling-job/auto-label', 'class-name': label, 'human-annotated': 'yes', 'creation-date': creation_date, 'type': 'groundtruth/image-classification' } } # Write this line to the manifest: f.write(json.dumps(manifest_row, indent=None) + '\n') def start_model(project_arn, model_arn, version_name, min_inference_units=1): """ Start a Rekognition Custom Labels model. PARAMS ====== project_arn (string) - project ARN model_arn (string) - project version ARN version_name (string) - project version name min_inference_units (integer) - inference unit to provision for the endpoint which will be deployed for this particular project version. """ client = boto3.client('rekognition') try: # Start the model print('Starting model: ' + model_arn) response = client.start_project_version(ProjectVersionArn=model_arn, MinInferenceUnits=min_inference_units) # Wait for the model to be in the running state: project_version_running_waiter = client.get_waiter('project_version_running') project_version_running_waiter.wait(ProjectArn=project_arn, VersionNames=[version_name]) # Get the running status describe_response=client.describe_project_versions(ProjectArn=project_arn, VersionNames=[version_name]) for model in describe_response['ProjectVersionDescriptions']: print("Status: " + model['Status']) print("Message: " + model['StatusMessage']) except Exception as e: print(e) print('Done.') def stop_model(model_arn): """ Stops a Rekognition Custom Labels model. PARAMS ====== model_arn (string) - project version ARN """ print('Stopping model:' + model_arn) # Stop the model: try: reko = boto3.client('rekognition') response = reko.stop_project_version(ProjectVersionArn=model_arn) status = response['Status'] print('Status: ' + status) except Exception as e: print(e) print('Done.') def show_custom_labels(model, bucket, image, min_confidence): """ Calls the Rekognition detect_custom_labels() API to get the prediction for a given image. PARAMS ====== model (string) - project version ARN bucket (string) - bucket where the image is located image (string) - complete S3 prefix where the image is located min_confidence (float) - minimum confidence score to return a result RETURNS ======= Returns the custom label response """ # Call DetectCustomLabels from the Rekognition API: this will give us the list # of labels detected for this picture and their associated confidence level: reko = boto3.client('rekognition') try: response = reko.detect_custom_labels( Image={'S3Object': {'Bucket': bucket, 'Name': image}}, MinConfidence=min_confidence, ProjectVersionArn=model ) except Exception as e: print(f'Exception encountered when processing {image}') print(e) # Returns the list of custom labels for the image passed as an argument: return response['CustomLabels'] def get_results(project_version_arn, bucket, s3_path, label=None, verbose=True): """ Sends a list of pictures located in an S3 path to the endpoint to get the associated predictions. PARAMS ====== project_version_arn (string) - ARN of the model to query bucket (string) - bucket name s3_path (string) - prefix where to look the images for label (string) - ground truth label of the images verbose (boolean) - shows a progress bar if True (defaults to True) RETURNS ======= predictions (dataframe) A dataframe with the following columns: image, abnormal probability, normal probability and ground truth. """ fs = s3fs.S3FileSystem() data = {} counter = 0 predictions = pd.DataFrame(columns=['image', 'normal', 'abnormal']) for file in fs.ls(path=s3_path, detail=True, refresh=True): if file['Size'] > 0: image = '/'.join(file['Key'].split('/')[1:]) if verbose == True: print('.', end='') labels = show_custom_labels(project_version_arn, bucket, image, 0.0) for L in labels: data[L['Name']] = L['Confidence'] predictions = predictions.append(pd.Series({ 'image': file['Key'].split('/')[-1], 'abnormal': data['abnormal'], 'normal': data['normal'], 'ground truth': label }), ignore_index=True) # Temporization to prevent any throttling: counter += 1 if counter % 100 == 0: if verbose == True: print('|', end='') time.sleep(1) return predictions def reshape_results(df, unknown_threshold=50.0): """ Reshape a results dataframe containing image path, normal and abnormal confidence levels into a more straightforward one with ground truth, prediction and confidence level associated to each image. PARAMS ====== df (dataframe) Input dataframe with the following columns: image, ground truth, normal and abnormal. unknown_threshold (float) If a probability is lower than this threshold, we select the other result (defaults to 50.0). """ new_val_predictions = pd.DataFrame(columns=['Image', 'Ground Truth', 'Prediction', 'Confidence Level']) for index, row in df.iterrows(): new_row = pd.Series(dtype='object') new_row['Image'] = row['image'] new_row['Ground Truth'] = row['ground truth'] if row['normal'] >= unknown_threshold: new_row['Prediction'] = 'normal' new_row['Confidence Level'] = row['normal'] / 100 elif row['abnormal'] >= unknown_threshold: new_row['Prediction'] = 'abnormal' new_row['Confidence Level'] = row['abnormal'] / 100 else: new_row['Prediction'] = 'unknown' new_row['Confidence Level'] = 0.0 new_val_predictions = new_val_predictions.append(pd.Series(new_row), ignore_index=True) return new_val_predictions def classification_report(input_df): """ Generates a classification report (similar to what Amazon Rekognition Custom Labels shows in the console) based on the input_df dataframe. PARAMS ====== RETURNS ======= performance (Pandas Series) Returns a Pandas series with the following attributes: - Label name: 'normal' - F1 score - Number of test images - Precision score - Recall score - Assumed threshold (computed as the confidence level minimum) """ input_df = utils.generate_error_types(input_df, normal_label='normal', anomaly_label='abnormal') # Abnormal samples: df = input_df[input_df['Ground Truth'] == 'abnormal'] TP = df['TN'].sum() FN = df['FN'].sum() FP = df['FP'].sum() recall = TP / (TP + FN) precision = TP / (TP + FP) f1_score = 2 * TP / (2 * TP + FP + FN) min_confidence_level = df.sort_values(by='Confidence Level', ascending=True).iloc[0]['Confidence Level'] performance = pd.DataFrame(columns=['Label name', 'F1 score', 'Test images', 'Precision', 'Recall', 'Assumed threshold']) performance = performance.append(pd.Series({ 'Label name': 'abnormal', 'F1 score': round(f1_score, 3), 'Test images': input_df[input_df['Ground Truth'] == 'abnormal'].shape[0], 'Precision': precision, 'Recall': recall, 'Assumed threshold': round(min_confidence_level,3) }), ignore_index=True) # Normal samples: df = input_df[input_df['Ground Truth'] == 'normal'] TP = df['TP'].sum() FN = df['FN'].sum() FP = df['FP'].sum() recall = TP / (TP + FN) precision = TP / (TP + FP) f1_score = 2 * TP / (2 * TP + FP + FN) min_confidence_level = df.sort_values(by='Confidence Level', ascending=True).iloc[0]['Confidence Level'] performance = performance.append(pd.Series({ 'Label name': 'normal', 'F1 score': round(f1_score,3), 'Test images': input_df[input_df['Ground Truth'] == 'normal'].shape[0], 'Precision': precision, 'Recall': recall, 'Assumed threshold': round(min_confidence_level,3) }), ignore_index=True) return performance