"""Sign & Speak Inference Script This script defines the methods required by Amazon SageMaker to create an inference endpoint for the Sign & Speak project. It expects image input, where each image is a 3x3 grid of video frames, and is stored in an Amazon S3 bucket. The output is the text version of the label and a confidence score. Input sent to the endpoint should be JSON with the following format: {'grid': } Output returned will be JSON with the following format: {'output': , 'confidence': } """ import json import logging import os import tempfile import re import boto3 import torch from torchvision import transforms from PIL import Image logger = logging.getLogger(__name__) JSON_CONTENT_TYPE = 'application/json' # Define a data transformation similar to the one used to train the original ResNet model transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ]) classes = {} def model_fn(model_dir): """ Loads the trained model from the model directory. """ logger.info('Loading the model.') logger.info(model_dir) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") logger.info('Current device: {}'.format(device)) if device == "cuda": model = torch.load(os.path.join(model_dir, 'model.pth')) #GPU else: model = torch.load(os.path.join(model_dir, 'model.pth'), map_location="cpu") #CPU model.to(device).eval() logger.info('Loading the classes.') global classes with open(os.path.join(model_dir, 'class_indices.json'), 'r') as file_handler: classes = json.load(file_handler) return model def input_fn(serialized_input_data, content_type=JSON_CONTENT_TYPE): """ Loads the JSON input, fetches the image from S3, and transforms the image. """ logger.info('Deserializing the input data.') if content_type == JSON_CONTENT_TYPE: # Load JSON input input_data = json.loads(serialized_input_data) image_loc_s3 = input_data['grid'] # Fetch bucket and object details url_components = re.search("s3://(.+?)/(.*)", image_loc_s3) bucket_name = url_components.group(1) object_name = url_components.group(2) # Load image file from S3 bucket tmp = tempfile.NamedTemporaryFile() with open(tmp.name, 'wb') as file_handle: s3_client = boto3.client('s3') s3_client.download_fileobj(bucket_name, object_name, file_handle) image = Image.open(tmp.name) # Transform image same as during training transformed_image = transform(image) model_input = transformed_image.unsqueeze(0) return model_input raise Exception("Requested unsupported ContentType in content_type: {}".format(content_type)) def output_fn(prediction_output, accept=JSON_CONTENT_TYPE): """ Transforms the model output to return the text label instead of its index. Returns the result as JSON. """ logger.info('Serializing the generated output.') logger.info("Original output is {}".format(prediction_output)) # Normalize the confidence value to be a float value between 0 and 1 normalized_output = torch.nn.functional.softmax(prediction_output[0], dim=0) batched_norm = normalized_output.unsqueeze(0) values, indices = torch.max(batched_norm.data, 1) # Fetch the text label based on the label index #classes = {"cat": 0, "eight o clock": 1, "friend": 2, "good how are you": 3, "goodbye": 4, "grandfather": 5, "grandmother": 6, "hello": 7, "pleased to meet you": 8, "pub": 9, "restaurant": 10, "thank you": 11} for label, index in classes.items(): if index == indices.item(): class_from_idx = label # Format and return the final result if accept == JSON_CONTENT_TYPE: return json.dumps({'output': class_from_idx, 'confidence': values.item()}), accept raise Exception('Requested unsupported ContentType in Accept: ' + accept)