"""Helpers for fetching MNIST from S3 and preparing it as numpy arrays."""
import gzip
import json
import os
from urllib import request

import boto3
import numpy as np

# Settings are read from ``config.json`` located next to this module.
dirname = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(dirname, "config.json"), "r") as f:
    CONFIG = json.load(f)

# Key prefix under which the MNIST archives live in the bucket.
_MNIST_S3_PREFIX = "datasets/image/MNIST"

_SUPPORTED_FRAMEWORKS = ('pytorch', 'tensorflow', 'mxnet')


def mnist_to_numpy(data_dir='/tmp/data', train=True):
    """Download MNIST dataset and convert it to numpy array

    The gzip archives are fetched from the S3 bucket named by
    ``CONFIG["public_bucket"]``. An archive that already exists in
    ``data_dir`` is not downloaded again.

    Args:
        data_dir (str): directory to save the data
        train (bool): download the training set if True, otherwise the
            ``t10k`` (test) set

    Returns:
        tuple of images and labels as numpy arrays
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(data_dir, exist_ok=True)

    if train:
        images_file = "train-images-idx3-ubyte.gz"
        labels_file = "train-labels-idx1-ubyte.gz"
    else:
        images_file = "t10k-images-idx3-ubyte.gz"
        labels_file = "t10k-labels-idx1-ubyte.gz"

    # download objects
    s3 = boto3.client('s3')
    bucket = CONFIG["public_bucket"]
    for obj in (images_file, labels_file):
        # S3 keys always use "/" as separator, whatever the local OS is,
        # so the key must not be built with os.path.join.
        key = "/".join((_MNIST_S3_PREFIX, obj))
        dest = os.path.join(data_dir, obj)
        if not os.path.exists(dest):
            s3.download_file(bucket, key, dest)
    return _convert_to_numpy(data_dir, images_file, labels_file)


def _convert_to_numpy(data_dir, images_file, labels_file):
    """Byte string to numpy arrays

    Args:
        data_dir (str): directory containing the two gzip archives
        images_file (str): file name of the gzipped image archive
        labels_file (str): file name of the gzipped label archive

    Returns:
        tuple ``(images, labels)`` of ``uint8`` arrays; ``images`` is
        reshaped to ``(-1, 28, 28)`` and ``labels`` is one-dimensional
    """
    with gzip.open(os.path.join(data_dir, images_file), 'rb') as f:
        # the first 16 bytes are skipped (file header), the rest is pixels
        images = np.frombuffer(
            f.read(), np.uint8, offset=16).reshape(-1, 28, 28)

    with gzip.open(os.path.join(data_dir, labels_file), 'rb') as f:
        # the first 8 bytes are skipped (file header), the rest is labels
        labels = np.frombuffer(f.read(), np.uint8, offset=8)

    return (images, labels)


def normalize(x, axis):
    """Shift and scale ``x`` by its mean and standard deviation

    Args:
        x (numpy.ndarray): data to normalize
        axis (int or tuple of int): axis or axes along which the mean and
            standard deviation are computed

    Returns:
        numpy.ndarray: ``(x - mean) / (std + eps)``, with the statistics
        broadcast back against ``x``
    """
    eps = np.finfo(float).eps
    mean = np.mean(x, axis=axis, keepdims=True)
    # avoid division by zero
    std = np.std(x, axis=axis, keepdims=True) + eps
    return (x - mean) / std


def adjust_to_framework(x, framework='pytorch'):
    """Adjust a ``numpy.ndarray`` to be used as input for specified framework

    A 3-dimensional input is treated as a batch of gray-scale images and
    gets a channel axis of size 1 inserted at position 1.

    Args:
        x (numpy.ndarray): Batch of images to be adjusted
            to follow the convention in pytorch / tensorflow / mxnet
        framework (str): Framework to use. Takes value in ``pytorch``,
            ``tensorflow`` or ``mxnet``

    Return:
        numpy.ndarray following the convention of tensors in the given
        framework: unchanged channel-second layout for ``pytorch`` and
        ``mxnet``, channel-last layout for ``tensorflow``

    Raises:
        ValueError: if ``framework`` is not one of the supported names
    """
    if framework not in _SUPPORTED_FRAMEWORKS:
        raise ValueError(
            'framework must be one of '
            '[pytorch, tensorflow, mxnet], got {}'.format(framework))

    if x.ndim == 3:
        # input is gray-scale
        x = np.expand_dims(x, 1)

    if framework == 'tensorflow':
        # depth-minor: move the channel axis to the end
        return np.transpose(x, (0, 2, 3, 1))

    # pytorch and mxnet are depth-major: nothing more to do
    return x


if __name__ == '__main__':
    X, Y = mnist_to_numpy()
    X, Y = X.astype(np.float32), Y.astype(np.int8)