{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Preparing Deep Learning Training and HPO Code in a Local Sagemaker Instance for Dockerizing\n", "\n", "This notebook shows an example of a running Bayesian HPO and also training for a regression deep neural network written in Keras with a Tensorflow backend." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1- Load libraries" ] }, { "cell_type": "code", "execution_count": 134, "metadata": {}, "outputs": [], "source": [ "from __future__ import print_function\n", "\n", "import os\n", "import sys\n", "import traceback\n", "import json\n", "import numpy as np\n", "import pandas as pd\n", "import tensorflow as tf\n", "from sklearn.model_selection import train_test_split\n", "from sklearn import preprocessing\n", "\n", "import warnings\n", "warnings.simplefilter(\"ignore\")\n", "\n", "from keras.callbacks import EarlyStopping\n", "from keras.callbacks import ReduceLROnPlateau\n", "from keras.callbacks import ModelCheckpoint\n", "from keras.layers import Dropout, Dense\n", "from keras.wrappers.scikit_learn import KerasRegressor\n", "from keras.layers.normalization import BatchNormalization\n", "from keras.models import Sequential\n", "\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.model_selection import GridSearchCV\n", "from pickle import dump\n", "\n", "from timeit import default_timer as timer\n", "from hyperopt import STATUS_OK\n", "from hyperopt import hp\n", "from hyperopt import tpe\n", "from hyperopt import Trials\n", "from hyperopt import fmin" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2- Function for preparing our data" ] }, { "cell_type": "code", "execution_count": 135, "metadata": {}, "outputs": [], "source": [ "def data_prep(train_data): \n", "\n", " if not final_training:\n", " skip = int(100/int(used_data_percentage))\n", " train_data = train_data[::skip]\n", "\n", " train_data = train_data.dropna()\n", " print(train_data.columns)\n", "\n", " train_data = train_data.astype('float32')\n", " \n", " train_x = train_data.drop([target], axis=1)\n", " train_y = train_data[target]\n", "\n", " train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size = train_validation_split)\n", " \n", " scaler = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(train_x)\n", " dump(scaler, open(os.path.join(model_path, 'scaler.pkl'), 'wb'))\n", " \n", " train_x = scaler.transform(train_x)\n", " val_x = scaler.transform(val_x)\n", "\n", " return train_x, train_y, val_x, val_y" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3- Function for doing a final training" ] }, { "cell_type": "code", "execution_count": 136, "metadata": {}, "outputs": [], "source": [ "def train_final_model(params):\n", " input_files = [ os.path.join(training_path, file) for file in os.listdir(training_path) ]\n", " if len(input_files) == 0:\n", " raise ValueError(('There are no files in {}.\\n' +\n", " 'This usually indicates that the channel ({}) was incorrectly specified,\\n' +\n", " 'the data specification in S3 was incorrectly specified or the role specified\\n' +\n", " 'does not have permission to access the data.').format(training_path, channel_name))\n", " raw_data = [ pd.read_csv(file) for file in input_files if file.endswith('.csv')]\n", " raw_data = pd.concat(raw_data)\n", " train_x, train_y, test_x, test_y = data_prep(raw_data)\n", " print('data loaded') \n", " start = timer()\n", " \n", " 
#######################################################\n", " model = Sequential()\n", " for i in range(params['num_dense_layers']-1):\n", " if i ==0:\n", " model.add(Dense(params['num_dense_nodes']['num_dense_nodes_1'], kernel_initializer='normal',input_dim = train_x.shape[1], activation='relu'))\n", " if batch_normalization:\n", " model.add(BatchNormalization())\n", " if include_dropout:\n", " model.add(Dropout(params['dropout']))\n", " else:\n", " model.add(Dense(params['num_dense_nodes']['num_dense_nodes_'+str(i+1)], kernel_initializer='normal', activation='relu'))\n", " if batch_normalization:\n", " model.add(BatchNormalization())\n", " if include_dropout:\n", " model.add(Dropout(params['dropout']))\n", "\n", " model.add(Dense(1, kernel_initializer='normal',activation=params['last_activation_f']))\n", " if batch_normalization:\n", " model.add(BatchNormalization())\n", " model.compile(loss=loss_metric, optimizer = params['optimizer'], metrics=[loss_metric])\n", " model.summary()\n", "\n", " earlyStopping = EarlyStopping(monitor= monitor_metric, patience=early_stopping_patience, verbose=0, mode='min')\n", " mcp_save = ModelCheckpoint('.mdl_wts.hdf5', save_best_only=True, monitor= monitor_metric, mode='min')\n", " reduce_lr_loss = ReduceLROnPlateau(monitor= monitor_metric, factor=0.1, patience=lr_update_patience, verbose=1, epsilon=1e-4, mode='min')\n", "\n", " history = model.fit(train_x, train_y,\n", " callbacks=[earlyStopping, mcp_save, reduce_lr_loss],\n", " epochs=params['nb_epochs'],\n", " verbose=2,\n", " validation_data=(test_x, test_y))\n", "\n", " predictions=model.predict(test_x)\n", "\n", " df = pd.DataFrame(columns=['Actual','Predicted'])\n", " df['Actual'] = test_y\n", " df['Predicted'] = predictions\n", " diff = abs(df['Actual'] - df['Predicted'])/df['Actual'] \n", " q95 = diff.quantile(.95)\n", "\n", "\n", " ###########\n", " # serialize model to JSON\n", " model_json = model.to_json()\n", " with open(os.path.join(model_path, 'model.json'), \"w\") as json_file:\n", " json_file.write(model_json)\n", " # serialize weights to HDF5\n", " model.save_weights(os.path.join(model_path, 'model.h5'))\n", " print(\"Saved model to disk\")\n", " ###########\n", " print('q95 {}'.format(q95))\n", " run_time = timer() - start\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4- Function for doing Bayesian HPO" ] }, { "cell_type": "code", "execution_count": 137, "metadata": {}, "outputs": [], "source": [ "best_q95 = 10e10\n", "def objective(params):\n", " \"\"\"Objective function for Gradient Boosting Machine Hyperparameter Tuning\"\"\"\n", " \n", " \n", " input_files = [ os.path.join(training_path, file) for file in os.listdir(training_path) ]\n", " if len(input_files) == 0:\n", " raise ValueError(('There are no files in {}.\\n' +\n", " 'This usually indicates that the channel ({}) was incorrectly specified,\\n' +\n", " 'the data specification in S3 was incorrectly specified or the role specified\\n' +\n", " 'does not have permission to access the data.').format(training_path, channel_name))\n", " raw_data = [ pd.read_csv(file) for file in input_files if file.endswith('.csv')]\n", " raw_data = pd.concat(raw_data)\n", "\n", " train_x, train_y, test_x, test_y = data_prep(raw_data)\n", " print('data loaded')\n", "\n", " global ITERATION\n", " print('Iteration: {}'.format(ITERATION))\n", " \n", " ITERATION += 1\n", " start = timer()\n", " \n", " #######################################################\n", " model = Sequential()\n", "\n", " for i in 
range(params['num_dense_layers']-1):\n", " if i ==0:\n", " model.add(Dense(params['num_dense_nodes']['num_dense_nodes_1'], kernel_initializer='normal',input_dim = train_x.shape[1], activation='relu'))\n", " if batch_normalization:\n", " model.add(BatchNormalization())\n", " if include_dropout:\n", " model.add(Dropout(params['dropout']))\n", " else:\n", " model.add(Dense(params['num_dense_nodes']['num_dense_nodes_'+str(i+1)], kernel_initializer='normal', activation='relu'))\n", " if batch_normalization:\n", " model.add(BatchNormalization())\n", " if include_dropout:\n", " model.add(Dropout(params['dropout']))\n", "\n", " model.add(Dense(1, kernel_initializer='normal',activation= params['last_activation']))\n", " if batch_normalization:\n", " model.add(BatchNormalization())\n", " model.compile(loss=loss_metric, optimizer = params['optimizer'], metrics=[loss_metric])\n", " #model.summary()\n", "\n", " earlyStopping = EarlyStopping(monitor= monitor_metric, patience=early_stopping_patience, verbose=0, mode='min')\n", " reduce_lr_loss = ReduceLROnPlateau(monitor= monitor_metric, factor=0.1, patience=lr_update_patience, verbose=1, epsilon=1e-4, mode='min')\n", "\n", " history = model.fit(train_x, train_y,\n", " callbacks=[earlyStopping, reduce_lr_loss],\n", " epochs=params['nb_epochs'],\n", " verbose=2,\n", " validation_data=(test_x, test_y))\n", "\n", " predictions=model.predict(test_x)\n", "\n", " df = pd.DataFrame(columns=['Actual','Predicted'])\n", " df['Actual'] = test_y\n", " df['Predicted'] = predictions\n", " diff = abs(df['Actual'] - df['Predicted'])/df['Actual'] \n", " q95 = diff.quantile(.95)\n", "\n", " # Save the model if it improves on the best-found performance.\n", " # We use the global keyword so we update the variable outside\n", " # of this function.\n", " global best_q95\n", " global short_model_summary\n", "\n", " # If the classification accuracy of the saved model is improved ...\n", " if q95 < best_q95:\n", " ###########\n", " # serialize model to JSON\n", " model_json = model.to_json()\n", " with open(os.path.join(model_path, 'model.json'), \"w\") as json_file:\n", " json_file.write(model_json)\n", " # serialize weights to HDF5\n", " model.save_weights(os.path.join(model_path, 'model.h5'))\n", " \n", " stringlist = []\n", " model.summary(print_fn=lambda x: stringlist.append(x))\n", " short_model_summary = \"\".join(stringlist)\n", "\n", " print(\"Saved model to disk\")\n", " ###########\n", " \n", " # Update the regression accuracy.\n", " best_q95 = q95\n", " print(100*'=')\n", " print(50*' ',' Iteration: \\n', ITERATION)\n", " print(' q95: \\n{}'.format(q95))\n", " print(' best_q95: \\n {}'.format(best_q95))\n", " print(100*'=')\n", " # Delete the Keras model with these hyper-parameters from memory.\n", " del model\n", " \n", " ####################################################### \n", "\n", " run_time = timer() - start\n", "\n", " # Dictionary with information for evaluation\n", " return {'loss': q95,'params': params, 'iteration': ITERATION,\n", " 'train_time': run_time, 'status': STATUS_OK}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5- Define HyperParameters for Training and HPO (equivalent of section 5-D in the train script) " ] }, { "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [], "source": [ "# Here we define parameters for Final Training or HPO\n", "final_training = True # This flag switched between Final Training mode (True) and HPO mode (False)\n", "\n", "if final_training: # If we are doing Final Training\n", " 
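# Editor's note (hedged): re-assigning final_training inside this branch (and in the else\n", "    # branch below) is redundant, since the flag is already set above; it is left unchanged here.\n", "    # Note that dropout_f below only takes effect when include_dropout is True.\n", "    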
final_training = True\n", " target = 'PE_'\n", " batch_normalization = False\n", " include_dropout = False\n", " dropout_f = .2\n", " early_stopping_patience = 15\n", " train_validation_split = .15\n", " lr_update_patience = 7\n", " loss_metric = 'mae'\n", " monitor_metric = 'val_mean_absolute_error'\n", " num_layers_f = 8\n", " nodes = [1024,64,1024,32,32,64,512] # The number of nodes (length of \"nodes\" list) should be num_layers_f-1 because the last layer has 1 node and is automatically added\n", " nb_epochs_f = 3\n", " batch_size_f = 32\n", " optimizer_f = 'adam'\n", " last_activation_f = 'tanh'\n", " \n", "else: # If we are doing HPO\n", " final_training = False\n", " target = 'PE_'\n", " batch_normalization = False\n", " include_dropout = False\n", " dropout = [.2,.3,.5]\n", " early_stopping_patience = 15\n", " lr_update_patience = 7\n", " loss_metric = 'mae'\n", " monitor_metric = 'val_mean_absolute_error'\n", " used_data_percentage = 10\n", " train_validation_split = .15\n", " MAX_EVALS = 3\n", " randstate = 50\n", " num_layers_low = 1\n", " num_layers_high = 9\n", " choice_of_node_numbers = [16,32,64,128,256,512,1024,2048] # Here you can give the possible node size for layers. If you want to only have small number of nodes, remove the high values from this list. \n", " nb_epochs = 3\n", " batch_size = [32,64,128]\n", " optimizer = ['adam']\n", " last_activation = ['tanh'] # Activation for the layer with one node. Options for this are 'linear' and 'tanh'\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6- Putting above parameters in dictionaries that can be used by Training or HPO functions" ] }, { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [], "source": [ "if final_training: # If we are doing Final Training\n", " parameters = { 'num_dense_layers': num_layers_f,\n", " 'num_dense_nodes': {'num_dense_nodes_'+str(k+1): nodes[k] for k in range(num_layers_f-1)},\n", " 'batch_size' : batch_size_f,\n", " 'nb_epochs' : nb_epochs_f,\n", " 'dropout' : dropout_f,\n", " 'optimizer': optimizer_f,\n", " 'last_activation_f': last_activation_f\n", " }\n", "else: # If we are doing HPO\n", " space = { 'num_dense_layers': hp.choice('num_dense_layers', np.arange(num_layers_low, num_layers_high, dtype=int)),\n", " 'num_dense_nodes': {'num_dense_nodes_'+str(k+1): hp.choice('num_dense_nodes_'+str(k+1), choice_of_node_numbers) for k in range(num_layers_high)},\n", " 'batch_size' : hp.choice('batch_size', batch_size),\n", " 'nb_epochs' : nb_epochs,\n", " 'optimizer': hp.choice('optimizer',optimizer),\n", " 'last_activation': hp.choice('last_activation',last_activation)\n", " }\n", "\n", " if include_dropout:\n", " space['dropout'] = hp.choice('dropout',dropout)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7- This is the main function which runs the final training or HPO" ] }, { "cell_type": "code", "execution_count": 146, "metadata": {}, "outputs": [], "source": [ "def train():\n", " print('Starting the training/HPO.')\n", " try:\n", " if final_training:\n", " print('Starting the final training...')\n", " train_final_model(parameters)\n", " \n", " else:\n", " print('Starting the HPO...')\n", " tpe_algorithm = tpe.suggest\n", " bayes_trials = Trials()\n", "\n", " # Global variable\n", " global ITERATION\n", "\n", " ITERATION = 0\n", " # Run optimization\n", " best = fmin(fn = objective, space = space, algo = tpe.suggest, \n", " max_evals = MAX_EVALS, trials = bayes_trials, rstate = np.random.RandomState(randstate))\n", "\n", "\n", " 
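# Editor's note (hedged): fmin() returns hp.choice parameters as indices into their\n", "            # option lists; if the concrete values are needed afterwards, hyperopt's\n", "            # space_eval(space, best) can map the returned indices back to the actual values.\n", "            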
print('Training is complete.')\n", " # Sort the trials with lowest loss (highest AUC) first\n", " print(100*'=')\n", " print('\\n Best Model:\\n')\n", " bayes_trials_results = sorted(bayes_trials.results, key = lambda x: x['loss'])\n", " \n", " print('Model Summary: \\n\\n',short_model_summary)\n", " print('\\n\\n\\n')\n", " print(bayes_trials_results[0])\n", " print('\\n\\n\\n')\n", " print(100*'=')\n", " \n", " print('\\n 2nd Best Model: \\n')\n", " print(bayes_trials_results[1])\n", " print(100*'=')\n", "\n", " print('\\n 3rd Best Model: \\n')\n", " print(bayes_trials_results[2])\n", " print(100*'=')\n", " \n", " except Exception as e:\n", " # Write out an error file. This will be returned as the failure\n", " # Reason in the DescribeTrainingJob result.\n", " trc = traceback.format_exc()\n", " with open(os.path.join(output_path, 'failure'), 'w') as s:\n", " s.write('Exception during training: ' + str(e) + '\\n' + trc)\n", " # Printing this causes the exception to be in the training job logs\n", " print(\n", " 'Exception during training: ' + str(e) + '\\n' + trc,\n", " file=sys.stderr)\n", " # A non-zero exit code causes the training job to be marked as Failed.\n", " sys.exit(255)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 8- Define directories for data and model artifacts (equivalent of section 8-D in the train script) " ] }, { "cell_type": "code", "execution_count": 147, "metadata": {}, "outputs": [], "source": [ "training_path = 'data'\n", "output_path = '../opt/ml/output' # You can create this outside of current directory.\n", "model_path = '../opt/ml/model'# You can create this outside of current directory." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 9- Run train() function" ] }, { "cell_type": "code", "execution_count": 148, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting the training.\n", "Starting the final training...\n", "Index(['P0', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P11',\n", " 'P12', 'P13', 'P14', 'P16', 'P17', 'P18', 'P19', 'P20', 'P21', 'P22',\n", " 'P23', 'P24', 'P25', 'P26', 'P27', 'AREA_RATIO_', 'SPEED_', 'PE_'],\n", " dtype='object')\n", "data loaded\n", "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "dense_49 (Dense) (None, 1024) 29696 \n", "_________________________________________________________________\n", "dense_50 (Dense) (None, 64) 65600 \n", "_________________________________________________________________\n", "dense_51 (Dense) (None, 1024) 66560 \n", "_________________________________________________________________\n", "dense_52 (Dense) (None, 32) 32800 \n", "_________________________________________________________________\n", "dense_53 (Dense) (None, 32) 1056 \n", "_________________________________________________________________\n", "dense_54 (Dense) (None, 64) 2112 \n", "_________________________________________________________________\n", "dense_55 (Dense) (None, 512) 33280 \n", "_________________________________________________________________\n", "dense_56 (Dense) (None, 1) 513 \n", "=================================================================\n", "Total params: 231,617\n", "Trainable params: 231,617\n", "Non-trainable params: 0\n", "_________________________________________________________________\n", "Train on 104633 samples, validate on 18465 samples\n", "Epoch 1/3\n", " - 33s - loss: 
0.0228 - mean_absolute_error: 0.0228 - val_loss: 0.0116 - val_mean_absolute_error: 0.0116\n", "Epoch 2/3\n", " - 31s - loss: 0.0111 - mean_absolute_error: 0.0111 - val_loss: 0.0091 - val_mean_absolute_error: 0.0091\n", "Epoch 3/3\n", " - 29s - loss: 0.0098 - mean_absolute_error: 0.0098 - val_loss: 0.0086 - val_mean_absolute_error: 0.0086\n", "Saved model to disk\n", "q95 0.048900701105594614\n" ] }, { "ename": "SystemExit", "evalue": "0", "output_type": "error", "traceback": [ "An exception has occurred, use %tb to see the full traceback.\n", "\u001b[0;31mSystemExit\u001b[0m\u001b[0;31m:\u001b[0m 0\n" ] } ], "source": [ "if __name__ == '__main__':\n", " train()\n", "\n", " # A zero exit code causes the job to be marked a Succeeded.\n", " sys.exit(0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 10- Define functions for local inference" ] }, { "cell_type": "code", "execution_count": 154, "metadata": {}, "outputs": [], "source": [ "# This is the file that implements a flask server to do inferences. It's the\n", "# file that you will modify to implement the scoring for your own algorithm.\n", "from __future__ import print_function\n", "\n", "import os\n", "try:\n", " from StringIO import StringIO ## for Python 2\n", "except ImportError:\n", " from io import StringIO ## for Python 3\n", " \n", "import flask\n", "from keras.layers import Dropout, Dense\n", "from keras.wrappers.scikit_learn import KerasRegressor\n", "from keras.models import Sequential\n", "\n", "import tensorflow as tf\n", "import numpy as np\n", "import pandas as pd\n", "from pickle import load\n", "\n", "#############################\n", "from tensorflow import Graph, Session\n", "from keras import backend as K\n", "graph = Graph()\n", "\n", "#############################\n", "\n", "from keras.models import load_model\n", "from sklearn.preprocessing import StandardScaler\n", "from keras.models import model_from_json\n", "\n", "import h5py\n", "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n", "\n", "# prefix = '/opt/ml/'\n", "# model_path = os.path.join(prefix, 'model')\n", "\n", "prefix = '../opt/ml/'\n", "model_path = os.path.join(prefix, 'model')\n", "\n", "\n", "# A singleton for holding the model. 
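\n", "#\n", "# Editor's note (hedged): transform_data() further below calls scaler.fit_transform(dataset),\n", "# which re-fits the saved RobustScaler on the incoming inference batch; scaler.transform(dataset)\n", "# is normally what is intended, so that the scaling fitted at training time is reused unchanged.\n", "#\n", "# 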
This simply loads the model and holds it.\n", "# It has a predict function that does a prediction based on the model and the\n", "# input data.\n", "\n", "def loadmodel(weightFile, jsonFile): \n", " # load json and create model\n", " json_file = open(jsonFile, 'r')\n", " loaded_model_json = json_file.read()\n", " json_file.close()\n", " reg = model_from_json(loaded_model_json)\n", " # load weights into new model\n", " reg.load_weights(weightFile)\n", " print(\"Loaded model from disk\")\n", " return reg\n", " \n", "\n", "class ScoringService(object):\n", " model = None # Where we keep the model when it's loaded\n", "\n", " @classmethod\n", " def get_model(cls):\n", " \"\"\"\n", " Get the model object for this instance,\n", " loading it if it's not already loaded.\n", " \"\"\"\n", " if cls.model is None:\n", " cls.model = loadmodel(os.path.join(model_path, 'model.h5'),os.path.join(model_path, 'model.json'))\n", " return cls.model\n", "\n", " \n", " @classmethod\n", " def predict(cls,input):\n", " \"\"\"For the input, do the predictions and return them.\n", "\n", " Args:\n", " input (a pandas dataframe): The data on which to do the\n", " predictions.\n", "\n", " There will be one prediction per row in the dataframe\n", " \"\"\"\n", " sess = K.get_session()\n", " with sess.graph.as_default():\n", " clf = cls.get_model()\n", " return clf.predict(input)\n", "\n", "def transform_data(dataset):\n", " dataset = dataset.dropna()\n", " dataset = dataset.astype('float32')\n", " scaler = load(open(os.path.join(model_path, 'scaler.pkl'), 'rb'))\n", "\n", " # Feature Scaling\n", " dataset = scaler.fit_transform(dataset)\n", " return pd.DataFrame(dataset)\n", "\n", "\n", "# # The flask app for serving predictions\n", "# app = flask.Flask(__name__)\n", "\n", "\n", "# @app.route('/ping', methods=['GET'])\n", "# def ping():\n", "# \"\"\"\n", "# Determine if the container is working and healthy.\n", "# In this sample container, we declare it healthy if we can load the model\n", "# successfully.\n", "# \"\"\"\n", "\n", "# # Health check -- You can insert a health check here\n", "# health = ScoringService.get_model() is not None\n", "# status = 200 if health else 404\n", "# return flask.Response(\n", "# response='\\n',\n", "# status=status,\n", "# mimetype='application/json')\n", "# @app.route('/invocations', methods=['POST'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 11- Do inference" ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded model from disk\n" ] }, { "data": { "text/plain": [ "array([[0.6766174 ],\n", " [0.5600138 ],\n", " [0.692883 ],\n", " ...,\n", " [0.7234832 ],\n", " [0.60982114],\n", " [0.75358176]], dtype=float32)" ] }, "execution_count": 155, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def transformation():\n", " \"\"\"\n", " Do an inference on a single batch of data. 
In this sample server, we take\n", " data as CSV, convert it to a pandas data frame for internal use and then\n", " convert the predictions back to CSV (which really just means one prediction\n", " per line, since there's a single column.\n", " \"\"\"\n", " data = None\n", "\n", " # Convert from CSV to pandas\n", " s = '../test_mod.csv' # MODIFIED\n", " data = pd.read_csv(s, header=None)\n", " data = transform_data(data)\n", " # Do the prediction\n", " predictions = ScoringService.predict(data)\n", "\n", "# # Convert from numpy back to CSV\n", "# out = StringIO()\n", "# pd.DataFrame(predictions).to_csv(out, header=False, index=False)\n", "# result = out.getvalue()\n", "\n", " return predictions # MODIFIED\n", "# return result, predictions\n", "\n", "\n", "transformation()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 12- Inference function in docker: " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def transformation():\n", " \"\"\"\n", " Do an inference on a single batch of data. In this sample server, we take\n", " data as CSV, convert it to a pandas data frame for internal use and then\n", " convert the predictions back to CSV (which really just means one prediction\n", " per line, since there's a single column.\n", " \"\"\"\n", " data = None\n", "\n", " # Convert from CSV to pandas\n", " if flask.request.content_type == 'text/csv':\n", " data = flask.request.data.decode('utf-8')\n", " s = StringIO(data)\n", " data = pd.read_csv(s, header=None)\n", " data = transform_data(data)\n", " else:\n", " return flask.Response(response='This predictor only supports CSV data',status=415, mimetype='text/plain')\n", "\n", " print('Invoked with {} records'.format(data.shape[0]))\n", "\n", " # Do the prediction\n", " predictions = ScoringService.predict(data)\n", "\n", " # Convert from numpy back to CSV\n", " out = StringIO()\n", " pd.DataFrame(predictions).to_csv(out, header=False, index=False)\n", " result = out.getvalue()\n", "\n", " return flask.Response(response=result, status=200, mimetype='text/csv')\n" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Package Version \r\n", "---------------------------------- ----------\r\n", "absl-py 0.9.0 \r\n", "alabaster 0.7.10 \r\n", "anaconda-client 1.6.14 \r\n", "anaconda-project 0.8.2 \r\n", "asn1crypto 0.24.0 \r\n", "astor 0.8.1 \r\n", "astroid 1.6.3 \r\n", "astropy 3.0.2 \r\n", "attrs 18.1.0 \r\n", "Automat 0.3.0 \r\n", "autovizwidget 0.15.0 \r\n", "awscli 1.18.39 \r\n", "Babel 2.5.3 \r\n", "backcall 0.1.0 \r\n", "backports.shutil-get-terminal-size 1.0.0 \r\n", "bcrypt 3.1.7 \r\n", "beautifulsoup4 4.6.0 \r\n", "bitarray 0.8.1 \r\n", "bkcharts 0.2 \r\n", "blaze 0.11.3 \r\n", "bleach 2.1.3 \r\n", "bokeh 1.4.0 \r\n", "boto 2.48.0 \r\n", "boto3 1.12.39 \r\n", "botocore 1.15.39 \r\n", "Bottleneck 1.2.1 \r\n", "cached-property 1.5.1 \r\n", "certifi 2019.11.28\r\n", "cffi 1.11.5 \r\n", "characteristic 14.3.0 \r\n", "chardet 3.0.4 \r\n", "click 6.7 \r\n", "cloudpickle 0.5.3 \r\n", "clyent 1.2.2 \r\n", "colorama 0.3.9 \r\n", "contextlib2 0.5.5 \r\n", "cryptography 2.9 \r\n", "cycler 0.10.0 \r\n", "Cython 0.28.4 \r\n", "cytoolz 0.9.0.1 \r\n", "dask 1.2.2 \r\n", "datashape 0.5.4 \r\n", "decorator 4.3.0 \r\n", "defusedxml 0.6.0 \r\n", "distributed 1.28.1 \r\n", "docker 4.2.0 \r\n", "docker-compose 1.25.5 \r\n", "dockerpty 0.4.1 \r\n", "docopt 0.6.2 \r\n", "docutils 0.14 \r\n", "entrypoints 0.2.3 \r\n", "enum34 
1.1.9 \r\n", "environment-kernels 1.1.1 \r\n", "et-xmlfile 1.0.1 \r\n", "fastcache 1.0.2 \r\n", "filelock 3.0.4 \r\n", "Flask 1.0.2 \r\n", "Flask-Cors 3.0.4 \r\n", "future 0.18.2 \r\n", "gast 0.2.2 \r\n", "gevent 1.3.0 \r\n", "glob2 0.6 \r\n", "gmpy2 2.0.8 \r\n", "google-pasta 0.1.8 \r\n", "greenlet 0.4.13 \r\n", "grpcio 1.10.1 \r\n", "h5py 2.8.0 \r\n", "hdijupyterutils 0.15.0 \r\n", "heapdict 1.0.0 \r\n", "horovod 0.19.0 \r\n", "html5lib 1.0.1 \r\n", "hyperas 0.4.1 \r\n", "hyperopt 0.2.4 \r\n", "idna 2.6 \r\n", "imageio 2.3.0 \r\n", "imagesize 1.0.0 \r\n", "importlib-metadata 1.5.0 \r\n", "ipykernel 4.8.2 \r\n", "ipyparallel 6.2.2 \r\n", "ipython 6.4.0 \r\n", "ipython-genutils 0.2.0 \r\n", "ipywidgets 7.4.0 \r\n", "isort 4.3.4 \r\n", "itsdangerous 0.24 \r\n", "jdcal 1.4 \r\n", "jedi 0.12.0 \r\n", "Jinja2 2.10 \r\n", "jmespath 0.9.4 \r\n", "joblib 0.14.1 \r\n", "jsonschema 2.6.0 \r\n", "jupyter 1.0.0 \r\n", "jupyter-client 5.2.3 \r\n", "jupyter-console 5.2.0 \r\n", "jupyter-core 4.4.0 \r\n", "jupyterlab 0.32.1 \r\n", "jupyterlab-launcher 0.10.5 \r\n", "Keras 2.2.4 \r\n", "Keras-Applications 1.0.8 \r\n", "Keras-Preprocessing 1.1.0 \r\n", "kiwisolver 1.0.1 \r\n", "lazy-object-proxy 1.3.1 \r\n", "llvmlite 0.23.1 \r\n", "locket 0.2.0 \r\n", "lxml 4.2.1 \r\n", "Markdown 3.2.1 \r\n", "MarkupSafe 1.0 \r\n", "matplotlib 3.0.3 \r\n", "mccabe 0.6.1 \r\n", "mistune 0.8.3 \r\n", "mkl-fft 1.0.15 \r\n", "mkl-random 1.1.0 \r\n", "mkl-service 2.3.0 \r\n", "mock 4.0.1 \r\n", "more-itertools 4.1.0 \r\n", "mpmath 1.0.0 \r\n", "msgpack 0.6.0 \r\n", "msgpack-python 0.5.6 \r\n", "multipledispatch 0.5.0 \r\n", "nb-conda 2.2.1 \r\n", "nb-conda-kernels 2.2.2 \r\n", "nbconvert 5.4.1 \r\n", "nbformat 4.4.0 \r\n", "networkx 2.4 \r\n", "nltk 3.3 \r\n", "nose 1.3.7 \r\n", "notebook 5.5.0 \r\n", "numba 0.38.0 \r\n", "numexpr 2.7.1 \r\n", "numpy 1.16.4 \r\n", "numpydoc 0.8.0 \r\n", "odo 0.5.1 \r\n", "olefile 0.45.1 \r\n", "opencv-python 3.4.2.17 \r\n", "openpyxl 2.5.3 \r\n", "opt-einsum 3.1.0 \r\n", "packaging 20.1 \r\n", "pandas 0.24.2 \r\n", "pandocfilters 1.4.2 \r\n", "paramiko 2.7.1 \r\n", "parso 0.2.0 \r\n", "partd 0.3.8 \r\n", "path.py 11.0.1 \r\n", "pathlib2 2.3.2 \r\n", "patsy 0.5.0 \r\n", "pep8 1.7.1 \r\n", "pexpect 4.5.0 \r\n", "pickleshare 0.7.4 \r\n", "Pillow 5.4.1 \r\n", "pip 19.3.1 \r\n", "pkginfo 1.4.2 \r\n", "plotly 4.5.2 \r\n", "pluggy 0.6.0 \r\n", "ply 3.11 \r\n", "prompt-toolkit 1.0.15 \r\n", "protobuf 3.8.0 \r\n", "protobuf3-to-dict 0.1.5 \r\n", "psutil 5.4.5 \r\n", "psycopg2 2.7.5 \r\n", "ptyprocess 0.5.2 \r\n", "py 1.5.3 \r\n", "py4j 0.10.7 \r\n", "pyaml 20.4.0 \r\n", "pyasn1 0.4.8 \r\n", "pycodestyle 2.4.0 \r\n", "pycosat 0.6.3 \r\n", "pycparser 2.18 \r\n", "pycrypto 2.6.1 \r\n", "pycurl 7.43.0.2 \r\n", "pyflakes 1.6.0 \r\n", "pygal 2.4.0 \r\n", "Pygments 2.2.0 \r\n", "pykerberos 1.2.1 \r\n", "pylint 1.8.4 \r\n", "PyNaCl 1.3.0 \r\n", "pyodbc 4.0.23 \r\n", "pyOpenSSL 18.0.0 \r\n", "pyparsing 2.2.0 \r\n", "PySocks 1.6.8 \r\n", "pyspark 2.3.4 \r\n", "pytest 3.5.1 \r\n", "pytest-arraydiff 0.2 \r\n", "pytest-astropy 0.3.0 \r\n", "pytest-doctestplus 0.1.3 \r\n", "pytest-openfiles 0.3.0 \r\n", "pytest-remotedata 0.2.1 \r\n", "python-dateutil 2.7.3 \r\n", "pytz 2018.4 \r\n", "PyWavelets 0.5.2 \r\n", "PyYAML 5.3.1 \r\n", "pyzmq 17.0.0 \r\n", "QtAwesome 0.4.4 \r\n", "qtconsole 4.3.1 \r\n", "QtPy 1.4.1 \r\n", "requests 2.20.0 \r\n", "requests-kerberos 0.12.0 \r\n", "retrying 1.3.3 \r\n", "rope 0.10.7 \r\n", "rsa 3.4.2 \r\n", "ruamel-yaml 0.15.35 \r\n", "s3fs 0.1.5 \r\n", "s3transfer 0.3.3 \r\n", "sagemaker 
1.55.3 \r\n", "sagemaker-pyspark 1.3.0 \r\n", "scikit-image 0.13.1 \r\n", "scikit-learn 0.22.1 \r\n", "scikit-optimize 0.7.4 \r\n", "scipy 1.4.1 \r\n", "seaborn 0.8.1 \r\n", "Send2Trash 1.5.0 \r\n", "setuptools 45.2.0 \r\n", "simplegeneric 0.8.1 \r\n", "singledispatch 3.4.0.3 \r\n", "six 1.11.0 \r\n", "smdebug-rulesconfig 0.1.2 \r\n", "snowballstemmer 1.2.1 \r\n", "sortedcollections 0.6.1 \r\n", "sortedcontainers 1.5.10 \r\n", "sparkmagic 0.12.5 \r\n", "Sphinx 1.7.4 \r\n", "sphinxcontrib-websupport 1.0.1 \r\n", "spyder 3.2.8 \r\n", "SQLAlchemy 1.2.11 \r\n", "statsmodels 0.9.0 \r\n", "sympy 1.1.1 \r\n", "tables 3.4.3 \r\n", "TBB 0.1 \r\n", "tblib 1.3.2 \r\n", "tensorboard 1.15.0 \r\n", "tensorflow 1.15.2 \r\n", "tensorflow-estimator 1.15.1 \r\n", "tensorflow-serving-api 1.15.0 \r\n", "termcolor 1.1.0 \r\n", "terminado 0.8.1 \r\n", "testpath 0.3.1 \r\n", "texttable 1.6.2 \r\n", "toolz 0.9.0 \r\n", "tornado 5.0.2 \r\n", "tqdm 4.46.1 \r\n", "traitlets 4.3.2 \r\n", "typing 3.6.4 \r\n", "unicodecsv 0.14.1 \r\n", "urllib3 1.23 \r\n", "wcwidth 0.1.7 \r\n", "webencodings 0.5.1 \r\n", "websocket-client 0.57.0 \r\n", "Werkzeug 0.14.1 \r\n", "wheel 0.31.1 \r\n", "widgetsnbextension 3.4.2 \r\n", "wrapt 1.11.2 \r\n", "xlrd 1.1.0 \r\n", "XlsxWriter 1.0.4 \r\n", "xlwt 1.3.0 \r\n", "zict 0.1.3 \r\n", "zipp 3.0.0 \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: You are using pip version 19.3.1; however, version 20.1.1 is available.\r\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\r\n" ] } ], "source": [ "!pip list" ] } ], "metadata": { "kernelspec": { "display_name": "conda_tensorflow_p36", "language": "python", "name": "conda_tensorflow_p36" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 4 }