# Serve a TensorFlow lite model

In [1]:
import tensorflow
import tensorflow_hub as hub

import numpy as np
import tensorflow as tf

# Load TFLite model and allocate tensors.
interpreter = tf.lite.Interpreter(model_path="mobilenet_v1_1.0_224_quant.tflite")
interpreter.allocate_tensors()

# Get input and output tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Test model on random input data.
input_shape = input_details[0]['shape']
input_data = np.array(np.random.random_sample(input_shape), dtype=np.float32)
interpreter.set_tensor(input_details[0]['index'], input_data)

interpreter.invoke()

# The function `get_tensor()` returns a copy of the tensor data.
# Use `tensor()` in order to get a pointer to the tensor.
output_data = interpreter.get_tensor(output_details[0]['index'])
print(output_data)

ValueError: Cannot set tensor: Got tensor of type FLOAT32 but expected type UINT8 for input 88, name: input 

In [4]:
embeddings = embed([inputs])
print(embeddings)

tf.Tensor(
[[-3.13301645e-02 -6.33863360e-02 -1.60749946e-02 -1.03489561e-02
  -4.65009995e-02  3.72315571e-02  5.91584947e-03  7.17439875e-02
   1.66644827e-02  6.09076358e-02  6.65525720e-02  2.37051025e-02
   5.76475868e-04  5.68432137e-02  2.41616759e-02 -5.33628371e-03
   4.70477156e-02  1.92157198e-02  7.68255442e-02  5.66959940e-03
  -7.52822161e-02 -1.71372388e-02 -7.50271082e-02  7.63734803e-02
  -5.43796048e-02 -1.38910150e-03 -1.83018427e-02 -4.67203930e-02
  -4.72413860e-02  2.70678177e-02  3.23334038e-02  5.53706214e-02
   3.37095372e-02 -1.37066245e-02  5.52706001e-03 -8.22692588e-02
   1.41951097e-02  6.82791322e-02  1.83205083e-02 -2.14787442e-02
   4.14966866e-02 -2.02740021e-02 -6.01055054e-03  2.44824495e-02
  -8.84009004e-02 -2.56653987e-02 -3.83261517e-02 -5.61062619e-02
   4.68128063e-02  3.20312604e-02  7.72727579e-02 -8.25007185e-02
   5.45060262e-03  5.79300802e-03 -3.86942066e-02  2.90922675e-04
   6.13495857e-02  7.35033751e-02  5.46342283e-02 -8.05496648e-02

## Step 1 : Write a model transform script

#### Make sure you have a ...

- "load_model" function
    - input args are model path
    - returns loaded model object
    - model name is the same as what you saved the model file as (see above step)
<br><br>
- "predict" function
    - input args are the loaded model object and a payload
    - returns the result of model.predict
    - make sure you format it as a single (or multiple) string return inside a list for real time (for mini batch)
    - from a client, a list  or string or np.array that is sent for prediction is interpreted as bytes. Do what you have to for converting back to list or string or np.array
    - return the error for debugging


In [5]:
%%writefile modelscript_tensorflow.py
import tensorflow as tf
import numpy as np
import tensorflow_hub as hub
import json

#Return loaded model
def load_model(modelpath):
    model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4") 
    return model

# return prediction based on loaded model (from the step above) and an input payload
def predict(model, payload):
    try:
        if(type(payload) == str):
            data = [payload]
        else:
            data = [payload.decode()]# For Multi model endpoints -> [payload[0]['body'].decode()]
            
        out = np.asarray(model(data)).tolist()
    except Exception as e:
        out = str(e)
    return [json.dumps({'output':[out],'tfeager': tf.executing_eagerly()})]

Writing modelscript_tensorflow.py


## Does this work locally? (not "_in a container locally_", but _actually_ in local)

In [6]:
from modelscript_tensorflow import *
model = load_model('./') # path doesn't matter here since we're loading the model directly in the script

In [7]:
predict(model,inputs)

['{"output": [[[-0.0313301682472229, -0.06338633596897125, -0.016074996441602707, -0.010348955169320107, -0.04650101065635681, 0.03723156452178955, 0.005915849469602108, 0.07174398750066757, 0.01666448265314102, 0.06090763583779335, 0.06655257940292358, 0.023705095052719116, 0.0005764864035882056, 0.05684320256114006, 0.024161681532859802, -0.005336277186870575, 0.04704771563410759, 0.019215719774365425, 0.07682554423809052, 0.0056695942766964436, -0.07528221607208252, -0.01713723875582218, -0.07502710819244385, 0.07637347280979156, -0.05437960475683212, -0.0013891014968976378, -0.018301840871572495, -0.04672039672732353, -0.04724138602614403, 0.027067814022302628, 0.03233340382575989, 0.055370621383190155, 0.033709533512592316, -0.013706627301871777, 0.005527065135538578, -0.08226925879716873, 0.014195104129612446, 0.06827913224697113, 0.01832052320241928, -0.021478744223713875, 0.041496675461530685, -0.020274000242352486, -0.006010554730892181, 0.024482453241944313, -0.08840090036392

### ok great! Now let's install ezsmdeploy

_[To Do]_: currently local; replace with pip version!

In [9]:
!pip install ezsmdeploy

Collecting ezsmdeploy
  Using cached ezsmdeploy-0.1.5-py3-none-any.whl (22 kB)
Installing collected packages: ezsmdeploy
Successfully installed ezsmdeploy-0.1.5


In [10]:
import ezsmdeploy

#### If you have been running other inference containers in local mode, stop existing containers to avoid conflict

In [11]:
!docker container stop $(docker container ls -aq) >/dev/null

## Deploy locally

Large models take longer to download and deploy (check TF hub source code to check. Also, keep in mind that hub models are downloaded in each worker; TF hub will recognize that all workers are set to download the same model and will not repeat the download; it will instead give you a _already being downloaded by "worker id"_. Since it takes longer to deploy, initial health pings may also fail. But make sure you see a "Deployed!" and make predictions before proceeding. 

In [12]:
ez = ezsmdeploy.Deploy(model = None, #Since we are loading a model from TF hub
                  script = 'modelscript_tensorflow.py',
                  requirements = ['numpy','tensorflow-gpu==2.0.0','tensorflow_hub'], #or pass in the path to requirements.txt
                  instance_type = 'local_gpu', #... if you intend to deploy on GPU with the flask-ngnix stack
                  wait = True)

[K0:00:00.003290 | No model was passed. Assuming you are downloading a model in the script or in the container
[K0:00:00.108036 | uploaded model tarball(s) ; check returned modelpath
[K0:00:00.108894 | added requirements file
[K0:00:00.110766 | added source file
[K0:00:00.112247 | added Dockerfile
[K0:00:00.114261 | added model_handler and docker utils
[K0:00:00.114350 | building docker container
[K0:02:40.172127 | built docker container
[K0:02:40.286748 | created model(s). Now deploying on local_gpu
[32m∙∙∙[0m [K



[32m∙∙∙[0m [KAttaching to tmpfd5boyt9_algo-1-qmeju_1
[36malgo-1-qmeju_1  |[0m Starting the inference server with 32 workers.
[32m∙●∙[0m [K[36malgo-1-qmeju_1  |[0m [2020-04-23 23:26:07 +0000] [9] [INFO] Starting gunicorn 20.0.4
[36malgo-1-qmeju_1  |[0m [2020-04-23 23:26:07 +0000] [9] [INFO] Listening at: unix:/tmp/gunicorn.sock (9)
[36malgo-1-qmeju_1  |[0m [2020-04-23 23:26:07 +0000] [9] [INFO] Using worker: gevent
[36malgo-1-qmeju_1  |[0m [2020-04-23 23:26:07 +0000] [13] [INFO] Booting worker with pid: 13
[36malgo-1-qmeju_1  |[0m [2020-04-23 23:26:07 +0000] [14] [INFO] Booting worker with pid: 14
[36malgo-1-qmeju_1  |[0m [2020-04-23 23:26:07 +0000] [15] [INFO] Booting worker with pid: 15
[32m∙∙●[0m [K[36malgo-1-qmeju_1  |[0m [2020-04-23 23:26:07 +0000] [16] [INFO] Booting worker with pid: 16
[36malgo-1-qmeju_1  |[0m [2020-04-23 23:26:07 +0000] [17] [INFO] Booting worker with pid: 17
[36malgo-1-qmeju_1  |[0m [2020-04-23 23:26:07 +0000] [19] [INFO] Booting wo

[32m∙●∙[0m [K[36malgo-1-qmeju_1  |[0m [2020-04-23 23:34:08 +0000] [1274] [INFO] Booting worker with pid: 1274
[32m∙●∙[0m [K[36malgo-1-qmeju_1  |[0m [2020-04-23 23:35:24 +0000] [9] [CRITICAL] WORKER TIMEOUT (pid:583)
[36malgo-1-qmeju_1  |[0m 2020/04/23 23:35:24 [error] 10#10: *15 upstream prematurely closed connection while reading response header from upstream, client: 172.27.0.1, server: , request: "GET /ping HTTP/1.1", upstream: "http://unix:/tmp/gunicorn.sock:/ping", host: "localhost:8080"
[36malgo-1-qmeju_1  |[0m 172.27.0.1 - - [23/Apr/2020:23:35:24 +0000] "GET /ping HTTP/1.1" 502 182 "-" "-"
[36malgo-1-qmeju_1  |[0m [2020-04-23 23:35:24 +0000] [583] [INFO] Worker exiting (pid: 583)
[32m∙∙∙[0m [K[36malgo-1-qmeju_1  |[0m [2020-04-23 23:35:25 +0000] [1308] [INFO] Booting worker with pid: 1308
[32m●∙∙[0m [K[36malgo-1-qmeju_1  |[0m [2020-04-23 23:36:33 +0000] [9] [CRITICAL] WORKER TIMEOUT (pid:1274)
[36malgo-1-qmeju_1  |[0m 2020/04/23 23:36:33 [error] 10#10:

## Test containerized version locally

Since you are downloading this model from a hub, the first time you invoke it will be slow, so invoke again to get an inference without all of the container logs. Prediction will especially be slow if your model is still downloading!

In [13]:
out = ez.predictor.predict(inputs.encode()).decode()
out

[36malgo-1-qmeju_1  |[0m received input data
[36malgo-1-qmeju_1  |[0m b'The quick brown fox jumps over the lazy dog.'
[36malgo-1-qmeju_1  |[0m 2020-04-23 23:41:23.248410: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
[36malgo-1-qmeju_1  |[0m 2020-04-23 23:41:23.248450: E tensorflow/stream_executor/cuda/cuda_driver.cc:318] failed call to cuInit: UNKNOWN ERROR (303)
[36malgo-1-qmeju_1  |[0m 2020-04-23 23:41:23.248487: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:163] no NVIDIA GPU device is present: /dev/nvidia0 does not exist
[36malgo-1-qmeju_1  |[0m 2020-04-23 23:41:23.248690: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
[36malgo-1-qmeju_1  |[0m 2020-04-23 23:41:23.276428: I tensorflow/core/platform/profile_utils/cpu_uti

'{"tfeager": true, "output": [[[-0.031330183148384094, -0.06338634341955185, -0.016074996441602707, -0.010348981246352196, -0.046500977128744125, 0.03723153844475746, 0.005915854126214981, 0.07174400240182877, 0.016664467751979828, 0.060907647013664246, 0.06655259430408478, 0.023705121129751205, 0.0005764692323282361, 0.05684323608875275, 0.024161657318472862, -0.00533629534766078, 0.04704771935939789, 0.019215712323784828, 0.07682554423809052, 0.005669617559760809, -0.07528220862150192, -0.017137235030531883, -0.07502710819244385, 0.07637348026037216, -0.054379601031541824, -0.0013890593545511365, -0.018301844596862793, -0.04672040790319443, -0.047241389751434326, 0.02706781215965748, 0.03233340010046959, 0.055370621383190155, 0.03370954468846321, -0.013706635683774948, 0.005527033936232328, -0.08226925879716873, 0.01419509295374155, 0.06827915459871292, 0.018320485949516296, -0.021478744223713875, 0.041496679186820984, -0.020274005830287933, -0.006010557524859905, 0.02448243275284767

[36malgo-1-qmeju_1  |[0m ['{"tfeager": true, "output": [[[-0.031330183148384094, -0.06338634341955185, -0.016074996441602707, -0.010348981246352196, -0.046500977128744125, 0.03723153844475746, 0.005915854126214981, 0.07174400240182877, 0.016664467751979828, 0.060907647013664246, 0.06655259430408478, 0.023705121129751205, 0.0005764692323282361, 0.05684323608875275, 0.024161657318472862, -0.00533629534766078, 0.04704771935939789, 0.019215712323784828, 0.07682554423809052, 0.005669617559760809, -0.07528220862150192, -0.017137235030531883, -0.07502710819244385, 0.07637348026037216, -0.054379601031541824, -0.0013890593545511365, -0.018301844596862793, -0.04672040790319443, -0.047241389751434326, 0.02706781215965748, 0.03233340010046959, 0.055370621383190155, 0.03370954468846321, -0.013706635683774948, 0.005527033936232328, -0.08226925879716873, 0.01419509295374155, 0.06827915459871292, 0.018320485949516296, -0.021478744223713875, 0.041496679186820984, -0.020274005830287933, -0.00601055752

In [14]:
!docker container stop $(docker container ls -aq) >/dev/null

[36malgo-1-qmeju_1  |[0m [2020-04-23 23:41:35 +0000] [9] [INFO] Handling signal: term
[36mtmpfd5boyt9_algo-1-qmeju_1 exited with code 0
[0mAborting on container exit...


## Deploy on SageMaker

In [15]:
ezonsm = ezsmdeploy.Deploy(model = None, #Since we are loading a model from TF hub,
                  script = 'modelscript_tensorflow.py',
                  requirements = ['numpy','tensorflow-gpu==2.0.0','tensorflow_hub'],
                  wait = True,
                  instance_type = 'ml.p3.2xlarge',
                  monitor = True) # turn on model monitoring 

[K0:00:00.003170 | No model was passed. Assuming you are downloading a model in the script or in the container
[K0:00:00.077102 | uploaded model tarball(s) ; check returned modelpath
[K0:00:00.078005 | added requirements file
[K0:00:00.080012 | added source file
[K0:00:00.081448 | added Dockerfile
[K0:00:00.083687 | added model_handler and docker utils
[K0:00:00.083777 | building docker container
[K0:01:19.316520 | built docker container
[K0:01:19.422949 | created model(s). Now deploying on ml.p3.2xlarge
[K0:10:51.965824 | deployed model-!
[K0:10:51.966408 | estimated cost is $4.627 per hour
[K0:10:51.967389 | model monitor data capture location is s3://sagemaker-us-east-1-497456752804/ezsmdeploy/model-anzlpxc6eltjicelbcb5ho/datacapture
[K[32m0:10:51.967497 | Done! ✔[0m 


In [16]:
out = ezonsm.predictor.predict(inputs).decode()
out

'{"output": [[[-0.031330183148384094, -0.06338634341955185, -0.016074996441602707, -0.010348981246352196, -0.046500977128744125, 0.03723153844475746, 0.005915854126214981, 0.07174400240182877, 0.016664467751979828, 0.060907647013664246, 0.06655259430408478, 0.023705121129751205, 0.0005764692323282361, 0.05684323608875275, 0.024161657318472862, -0.00533629534766078, 0.04704771935939789, 0.019215712323784828, 0.07682554423809052, 0.005669617559760809, -0.07528220862150192, -0.017137235030531883, -0.07502710819244385, 0.07637348026037216, -0.054379601031541824, -0.0013890593545511365, -0.018301844596862793, -0.04672040790319443, -0.047241389751434326, 0.02706781215965748, 0.03233340010046959, 0.055370621383190155, 0.03370954468846321, -0.013706635683774948, 0.005527033936232328, -0.08226925879716873, 0.01419509295374155, 0.06827915459871292, 0.018320485949516296, -0.021478744223713875, 0.041496679186820984, -0.020274005830287933, -0.006010557524859905, 0.02448243275284767, -0.088400892913

In [17]:
ezonsm.predictor.delete_endpoint()