In [None]:
import sagemaker
from sagemaker.mxnet.model import MXNetModel
from sagemaker import get_execution_role

In [None]:
# --- Deployment configuration -------------------------------------------
# Location of the packaged model artifact; replace the placeholder with your
# own bucket name before running.
model_data = 's3://<your Amazon S3 bucket name>/gpt2-model/model.tar.gz'

# Inference script handed to the model as its entry point.
entry_point = './gpt2-inference.py'

# --- AWS context --------------------------------------------------------
# Execution role obtained from the SageMaker SDK; passed to the model below.
role = get_execution_role()

# SageMaker session object for this notebook.
sagemaker_session = sagemaker.Session()

## Define the MXNetModel

In [None]:
# Describe the model to SageMaker: where the artifact lives, which script
# serves it, and which container image runs it.
#
# Fix: the `image` placeholder string was missing its closing quote, which
# made this whole cell a SyntaxError.
#
# NOTE(review): `image=` is the SageMaker Python SDK v1 keyword; SDK v2
# renamed it to `image_uri` -- confirm which SDK version this notebook uses.
mxnet_model = MXNetModel(model_data=model_data,
                         role=role,
                         entry_point=entry_point,
                         py_version='py3',
                         framework_version='1.6.0',
                         # Replace the placeholder with the URI of your container image.
                         image='<image uri of the container image>',
                         model_server_workers=2
                        )

## Deploy model endpoint

In [None]:
# Create a real-time endpoint for the model on a single ml.c5.large instance
# and keep the returned predictor for later cells.
predictor = mxnet_model.deploy(
    initial_instance_count=1,
    instance_type='ml.c5.large',
)

# Show the endpoint name so it can be pasted into the performance-test cell.
print(predictor.endpoint)

## Run a simple performance test

In [None]:
# Simple latency check against an already-deployed endpoint.
#
# This cell re-imports what it needs and builds its own predictor from the
# endpoint name, so it can be run on its own without the deployment cells.
import sagemaker
from sagemaker.mxnet.model import MXNetPredictor

sagemaker_session = sagemaker.Session()

endpoint_name = '<ENDPOINT 이름>'
predictor = MXNetPredictor(endpoint_name, sagemaker_session)

input_sentence = '아기 공룡 둘리는 희동이와'

# Total number of requests to send; the first one is a warm-up and is not
# included in the statistics.
num_requests = 20

pred_latency_sum = 0   # sum of reported latencies over measured requests
pred_count_sum = 0     # sum of reported token counts over measured requests
pred_cnt = 0           # number of measured (successful, non-warm-up) requests
predicted_sentence = None

for i in range(num_requests):
    try:
        pred_out = predictor.predict(input_sentence)
        if i == 0:
            # Warm-up request: discard its result.
            continue

        # NOTE(review): assumes the inference script returns
        # [sentence, token_count, latency] -- confirm against gpt2-inference.py.
        predicted_sentence = pred_out[0]
        predict_count = pred_out[1]
        predict_latency = pred_out[2]

        pred_latency_sum += predict_latency
        # Fix: was `=+`, which overwrote the total with the last count
        # instead of accumulating.
        pred_count_sum += predict_count
        pred_cnt += 1
    except Exception as err:
        # Best effort: report the failed request and move on. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print('Request {} failed and is ignored: {!r}'.format(i, err))

if pred_cnt == 0:
    # Every measured request failed; avoid ZeroDivisionError / NameError below.
    print('No successful predictions; nothing to report.')
else:
    avg_latency = pred_latency_sum / pred_cnt
    avg_count = pred_count_sum / pred_cnt
    # Per-token latency is total latency over total tokens generated.
    avg_latency_per_inf = (
        pred_latency_sum / pred_count_sum if pred_count_sum else float('nan')
    )

    print('Input sentence: {}'.format(input_sentence))
    print('Predicted sentence: {}'.format(predicted_sentence))
    print('Average number of inferenced token: {:.2f}'.format(avg_count))
    print('Average inference latency for a sentence completion: {:.2f}'.format(avg_latency))
    print('Average inference latency per a token: {:.2f}\n'.format(avg_latency_per_inf))

## Clean up

In [None]:
# Tear down the resources behind `predictor`: first the endpoint, then the model.
# NOTE(review): `predictor` is whichever cell assigned it last (deploy cell or
# performance-test cell) -- confirm it points at the endpoint you intend to remove.
predictor.delete_endpoint()
predictor.delete_model()