# Deploy pre-trained HF model extending the PyTorch 1.8.1 DL inference container

In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# ECR repository (namespace + image name) that will hold the custom serving image.
ecr_namespace = 'huggingface/'
ecr_image_name = 'huggingface-pytorch-serving-container'
ecr_repository_name = ecr_namespace + ecr_image_name

# The account ID is taken as the fifth colon-separated field of the role string;
# this assumes `role` is an ARN shaped like arn:aws:iam::<account-id>:role/<name>.
role = get_execution_role()
account_id = role.split(':')[4]
region = boto3.Session().region_name

sagemaker_session = sagemaker.session.Session()
bucket = sagemaker_session.default_bucket()

# S3 key prefix under which the model archive is uploaded. Kept separate from the
# ECR image name above so that `prefix` has a single meaning throughout the notebook.
prefix = 'hfdeploypytorch-extend'

# Local directory used as the Hugging Face download cache.
hf_cache_dir = 'hf_cache_dir/'

print(account_id)
print(region)
print(role)
print(bucket)

# Build container

In [None]:
# Print docker/Dockerfile through pygmentize so it can be reviewed before building.
! pygmentize docker/Dockerfile

In [None]:
# Print scripts/build_and_push.sh through pygmentize so it can be reviewed before running it.
! pygmentize scripts/build_and_push.sh

In [None]:
# Run the build script with three positional arguments: account ID, region and
# ECR repository name (the Python variables are expanded by IPython).
# NOTE(review): the script presumably builds the image and pushes it to ECR -
# confirm against scripts/build_and_push.sh.
! /bin/sh scripts/build_and_push.sh $account_id $region $ecr_repository_name

# Download model from HF and save to Amazon S3

In [None]:
pip install transformers==4.5.1

In [None]:
# Create the local Hugging Face cache directory (-p: no error if it already exists).
! mkdir -p $hf_cache_dir

In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

# Identifier of the pre-trained checkpoint to download.
PRE_TRAINED_MODEL_NAME = 'facebook/bart-large-cnn'

# Download through a dedicated cache directory rather than the default
# Hugging Face cache locations, which might fill up the root disk.
model = BartForConditionalGeneration.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    cache_dir=hf_cache_dir,
)

# Write the model files to the directory that is later packaged into model.tar.gz.
model.save_pretrained('./models/bart_model/')

In [None]:
# Download the tokenizer through the same dedicated cache directory used for the
# model, so that nothing lands in the default Hugging Face cache on the root disk.
tokenizer = BartTokenizer.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    cache_dir=hf_cache_dir,
)

# Write the tokenizer files next to the saved model for packaging.
tokenizer.save_pretrained('./models/bart_tokenizer/')

In [None]:
# Package the saved model and tokenizer directories into a gzip-compressed
# model.tar.gz. `-C models/` makes tar add them relative to models/, so
# bart_model/ and bart_tokenizer/ sit at the root of the archive.
!tar -C models/ -cvzf model.tar.gz bart_model/ bart_tokenizer/

In [None]:
from sagemaker.s3 import S3Uploader
# Upload the packaged archive below s3://<bucket>/<prefix>/model and keep the
# value returned by the uploader for use as the model's `model_data`.
model_artifact = S3Uploader.upload(
    'model.tar.gz',
    's3://{0}/{1}/model'.format(bucket, prefix),
)
print(model_artifact)

# Deploy model

In [None]:
# Compose the URI of the `latest` tag of the custom image in this account's
# ECR registry for the current region.
container_image_uri = '{0}.dkr.ecr.{1}.amazonaws.com/{2}:latest'.format(
    account_id,
    region,
    ecr_repository_name,
)
print(container_image_uri)

In [None]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

class Summarizer(Predictor):
    """Predictor that sends requests as JSON and decodes JSON responses."""

    def __init__(self, endpoint_name, sagemaker_session):
        """Bind the predictor to an endpoint with JSON (de)serialization.

        Args:
            endpoint_name: Name of the endpoint to invoke.
            sagemaker_session: Session object forwarded to ``Predictor``.
        """
        super().__init__(
            endpoint_name,
            sagemaker_session=sagemaker_session,
            serializer=JSONSerializer(),
            deserializer=JSONDeserializer(),
        )

In [None]:
from sagemaker import Model

# Describe the model to deploy: the custom serving image, the uploaded model
# archive, and the predictor class handed back by deploy().
hf_model = Model(
    image_uri=container_image_uri,
    model_data=model_artifact,
    role=role,
    sagemaker_session=sagemaker_session,
    predictor_cls=Summarizer,
    # NOTE(review): 'predict' presumably names the inference entry point
    # inside the container - confirm against the Dockerfile / serving code.
    env={'SAGEMAKER_PROGRAM': 'predict'},
)

In [None]:
# Deploy the model on a single ml.m5.4xlarge instance; because the model was
# built with predictor_cls=Summarizer, the result is used as the predictor below.
predictor = hf_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.4xlarge',
)
# Bare expression so the notebook displays the predictor object.
predictor

# Test inference

In [None]:
# Read the sample article as UTF-8 (explicit, so the result does not depend on
# the platform's default encoding) and flatten it onto a single line.
with open('article.txt', encoding='utf-8') as f:
    content = f.read().replace('\n', ' ')

# Request payload: the whole article under the "text" key. Built directly from
# `content`; the former "{0}".format(content) round-trip produced the same string.
json_request_data = {"text": content}

# Bare expression so the notebook displays the payload.
json_request_data

In [None]:
%%time
# Send the request through the predictor and print the returned value; the
# %%time cell magic above reports how long the cell takes to run.
prediction = predictor.predict(json_request_data)
print(prediction)

# Delete endpoint

In [None]:
# Tear down the endpoint behind `predictor` once testing is finished.
predictor.delete_endpoint()