# Deploy Models

This notebook deploys two large language models (LLMs): one for generating embeddings and another for a question answering task. These models are then used in the `embed_data.ipynb` notebook and the `qa-w-rag-finetuned-llm` Lambda function.

Specifically, it deploys the [`FLAN-T5 XXL`](https://huggingface.co/google/flan-t5-xxl) model as the LLM that generates responses to questions, and the [`GPT-J-6B`](https://huggingface.co/EleutherAI/gpt-j-6b) model as the LLM that generates embeddings.

In [1]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel rather than whatever `pip` the shell finds.
%pip install --upgrade sagemaker --quiet

In [2]:
import sys
import time
import logging
import sagemaker, boto3, json
from sagemaker.model import Model
from sagemaker.session import Session
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base
from sagemaker import image_uris, model_uris, script_uris, hyperparameters

In [3]:
# Notebook-wide constants.
# APP_NAME is the prefix used when endpoint names are generated.
APP_NAME = "qa-w-rag"
# EMBEDDING_MODEL is the model_id used to pick the embedding entry out of the
# model configuration list once deployment has finished.
EMBEDDING_MODEL = "huggingface-textembedding-gpt-j-6b"

In [4]:
# Configure root logging: comma-separated records (timestamp, module, process
# name, level, message) at INFO level, written to stderr.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.INFO,
    format='%(asctime)s,%(module)s,%(processName)s,%(levelname)s,%(message)s',
)
# The root logger is shared by every cell in this notebook.
logger = logging.getLogger()

In [5]:
# A single SageMaker session serves the whole notebook: the execution role and
# the region are both read from it instead of constructing additional
# sagemaker/boto3 sessions for the same information.
sagemaker_session = Session()
sess = sagemaker_session  # alias kept for any code that refers to `sess`
aws_role = sagemaker_session.get_caller_identity_arn()
aws_region = sagemaker_session.boto_region_name
# NOTE(review): not referenced by the visible cells; every MODEL_CONFIG_LIST
# entry carries its own "model_version". Kept in case other code reads it.
model_version = "*"
logger.info(f"aws_role={aws_role}, aws_region={aws_region}")

2023-05-02 14:22:28,427,credentials,MainProcess,INFO,Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
2023-05-02 14:22:28,766,credentials,MainProcess,INFO,Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
2023-05-02 14:22:28,872,232507896,MainProcess,INFO,aws_role=arn:aws:iam::015469603702:role/SageMakerRepoRole, aws_region=us-east-1


In [6]:
def parse_response_model_flan_t5(query_response):
    """Extract the generated text from a FLAN-T5 endpoint response.

    Args:
        query_response: Mapping whose ``"Body"`` entry exposes ``read()`` and
            yields a JSON document containing a ``"generated_texts"`` key
            (presumably the result of invoking the deployed endpoint --
            confirm against the caller).

    Returns:
        The value stored under ``"generated_texts"`` in the decoded payload.
    """
    raw_payload = query_response["Body"].read()
    return json.loads(raw_payload)["generated_texts"]

In [7]:
# Deployment settings, one dict per model. The deployment loop reads:
#   model_id / model_version       -> which container image and model artifacts to retrieve
#   instance_type / instance_count -> hosting capacity passed to deploy()
#   env                            -> environment variables set on the model
#   predictor_cls                  -> optional (looked up with .get()); may be omitted
MODEL_CONFIG_LIST = [
    # Text-to-text model that answers the questions.
    {
        "model_id": "huggingface-text2text-flan-t5-xxl",
        "model_version": "*",
        "instance_type": "ml.g5.12xlarge",
        "instance_count": 1,
        "env": {"TS_DEFAULT_WORKERS_PER_MODEL": "1"},
        "predictor_cls": Predictor,
    },
    # Embedding model; its model_id matches the EMBEDDING_MODEL constant.
    {
        "model_id": "huggingface-textembedding-gpt-j-6b",
        "model_version": "*",
        "instance_type": "ml.g5.24xlarge",
        "instance_count": 1,
        "env": {"TS_DEFAULT_WORKERS_PER_MODEL": "2"},
    },
]

In [8]:
# ANSI escape sequences used to highlight the "deployed successfully" log line.
newline, bold, unbold = "\n", "\033[1m", "\033[0m"

# Deploy every entry of MODEL_CONFIG_LIST to its own endpoint and record the
# generated endpoint name back on the entry so later cells can look it up.
for model in MODEL_CONFIG_LIST:
    # Re-running this cell in the same kernel must not create (and bill for)
    # duplicate GPU endpoints: entries that already carry an endpoint name were
    # deployed by a previous run. Re-run the config cell to force a redeploy.
    if model.get("endpoint_name"):
        logger.info(f"model={model['model_id']} is already deployed at endpoint={model['endpoint_name']}, skipping")
        continue

    # perf_counter() is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments during the multi-minute deployment.
    start = time.perf_counter()
    endpoint_name = name_from_base(f"{APP_NAME}-{model['model_id']}")
    logger.info(f"going to deploy model={model}, endpoint_name={endpoint_name}")

    # Retrieve the inference container image uri for this model.
    deploy_image_uri = image_uris.retrieve(
        region=None,
        framework=None,  # automatically inferred from model_id
        image_scope="inference",
        model_id=model['model_id'],
        model_version=model['model_version'],
        instance_type=model['instance_type'],
    )
    # Retrieve the uri of the model artifacts.
    model_uri = model_uris.retrieve(
        model_id=model['model_id'],
        model_version=model['model_version'],
        model_scope="inference",
    )
    logger.info(f"deploy_image_uri={deploy_image_uri}, model_uri={model_uri}")

    # The SageMaker model and its endpoint share the same generated name.
    model_inference = Model(
        image_uri=deploy_image_uri,
        model_data=model_uri,
        role=aws_role,
        predictor_cls=model.get("predictor_cls"),  # None when the entry omits it
        name=endpoint_name,
        env=model['env'],
    )
    model_predictor_inference = model_inference.deploy(
        initial_instance_count=model['instance_count'],
        instance_type=model['instance_type'],
        predictor_cls=model.get("predictor_cls"),
        endpoint_name=endpoint_name,
    )
    time_taken = time.perf_counter() - start
    logger.info(f"{bold}model={model['model_id']} has been deployed successfully at endpoint={endpoint_name}, took {time_taken}seconds{unbold}{newline}")
    # Only recorded after deploy() returned, i.e. the endpoint exists.
    model["endpoint_name"] = endpoint_name

2023-05-02 14:22:28,893,2580055255,MainProcess,INFO,going to deploy model={'model_id': 'huggingface-text2text-flan-t5-xxl', 'model_version': '*', 'instance_type': 'ml.g5.12xlarge', 'instance_count': 1, 'env': {'TS_DEFAULT_WORKERS_PER_MODEL': '1'}, 'predictor_cls': }, endpoint_name=qa-w-rag-huggingface-text2text-flan-t5--2023-05-02-14-22-28-892
2023-05-02 14:22:28,928,credentials,MainProcess,INFO,Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
2023-05-02 14:22:29,436,2580055255,MainProcess,INFO,deploy_image_uri=763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.12.0-gpu-py38, model_uri=s3://jumpstart-cache-prod-us-east-1/huggingface-infer/prepack/v1.0.2/infer-prepack-huggingface-text2text-flan-t5-xxl.tar.gz
2023-05-02 14:22:29,532,session,MainProcess,INFO,Creating model with name: qa-w-rag-huggingface-text2text-flan-t5--2023-05-02-14-22-28-892
2023-05-02 14:22:30,448,session,MainProcess,INFO,Creating endpoint-config with name qa-w-rag-huggingface-text2

-------------!

2023-05-02 14:29:33,163,2580055255,MainProcess,INFO,[1mmodel=huggingface-text2text-flan-t5-xxl has been deployed successfully at endpoint=qa-w-rag-huggingface-text2text-flan-t5--2023-05-02-14-22-28-892, took 424.27027797698975seconds[0m

2023-05-02 14:29:33,163,2580055255,MainProcess,INFO,going to deploy model={'model_id': 'huggingface-textembedding-gpt-j-6b', 'model_version': '*', 'instance_type': 'ml.g5.24xlarge', 'instance_count': 1, 'env': {'TS_DEFAULT_WORKERS_PER_MODEL': '2'}}, endpoint_name=qa-w-rag-huggingface-textembedding-gpt--2023-05-02-14-29-33-163
2023-05-02 14:29:33,325,2580055255,MainProcess,INFO,deploy_image_uri=763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.12.0-gpu-py38, model_uri=s3://jumpstart-cache-prod-us-east-1/huggingface-infer/prepack/v1.0.0/infer-prepack-huggingface-textembedding-gpt-j-6b.tar.gz
2023-05-02 14:29:33,353,session,MainProcess,INFO,Creating model with name: qa-w-rag-huggingface-textembedding-gpt--2023-05-02-14-29-33-163
2023-05-0

-----------!

2023-05-02 14:35:36,697,2580055255,MainProcess,INFO,[1mmodel=huggingface-textembedding-gpt-j-6b has been deployed successfully at endpoint=qa-w-rag-huggingface-textembedding-gpt--2023-05-02-14-29-33-163, took 363.5339434146881seconds[0m



In [9]:
# Find the endpoint that serves the embedding model so its name can be handed
# to other notebooks.
embedding_model_endpoint_name = None
for model in MODEL_CONFIG_LIST:
    if model['model_id'] == EMBEDDING_MODEL:
        # .get(): "endpoint_name" only exists once the deployment cell has
        # finished for this entry.
        embedding_model_endpoint_name = model.get('endpoint_name')
        break

# Fail here rather than silently passing None on to downstream consumers.
if embedding_model_endpoint_name is None:
    raise ValueError(
        f"no deployed endpoint found for EMBEDDING_MODEL={EMBEDDING_MODEL}; "
        "check MODEL_CONFIG_LIST and run the deployment cell first"
    )
logger.info(f"EMBEDDING_MODEL={EMBEDDING_MODEL}, embedding_model_endpoint_name={embedding_model_endpoint_name}")


2023-05-02 14:35:36,705,4061947677,MainProcess,INFO,EMBEDDING_MODEL=huggingface-textembedding-gpt-j-6b, embedding_model_endpoint_name=qa-w-rag-huggingface-textembedding-gpt--2023-05-02-14-29-33-163


In [10]:
# Persist the endpoint name with IPython's %store so that another notebook can
# restore it via `%store -r embedding_model_endpoint_name`
# (presumably embed_data.ipynb -- confirm against that notebook).
%store embedding_model_endpoint_name

Stored 'embedding_model_endpoint_name' (str)
