{ "cells": [ { "cell_type": "markdown", "id": "d245b130-4eaf-4fa1-b7f8-7eb5e1e9a7eb", "metadata": { "tags": [] }, "source": [ "# Deploy Flan-T5 XXL on SageMaker\n", "\n", "Now, we will deploy the model on SageMaker realtime endpoint, which is also trained on SageMaker with deepspeed on multiple nodes." ] }, { "cell_type": "code", "execution_count": null, "id": "ea489d40-d76e-4f19-a8b0-5755f7b1f23d", "metadata": { "tags": [] }, "outputs": [], "source": [ "import sagemaker\n", "import boto3\n", "\n", "sess = sagemaker.Session()\n", "role = sagemaker.get_execution_role()\n", "\n", "print(f\"sagemaker role arn: {role}\")\n", "print(f\"sagemaker bucket: {sess.default_bucket()}\")\n", "print(f\"sagemaker session region: {sess.boto_region_name}\")\n" ] }, { "cell_type": "markdown", "id": "7b79706d-a346-4072-878e-4e1205e4a4f5", "metadata": { "tags": [] }, "source": [ "We trained the Flan-T5-XXL, and the model is saved as BF16 format. We will use Huggingface accelerate to speed up the model inference. " ] }, { "cell_type": "code", "execution_count": null, "id": "b30d0c87-d3f7-4689-829e-d6b02fe9f60f", "metadata": { "tags": [] }, "outputs": [], "source": [ "!mkdir deploy_code" ] }, { "cell_type": "code", "execution_count": null, "id": "572ed109-920f-4d6f-8215-e9f3e95e9718", "metadata": { "tags": [] }, "outputs": [], "source": [ "%%writefile deploy_code/requirements.txt\n", "accelerate==0.16.0\n", "transformers==4.26.0\n", "bitsandbytes==0.37.0" ] }, { "cell_type": "markdown", "id": "1fd6d42e-beb4-4632-a3f2-cb2f483e560e", "metadata": { "tags": [] }, "source": [ "Now, we use Huggingface accelerate to speed up the model inference (configure \"engine\" to \"Python\"). And we will use g5.48xlarge (which has 8 GPUs) to deploy, so option.tensor_parallel_degree is set to 8. Finally, please configure the option.s3url to your model assets' S3 path, the suffix '/' is a must (such as s3://your_bucket/flan-t5-xxl/model/)." ] }, { "cell_type": "code", "execution_count": null, "id": "3d366acc-b7cc-492b-952b-42db3d6ac75c", "metadata": { "tags": [] }, "outputs": [], "source": [ "%%writefile deploy_code/serving.properties\n", "engine=Python\n", "option.tensor_parallel_degree=8\n", "option.s3url=s3://your_bucket/flan-t5-xxl/model/" ] }, { "cell_type": "code", "execution_count": null, "id": "7e35c899-9c1a-443b-89b0-6f4ed27593c5", "metadata": { "tags": [] }, "outputs": [], "source": [ "%%writefile deploy_code/model.py\n", "from djl_python import Input, Output\n", "import torch\n", "import logging\n", "import math\n", "import os\n", "from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer\n", "\n", "\n", "def load_model(properties):\n", " tensor_parallel = properties[\"tensor_parallel_degree\"]\n", " model_location = properties['model_dir']\n", " if \"model_id\" in properties:\n", " model_location = properties['model_id']\n", " logging.info(f\"Loading model in {model_location}\")\n", " \n", " tokenizer = AutoTokenizer.from_pretrained(model_location)\n", " \n", " model = AutoModelForSeq2SeqLM.from_pretrained(\n", " model_location, \n", " device_map=\"balanced_low_0\", \n", " #load_in_8bit=True\n", " )\n", " model.requires_grad_(False)\n", " model.eval()\n", " \n", " return model, tokenizer\n", "\n", "\n", "model = None\n", "tokenizer = None\n", "generator = None\n", "\n", "\n", "def handle(inputs: Input):\n", " global model, tokenizer\n", " if not model:\n", " model, tokenizer = load_model(inputs.get_properties())\n", "\n", " if inputs.is_empty():\n", " return None\n", " data = inputs.get_as_json()\n", " \n", " input_sentences = data[\"inputs\"]\n", " params = data[\"parameters\"]\n", " \n", " # preprocess\n", " input_ids = tokenizer(input_sentences, return_tensors=\"pt\").input_ids\n", " # pass inputs with all kwargs in data\n", " if params is not None:\n", " outputs = model.generate(input_ids, **params)\n", " else:\n", " outputs = model.generate(input_ids)\n", "\n", " # postprocess the prediction\n", " prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", " \n", " result = {\"outputs\": prediction}\n", " return Output().add_as_json(result)" ] }, { "cell_type": "markdown", "id": "23a3125b-fddf-458e-ad21-9fa61d923606", "metadata": {}, "source": [ "We will use LMI (large model inference) container on SageMaker to serve the LLM." ] }, { "cell_type": "code", "execution_count": null, "id": "879f42da-c2dd-47a3-96ad-43dde83fdfd5", "metadata": { "tags": [] }, "outputs": [], "source": [ "import sagemaker\n", "\n", "sess = sagemaker.Session()\n", "region = sess._region_name\n", "\n", "inference_image_uri = (\n", " f\"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.0-cu117\"\n", ")\n", "\n", "print(f\"Image going to be used is ---- > {inference_image_uri}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "7a7070fe-3a46-4f28-976b-33da12a1880f", "metadata": { "tags": [] }, "outputs": [], "source": [ "!rm model-liang.tar.gz\n", "!tar czvf model-liang.tar.gz -C deploy_code ." ] }, { "cell_type": "code", "execution_count": null, "id": "265902c9-6406-45b7-b75f-2c1c5d86c65b", "metadata": { "tags": [] }, "outputs": [], "source": [ "s3_code_prefix = 'code_flan_t5_LMI_liang'\n", "bucket = sess.default_bucket() \n", "s3_code_artifact = sess.upload_data(\"model-liang.tar.gz\", bucket, s3_code_prefix)\n", "print(f\"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "f0d7c064-964d-4c09-8592-2806e6325ed0", "metadata": { "tags": [] }, "outputs": [], "source": [ "from sagemaker.utils import name_from_base\n", "import boto3\n", "sm_client = boto3.client(\"sagemaker\")\n", "smr_client = boto3.client(\"sagemaker-runtime\")\n", "\n", "model_name = name_from_base(f\"flan-t5-xxl-accelerate-LMI\")\n", "print(model_name)\n", "print(f\"Image going to be used is ---- > {inference_image_uri}\")\n", "\n", "create_model_response = sm_client.create_model(\n", " ModelName=model_name,\n", " ExecutionRoleArn=role,\n", " PrimaryContainer={\n", " \"Image\": inference_image_uri,\n", " \"ModelDataUrl\": s3_code_artifact\n", " },\n", " \n", ")\n", "model_arn = create_model_response[\"ModelArn\"]\n", "\n", "print(f\"Created Model: {model_arn}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "76f48151-f61d-4e27-8671-12108a9a882c", "metadata": { "tags": [] }, "outputs": [], "source": [ "endpoint_config_name = f\"{model_name}-config-88\"\n", "endpoint_name = f\"{model_name}-endpoint\"\n", "\n", "endpoint_config_response = sm_client.create_endpoint_config(\n", " EndpointConfigName=endpoint_config_name,\n", " ProductionVariants=[\n", " {\n", " \"VariantName\": \"variant1\",\n", " \"ModelName\": model_name,\n", " \"InstanceType\": \"ml.g5.48xlarge\",\n", " \"InitialInstanceCount\": 1,\n", " #\"ModelDataDownloadTimeoutInSeconds\": 2400,\n", " \"ContainerStartupHealthCheckTimeoutInSeconds\": 2400,\n", " },\n", " ],\n", ")\n", "endpoint_config_response" ] }, { "cell_type": "code", "execution_count": null, "id": "04c1882e-32c3-41c6-befd-ad9907d0b340", "metadata": { "tags": [] }, "outputs": [], "source": [ "create_endpoint_response = sm_client.create_endpoint(\n", " EndpointName=f\"{endpoint_name}\", EndpointConfigName=endpoint_config_name\n", ")\n", "print(f\"Created Endpoint: {create_endpoint_response['EndpointArn']}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "4c9520f0-c04b-4f59-8eab-9a5b287112a7", "metadata": { "tags": [] }, "outputs": [], "source": [ "import time\n", "\n", "resp = sm_client.describe_endpoint(EndpointName=endpoint_name)\n", "status = resp[\"EndpointStatus\"]\n", "print(\"Status: \" + status)\n", "\n", "while status == \"Creating\":\n", " time.sleep(60)\n", " resp = sm_client.describe_endpoint(EndpointName=endpoint_name)\n", " status = resp[\"EndpointStatus\"]\n", " print(\"Status: \" + status)\n", "\n", "print(\"Arn: \" + resp[\"EndpointArn\"])\n", "print(\"Status: \" + status)" ] }, { "cell_type": "markdown", "id": "d96b9462-a2d8-4cda-a8cd-e068992f3dee", "metadata": {}, "source": [ "Use the low level boto3 API to generate context." ] }, { "cell_type": "code", "execution_count": null, "id": "3900294a-bea9-481d-a822-e72b69ce18fc", "metadata": { "tags": [] }, "outputs": [], "source": [ "%%time\n", "import json\n", "import boto3\n", "\n", "smr_client = boto3.client(\"sagemaker-runtime\")\n", "\n", "prompts = \"\"\"Summarize the following news article:\n", "Peter and Elizabeth took a taxi to attend the night party in the city. While in the party, Elizabeth collapsed and was rushed to the hospital.\n", "Since she was diagnosed with a brain injury, the doctor told Peter to stay besides her until she gets well. Therefore, Peter stayed with her at the hospital for 3 days without leaving.\n", "Summary:\n", "\"\"\"\n", "\n", "parameters = {\n", " #\"early_stopping\": True,\n", " #\"length_penalty\": 2.0,\n", " \"max_new_tokens\": 50,\n", " \"temperature\": 0,\n", " \"min_length\": 10,\n", " \"no_repeat_ngram_size\": 2,\n", "}\n", "\n", "\n", "response_model = smr_client.invoke_endpoint(\n", " EndpointName=endpoint_name,\n", " Body=json.dumps(\n", " {\n", " \"inputs\": prompts,\n", " \"parameters\": parameters\n", " }\n", " ),\n", " ContentType=\"application/json\",\n", " )\n", "\n", "response_model['Body'].read().decode('utf8')" ] }, { "cell_type": "code", "execution_count": null, "id": "0d5acf45-1187-4cce-a57f-9cc499891f52", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "availableInstances": [ { "_defaultOrder": 0, "_isFastLaunch": true, "category": "General purpose", "gpuNum": 0, "memoryGiB": 4, "name": "ml.t3.medium", "vcpuNum": 2 }, { "_defaultOrder": 1, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 8, "name": "ml.t3.large", "vcpuNum": 2 }, { "_defaultOrder": 2, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 16, "name": "ml.t3.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 3, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 32, "name": "ml.t3.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 4, "_isFastLaunch": true, "category": "General purpose", "gpuNum": 0, "memoryGiB": 8, "name": "ml.m5.large", "vcpuNum": 2 }, { "_defaultOrder": 5, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 16, "name": "ml.m5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 6, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 32, "name": "ml.m5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 7, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 64, "name": "ml.m5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 8, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 128, "name": "ml.m5.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 9, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 192, "name": "ml.m5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 10, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 256, "name": "ml.m5.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 11, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 384, "name": "ml.m5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 12, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 8, "name": "ml.m5d.large", "vcpuNum": 2 }, { "_defaultOrder": 13, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 16, "name": "ml.m5d.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 14, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 32, "name": "ml.m5d.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 15, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 64, "name": "ml.m5d.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 16, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 128, "name": "ml.m5d.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 17, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 192, "name": "ml.m5d.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 18, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 256, "name": "ml.m5d.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 19, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 384, "name": "ml.m5d.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 20, "_isFastLaunch": true, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 4, "name": "ml.c5.large", "vcpuNum": 2 }, { "_defaultOrder": 21, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 8, "name": "ml.c5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 22, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 16, "name": "ml.c5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 23, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 32, "name": "ml.c5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 24, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 72, "name": "ml.c5.9xlarge", "vcpuNum": 36 }, { "_defaultOrder": 25, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 96, "name": "ml.c5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 26, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 144, "name": "ml.c5.18xlarge", "vcpuNum": 72 }, { "_defaultOrder": 27, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 192, "name": "ml.c5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 28, "_isFastLaunch": true, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 16, "name": "ml.g4dn.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 29, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 32, "name": "ml.g4dn.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 30, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 64, "name": "ml.g4dn.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 31, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 128, "name": "ml.g4dn.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 32, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "memoryGiB": 192, "name": "ml.g4dn.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 33, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 256, "name": "ml.g4dn.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 34, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 61, "name": "ml.p3.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 35, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "memoryGiB": 244, "name": "ml.p3.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 36, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "memoryGiB": 488, "name": "ml.p3.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 37, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "memoryGiB": 768, "name": "ml.p3dn.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 38, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 16, "name": "ml.r5.large", "vcpuNum": 2 }, { "_defaultOrder": 39, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 32, "name": "ml.r5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 40, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 64, "name": "ml.r5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 41, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 128, "name": "ml.r5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 42, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 256, "name": "ml.r5.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 43, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 384, "name": "ml.r5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 44, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 512, "name": "ml.r5.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 45, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 768, "name": "ml.r5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 46, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 16, "name": "ml.g5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 47, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 32, "name": "ml.g5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 48, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 64, "name": "ml.g5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 49, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 128, "name": "ml.g5.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 50, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 256, "name": "ml.g5.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 51, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "memoryGiB": 192, "name": "ml.g5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 52, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "memoryGiB": 384, "name": "ml.g5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 53, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "memoryGiB": 768, "name": "ml.g5.48xlarge", "vcpuNum": 192 } ], "instance_type": "ml.m5.large", "kernelspec": { "display_name": "Python 3 (Data Science)", "language": "python", "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 5 }