{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "9b262721-9ba7-40c2-acc4-96c41cf9230a",
   "metadata": {},
   "source": [
    "# MPT SageMaker Inference\n",
    "\n",
    "This notebook contains sample code for deploying MPT on SageMaker."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "32904a97-bedb-451a-b2cc-ac691f7708de",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Use %pip (not !pip) so the package is installed into the running kernel's environment.\n",
    "%pip install -U \"sagemaker>=2.143.0\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2edc8a17-9a6b-4dd6-823e-878f6b950ea2",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "import boto3\n",
    "import sagemaker\n",
    "from sagemaker import get_execution_role\n",
    "from sagemaker.huggingface import HuggingFace\n",
    "from sagemaker.pytorch.model import PyTorchModel\n",
    "\n",
    "# Execution role, AWS region, SageMaker session and the session's default bucket.\n",
    "role = get_execution_role()\n",
    "region = boto3.Session().region_name\n",
    "sess = sagemaker.Session()\n",
    "bucket = sess.default_bucket()\n",
    "\n",
    "sagemaker.__version__"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "44bf1cde-57d4-43f6-be6b-b09bd942219f",
   "metadata": {},
   "source": [
    "## Package and Upload Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "916cee87-d530-4e67-8fa6-18550ff92532",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Recreate an empty scripts/model directory, then archive the contents of scripts/\n",
    "# into package.tar.gz next to this notebook.\n",
    "!rm -rf scripts/model && mkdir scripts/model\n",
    "%cd scripts\n",
    "!tar -czvf ../package.tar.gz *\n",
    "%cd -"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e252d1ba-c402-4352-a0f5-c59dc51d6fc9",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Upload the archive under the \"MPT\" key prefix and show the resulting location.\n",
    "model_path = sess.upload_data(\"package.tar.gz\", bucket=bucket, key_prefix=\"MPT\")\n",
    "model_path"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cee0a96e-b185-4e2f-a8dd-75cbb086abfa",
   "metadata": {},
   "source": [
    "## Deploy Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8232665-f10e-4a74-84d6-d5c87640d1f2",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from sagemaker.async_inference import AsyncInferenceConfig\n",
    "from sagemaker.serializers import 
JSONSerializer\n",
    "from sagemaker.deserializers import JSONDeserializer\n",
    "\n",
    "endpoint_name = \"MPT-Inference\"\n",
    "\n",
    "# The model object is a PyTorchModel built from the uploaded package.\n",
    "pytorch_model = PyTorchModel(\n",
    "    model_data=model_path,\n",
    "    framework_version=\"1.13\",\n",
    "    py_version=\"py39\",\n",
    "    role=role,\n",
    "    name=endpoint_name,\n",
    "    env={\n",
    "        # Model settings are handed to the container as one JSON-encoded variable.\n",
    "        \"model_params\": json.dumps({\n",
    "            \"base_model\": \"mosaicml/mpt-7b-Instruct\",\n",
    "            \"lora_weights\": \"model\",  # path relative to model package\n",
    "            \"peft\": False,\n",
    "            \"load_8bit\": True,\n",
    "            \"trust_remote_code\": True,\n",
    "            \"prompt_template\": \"alpaca\",\n",
    "        }),\n",
    "        \"SAGEMAKER_MODEL_SERVER_TIMEOUT\": \"3600\",\n",
    "    },\n",
    ")\n",
    "\n",
    "# Deploy model to SageMaker Inference.\n",
    "# Both serializer and deserializer are set so the returned predictor sends and\n",
    "# parses JSON, matching the `predictor_client` built in the next section.\n",
    "predictor = pytorch_model.deploy(\n",
    "    initial_instance_count=1,\n",
    "    instance_type=\"ml.g5.2xlarge\",\n",
    "    endpoint_name=endpoint_name,\n",
    "    serializer=JSONSerializer(),\n",
    "    deserializer=JSONDeserializer(),\n",
    "    # async_inference_config=AsyncInferenceConfig()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fb4f6d9f-666d-40d6-b2c3-93ec553dded1",
   "metadata": {},
   "source": [
    "## Run Inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be982e42-166c-47fb-bc35-cc20ee6a5c4e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# With SageMaker SDK\n",
    "\n",
    "from sagemaker.predictor import Predictor\n",
    "from sagemaker.predictor_async import AsyncPredictor\n",
    "from sagemaker.serializers import JSONSerializer\n",
    "from sagemaker.deserializers import JSONDeserializer\n",
    "\n",
    "# Attach to the endpoint by name with JSON request/response handling.\n",
    "predictor_client = Predictor(\n",
    "    endpoint_name=endpoint_name,\n",
    "    sagemaker_session=sess,\n",
    "    serializer=JSONSerializer(),\n",
    "    deserializer=JSONDeserializer()\n",
    ")\n",
    "# predictor_client = AsyncPredictor(\n",
    "# predictor=predictor_client,\n",
    "# name=endpoint_name\n",
    "# )\n",
    "data = {\n",
    "    \"instruction\": \"When was George Washington president?\",\n",
    "    \"input\": \"\"\"George Washington (February 22, 1732[b] – December 14, 1799) was an American military officer, statesman,\n",
    "and Founding Father who served as the first president of the 
United States from 1789 to 1797.\"\"\",\n",
    "    \"max_new_tokens\": 64,\n",
    "    \"temperature\": 0.7,\n",
    "    \"do_sample\": True,\n",
    "    \"stop_ids\": [50278, 50279, 50277, 1, 0],\n",
    "}\n",
    "# Send the request through the SDK predictor and show the decoded reply.\n",
    "response = predictor_client.predict(data=data)\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "04c6e83b-e1c3-43f0-ab10-335a7a66ebd0",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# With Boto3\n",
    "\n",
    "import boto3\n",
    "import json\n",
    "\n",
    "endpoint_name = \"MPT-Inference\"\n",
    "sagemaker_client = boto3.client(\"sagemaker-runtime\")\n",
    "\n",
    "data = {\n",
    "    \"instruction\": \"When was George Washington president?\",\n",
    "    \"input\": \"\"\"George Washington (February 22, 1732[b] – December 14, 1799) was an American military officer, statesman,\n",
    "and Founding Father who served as the first president of the United States from 1789 to 1797.\"\"\",\n",
    "    \"max_new_tokens\": 64,\n",
    "    \"temperature\": 0.7,\n",
    "    \"do_sample\": True,\n",
    "    \"stop_ids\": [50278, 50279, 50277, 1, 0],\n",
    "}\n",
    "\n",
    "# Post the payload as JSON and request a JSON reply.\n",
    "response = sagemaker_client.invoke_endpoint(\n",
    "    EndpointName=endpoint_name,\n",
    "    ContentType=\"application/json\",\n",
    "    Accept=\"application/json\",\n",
    "    Body=json.dumps(data),\n",
    ")\n",
    "\n",
    "# Read the response body and decode it from JSON.\n",
    "result = json.loads(response[\"Body\"].read())\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8b2c563e-c996-419e-a250-439ba79fdfa5",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Benchmark Speed\n",
    "\n",
    "1.39 s ± 538 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0774860-2b17-42ca-80fc-7caf7fc6f14d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Time one request/response round trip; the reply itself is discarded.\n",
    "%timeit predictor_client.predict(data=data)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39737593-7053-4dbb-b39b-112e27d228a1",
   "metadata": {},
   "source": [
    "## Delete Endpoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82559f16-10f8-4573-91d7-e0b85cb7d0de",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Remove the model first, then the endpoint.\n",
    "# NOTE(review): keep this order - delete_model() appears to look the model up through\n",
    "# the endpoint configuration, which delete_endpoint() removes; confirm against the SDK docs.\n",
    "predictor.delete_model()\n",
    "predictor.delete_endpoint()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c75d2062-a88e-4c2f-97e9-8778ee231d29",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "availableInstances": [
   { "_defaultOrder": 0, "_isFastLaunch": true, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 4, "name": "ml.t3.medium", "vcpuNum": 2 },
   { "_defaultOrder": 1, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 8, "name": "ml.t3.large", "vcpuNum": 2 },
   { "_defaultOrder": 2, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.t3.xlarge", "vcpuNum": 4 },
   { "_defaultOrder": 3, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.t3.2xlarge", "vcpuNum": 8 },
   { "_defaultOrder": 4, "_isFastLaunch": true, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 8, "name": "ml.m5.large", "vcpuNum": 2 },
   { "_defaultOrder": 5, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.m5.xlarge", "vcpuNum": 4 },
   { "_defaultOrder": 6, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.m5.2xlarge", "vcpuNum": 8 },
   { 
"_defaultOrder": 7, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 64, "name": "ml.m5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 8, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 128, "name": "ml.m5.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 9, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 192, "name": "ml.m5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 10, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 256, "name": "ml.m5.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 11, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 384, "name": "ml.m5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 12, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 8, "name": "ml.m5d.large", "vcpuNum": 2 }, { "_defaultOrder": 13, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.m5d.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 14, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.m5d.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 15, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 64, "name": "ml.m5d.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 16, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 128, "name": "ml.m5d.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 17, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 192, "name": "ml.m5d.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 18, 
"_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 256, "name": "ml.m5d.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 19, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 384, "name": "ml.m5d.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 20, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": true, "memoryGiB": 0, "name": "ml.geospatial.interactive", "supportedImageNames": [ "sagemaker-geospatial-v1-0" ], "vcpuNum": 0 }, { "_defaultOrder": 21, "_isFastLaunch": true, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 4, "name": "ml.c5.large", "vcpuNum": 2 }, { "_defaultOrder": 22, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 8, "name": "ml.c5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 23, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.c5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 24, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.c5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 25, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 72, "name": "ml.c5.9xlarge", "vcpuNum": 36 }, { "_defaultOrder": 26, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 96, "name": "ml.c5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 27, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 144, "name": "ml.c5.18xlarge", "vcpuNum": 72 }, { "_defaultOrder": 28, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 192, "name": 
"ml.c5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 29, "_isFastLaunch": true, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.g4dn.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 30, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.g4dn.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 31, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 64, "name": "ml.g4dn.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 32, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 128, "name": "ml.g4dn.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 33, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "hideHardwareSpecs": false, "memoryGiB": 192, "name": "ml.g4dn.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 34, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 256, "name": "ml.g4dn.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 35, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 61, "name": "ml.p3.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 36, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "hideHardwareSpecs": false, "memoryGiB": 244, "name": "ml.p3.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 37, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "hideHardwareSpecs": false, "memoryGiB": 488, "name": "ml.p3.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 38, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "hideHardwareSpecs": false, "memoryGiB": 768, "name": "ml.p3dn.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 39, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, 
"hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.r5.large", "vcpuNum": 2 }, { "_defaultOrder": 40, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.r5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 41, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 64, "name": "ml.r5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 42, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 128, "name": "ml.r5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 43, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 256, "name": "ml.r5.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 44, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 384, "name": "ml.r5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 45, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 512, "name": "ml.r5.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 46, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 768, "name": "ml.r5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 47, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.g5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 48, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.g5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 49, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 64, "name": "ml.g5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 50, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, 
"hideHardwareSpecs": false, "memoryGiB": 128, "name": "ml.g5.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 51, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 256, "name": "ml.g5.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 52, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "hideHardwareSpecs": false, "memoryGiB": 192, "name": "ml.g5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 53, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "hideHardwareSpecs": false, "memoryGiB": 384, "name": "ml.g5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 54, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "hideHardwareSpecs": false, "memoryGiB": 768, "name": "ml.g5.48xlarge", "vcpuNum": 192 }, { "_defaultOrder": 55, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "hideHardwareSpecs": false, "memoryGiB": 1152, "name": "ml.p4d.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 56, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "hideHardwareSpecs": false, "memoryGiB": 1152, "name": "ml.p4de.24xlarge", "vcpuNum": 96 } ], "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "Python 3 (Data Science)", "language": "python", "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 5 }