{ "cells": [ { "cell_type": "markdown", "id": "2521fea6", "metadata": {}, "source": [ "# TEST - Run the preprocessing step on AWS Batch\n", "This notebook will run preprocess.py on the AWS Batch container using the default API.\n", "\n", "#### Dataset\n", "We will use the census dataset from `sagemaker-examples` for this demo. If you wish to test with another dataset, you will need to modify the logic within preprocess.py.\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "e54db787", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import utils\n", "import boto3\n", "import json\n", "import sagemaker\n", "import uuid" ] }, { "cell_type": "code", "execution_count": 2, "id": "9a701fc6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "region: ap-southeast-1\n", "bucket: sagemaker-to-batch\n", "account 239577782971\n" ] } ], "source": [ "# change bucket name to the name of the S3 bucket that was created\n", "bucket = 'REPLACE ME'\n", "region = sagemaker.Session().boto_region_name\n", "account = boto3.client('sts').get_caller_identity().get('Account')\n", "\n", "# PRINT\n", "print('region:', region)\n", "print('bucket:', bucket)\n", "print('account', account)" ] }, { "cell_type": "markdown", "id": "83367ac7", "metadata": {}, "source": [ "# Download dataset locally" ] }, { "cell_type": "code", "execution_count": 3, "id": "61936011", "metadata": {}, "outputs": [], "source": [ "utils.mkpath_if_not_exist('data')" ] }, { "cell_type": "code", "execution_count": 4, "id": "b72b8cd0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ageclass of workerdetailed industry recodedetailed occupation recodeeducationwage per hourenroll in edu inst last wkmarital statmajor industry codemajor occupation code...country of birth fathercountry of birth mothercountry of birth selfcitizenshipown business or self employedfill inc questionnaire for veteran's adminveterans benefitsweeks worked in yearyearincome
073Not in universe00High school graduate0Not in universeWidowedNot in universe or childrenNot in universe...United-StatesUnited-StatesUnited-StatesNative- Born in the United States0Not in universe2095- 50000.
158Self-employed-not incorporated434Some college but no degree0Not in universeDivorcedConstructionPrecision production craft & repair...United-StatesUnited-StatesUnited-StatesNative- Born in the United States0Not in universe25294- 50000.
218Not in universe0010th grade0High schoolNever marriedNot in universe or childrenNot in universe...VietnamVietnamVietnamForeign born- Not a citizen of U S0Not in universe2095- 50000.
39Not in universe00Children0Not in universeNever marriedNot in universe or childrenNot in universe...United-StatesUnited-StatesUnited-StatesNative- Born in the United States0Not in universe0094- 50000.
410Not in universe00Children0Not in universeNever marriedNot in universe or childrenNot in universe...United-StatesUnited-StatesUnited-StatesNative- Born in the United States0Not in universe0094- 50000.
\n", "

5 rows × 42 columns

\n", "
" ], "text/plain": [ " age class of worker detailed industry recode \\\n", "0 73 Not in universe 0 \n", "1 58 Self-employed-not incorporated 4 \n", "2 18 Not in universe 0 \n", "3 9 Not in universe 0 \n", "4 10 Not in universe 0 \n", "\n", " detailed occupation recode education wage per hour \\\n", "0 0 High school graduate 0 \n", "1 34 Some college but no degree 0 \n", "2 0 10th grade 0 \n", "3 0 Children 0 \n", "4 0 Children 0 \n", "\n", " enroll in edu inst last wk marital stat major industry code \\\n", "0 Not in universe Widowed Not in universe or children \n", "1 Not in universe Divorced Construction \n", "2 High school Never married Not in universe or children \n", "3 Not in universe Never married Not in universe or children \n", "4 Not in universe Never married Not in universe or children \n", "\n", " major occupation code ... country of birth father \\\n", "0 Not in universe ... United-States \n", "1 Precision production craft & repair ... United-States \n", "2 Not in universe ... Vietnam \n", "3 Not in universe ... United-States \n", "4 Not in universe ... United-States \n", "\n", " country of birth mother country of birth self \\\n", "0 United-States United-States \n", "1 United-States United-States \n", "2 Vietnam Vietnam \n", "3 United-States United-States \n", "4 United-States United-States \n", "\n", " citizenship own business or self employed \\\n", "0 Native- Born in the United States 0 \n", "1 Native- Born in the United States 0 \n", "2 Foreign born- Not a citizen of U S 0 \n", "3 Native- Born in the United States 0 \n", "4 Native- Born in the United States 0 \n", "\n", " fill inc questionnaire for veteran's admin veterans benefits \\\n", "0 Not in universe 2 \n", "1 Not in universe 2 \n", "2 Not in universe 2 \n", "3 Not in universe 0 \n", "4 Not in universe 0 \n", "\n", " weeks worked in year year income \n", "0 0 95 - 50000. \n", "1 52 94 - 50000. \n", "2 0 95 - 50000. \n", "3 0 94 - 50000. \n", "4 0 94 - 50000. \n", "\n", "[5 rows x 42 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s3 = boto3.client(\"s3\")\n", "s3.download_file(\n", " \"sagemaker-sample-data-{}\".format(region),\n", " \"processing/census/census-income.csv\",\n", " \"data/census-income.csv\",\n", ")\n", "df = pd.read_csv(\"data/census-income.csv\")\n", "df.to_csv(\"data/dataset.csv\")\n", "df.head()" ] }, { "cell_type": "markdown", "id": "7914e3a0", "metadata": {}, "source": [ "# Upload Input data and code to S3" ] }, { "cell_type": "code", "execution_count": 5, "id": "5abc75a1", "metadata": {}, "outputs": [], "source": [ "s3.upload_file('data/dataset.csv',bucket,'data/sample/census/dataset.csv', )" ] }, { "cell_type": "code", "execution_count": 6, "id": "2489818f", "metadata": {}, "outputs": [], "source": [ "s3.upload_file('preprocess.py',bucket,'code-repo/sagemaker-process-code/preprocess.py')\n", "s3.upload_file('sagemaker_entry_point.py',bucket,'code-repo/sagemaker-process-code/sagemaker_entry_point.py')\n", "s3.upload_file('utils.py',bucket,'code-repo/sagemaker-process-code/utils.py')" ] }, { "cell_type": "markdown", "id": "e031903c", "metadata": {}, "source": [ "# Run AWS Batch Job" ] }, { "cell_type": "code", "execution_count": 7, "id": "6b0ba5cf", "metadata": {}, "outputs": [], "source": [ "def create_unique_job_name(base_job_name, max_length=60):\n", " import uuid\n", " from datetime import datetime\n", " datetime_uuid=datetime.strftime(datetime.now(), \"%Y%m%d%H%S\")\n", " return f'{base_job_name}-{datetime_uuid}'[:max_length]\n", "\n", "def get_account_details():\n", " account = boto3.client('sts').get_caller_identity().get('Account')\n", " region = boto3.session.Session().region_name\n", " env= 'dev' if account =='REPLACE ME' else 'prod'\n", " \n", " return account,region,env\n", "\n", "def run_process_job(train_test_split,\n", " validation_flag,\n", " job_queue,\n", " job_definition,\n", " batch_platform='arm64'\n", " ):\n", " \n", " batch = boto3.client('batch')\n", " \n", " account,region,env = get_account_details()\n", " # Change input_bucket, code_bucket and output_bucket to the name of your S3 bucket\n", " INPUT_BUCKET= 'REPLACE ME'\n", " CODE_BUCKET = 'REPLACE ME'\n", " OUTPUT_BUCKET = 'REPLACE ME'\n", " PREPROCESING_ENTRYPOINT = f\"s3://{CODE_BUCKET}/code-repo/sagemaker-process-code/sagemaker_entry_point.py\"\n", "\n", " #AWS BATCH PARAMETERS\n", " job_name=create_unique_job_name(base_job_name=f'data-process-aws-batch-run'.replace('_','-'),max_length=60)\n", " \n", " parameters = {\n", " 'train-test-split' : train_test_split,\n", " 'validataion-flag' : validation_flag,\n", " 'env': env,\n", " 'entry-point': PREPROCESING_ENTRYPOINT,\n", " 'input-bucket': INPUT_BUCKET,\n", " 'output-bucket': OUTPUT_BUCKET,\n", " 'code-bucket': CODE_BUCKET,\n", " }\n", "\n", " try:\n", " # Submit a Batch Job\n", " response = batch.submit_job(jobQueue=job_queue, \n", " jobName=job_name,\n", " jobDefinition=job_definition, \n", " parameters=parameters)\n", " # Log response from AWS Batch\n", " print(f\"Job submitted for jobName={job_name} for parameters={parameters}, Response: \" + json.dumps(response, indent=2))\n", " # Return the jobId\n", " jobId = response['jobId']\n", " return jobId\n", " except Exception as e:\n", " print(e)\n", " message = f'Error submitting Batch Job for jobName={job_name} for parameters={parameters}'\n", " print(message)\n", " raise Exception(message)" ] }, { "cell_type": "code", "execution_count": 8, "id": "4616c781", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Job submitted for jobName=data-process-aws-batch-run-202206281019 for parameters={'train-test-split': '0.2', 'validataion-flag': 'True', 'env': 'dev', 'region': 'ap-southeast-1', 'entry-point': 's3://sagemaker-to-batch/code-repo/sagemaker-process-code/sagemaker_entry_point.py', 'input-bucket': 'sagemaker-to-batch', 'output-bucket': 'sagemaker-to-batch', 'code-bucket': 'sagemaker-to-batch'}, Response: {\n", " \"ResponseMetadata\": {\n", " \"RequestId\": \"cd5cb545-4c32-4671-a6a7-9166ae565f2b\",\n", " \"HTTPStatusCode\": 200,\n", " \"HTTPHeaders\": {\n", " \"date\": \"Tue, 28 Jun 2022 02:18:20 GMT\",\n", " \"content-type\": \"application/json\",\n", " \"content-length\": \"194\",\n", " \"connection\": \"keep-alive\",\n", " \"x-amzn-requestid\": \"cd5cb545-4c32-4671-a6a7-9166ae565f2b\",\n", " \"access-control-allow-origin\": \"*\",\n", " \"x-amz-apigw-id\": \"UaS06HNiSQ0FaMQ=\",\n", " \"access-control-expose-headers\": \"X-amzn-errortype,X-amzn-requestid,X-amzn-errormessage,X-amzn-trace-id,X-amz-apigw-id,date\",\n", " \"x-amzn-trace-id\": \"Root=1-62ba64ec-7faa4c1e192174fb6971e284\"\n", " },\n", " \"RetryAttempts\": 0\n", " },\n", " \"jobArn\": \"arn:aws:batch:ap-southeast-1:239577782971:job/daa6b300-774b-4885-bfca-243b18cfd4f1\",\n", " \"jobName\": \"data-process-aws-batch-run-202206281019\",\n", " \"jobId\": \"daa6b300-774b-4885-bfca-243b18cfd4f1\"\n", "}\n" ] } ], "source": [ "#ARM 64 JOB QUEUE\n", "job_queue=f'arn:aws:batch:{region}:{account}:job-queue/aws-dev-aws-batch-arm64-job-queue'\n", "job_definition=f'arn:aws:batch:{region}:{account}:job-definition/aws-dev-aws-batch-arm64-job-definition'\n", " \n", "job_id = run_process_job(train_test_split=\"0.2\",\n", " validation_flag=\"True\",\n", " job_queue=job_queue,\n", " job_definition=job_definition,\n", " )\n", " " ] }, { "cell_type": "markdown", "id": "0e541ba1", "metadata": {}, "source": [ "------" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.8.9 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.9" }, "vscode": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" } } }, "nbformat": 4, "nbformat_minor": 5 }