{ "cells": [ { "cell_type": "markdown", "id": "e4d27ec7", "metadata": {}, "source": [ "# TEST - Run the preprocessing step on SageMaker Processing\n", "This notebook runs `preprocess.py` as a SageMaker Processing job on the standard scikit-learn container, using `SKLearnProcessor` from the SageMaker Python SDK.\n", "\n", "#### Dataset\n", "We will use the census dataset from `sagemaker-examples` for this demo, downloaded from the `sagemaker-sample-data-<region>` S3 bucket. If you wish to test with another dataset, you will need to modify the logic within `preprocess.py`.\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "33aae58b", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import utils\n", "import boto3\n", "import sagemaker\n", "import uuid\n", "from sagemaker.sklearn.processing import SKLearnProcessor\n", "from sagemaker.processing import ProcessingInput, ProcessingOutput" ] }, { "cell_type": "code", "execution_count": 2, "id": "76e92a03", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "role: arn:aws:iam::239577782971:role/andreac\n", "region: ap-southeast-1\n", "bucket: sagemaker-to-batch\n" ] } ], "source": [ "bucket = 'REPLACE ME'\n", "region = sagemaker.Session().boto_region_name\n", "\n", "# Create a new IAM role with only the AmazonS3FullAccess and AmazonSageMakerFullAccess managed policies attached, and paste its ARN here\n", "role = \"REPLACE ME\"\n", "\n", "# Print the configuration used by the rest of the notebook\n", "print('role:',role)\n", "print('region:', region)\n", "print('bucket:', bucket)" ] }, { "cell_type": "markdown", "id": "fded8640", "metadata": {}, "source": [ "# Download dataset locally" ] }, { "cell_type": "code", "execution_count": 3, "id": "25311f34", "metadata": {}, "outputs": [], "source": [ "utils.mkpath_if_not_exist('data')" ] }, { "cell_type": "code", "execution_count": 4, "id": "d79ec1ee", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ageclass of workerdetailed industry recodedetailed occupation recodeeducationwage per hourenroll in edu inst last wkmarital statmajor industry codemajor occupation code...country of birth fathercountry of birth mothercountry of birth selfcitizenshipown business or self employedfill inc questionnaire for veteran's adminveterans benefitsweeks worked in yearyearincome
073Not in universe00High school graduate0Not in universeWidowedNot in universe or childrenNot in universe...United-StatesUnited-StatesUnited-StatesNative- Born in the United States0Not in universe2095- 50000.
158Self-employed-not incorporated434Some college but no degree0Not in universeDivorcedConstructionPrecision production craft & repair...United-StatesUnited-StatesUnited-StatesNative- Born in the United States0Not in universe25294- 50000.
218Not in universe0010th grade0High schoolNever marriedNot in universe or childrenNot in universe...VietnamVietnamVietnamForeign born- Not a citizen of U S0Not in universe2095- 50000.
39Not in universe00Children0Not in universeNever marriedNot in universe or childrenNot in universe...United-StatesUnited-StatesUnited-StatesNative- Born in the United States0Not in universe0094- 50000.
410Not in universe00Children0Not in universeNever marriedNot in universe or childrenNot in universe...United-StatesUnited-StatesUnited-StatesNative- Born in the United States0Not in universe0094- 50000.
\n", "

5 rows × 42 columns

\n", "
" ], "text/plain": [ " age class of worker detailed industry recode \\\n", "0 73 Not in universe 0 \n", "1 58 Self-employed-not incorporated 4 \n", "2 18 Not in universe 0 \n", "3 9 Not in universe 0 \n", "4 10 Not in universe 0 \n", "\n", " detailed occupation recode education wage per hour \\\n", "0 0 High school graduate 0 \n", "1 34 Some college but no degree 0 \n", "2 0 10th grade 0 \n", "3 0 Children 0 \n", "4 0 Children 0 \n", "\n", " enroll in edu inst last wk marital stat major industry code \\\n", "0 Not in universe Widowed Not in universe or children \n", "1 Not in universe Divorced Construction \n", "2 High school Never married Not in universe or children \n", "3 Not in universe Never married Not in universe or children \n", "4 Not in universe Never married Not in universe or children \n", "\n", " major occupation code ... country of birth father \\\n", "0 Not in universe ... United-States \n", "1 Precision production craft & repair ... United-States \n", "2 Not in universe ... Vietnam \n", "3 Not in universe ... United-States \n", "4 Not in universe ... United-States \n", "\n", " country of birth mother country of birth self \\\n", "0 United-States United-States \n", "1 United-States United-States \n", "2 Vietnam Vietnam \n", "3 United-States United-States \n", "4 United-States United-States \n", "\n", " citizenship own business or self employed \\\n", "0 Native- Born in the United States 0 \n", "1 Native- Born in the United States 0 \n", "2 Foreign born- Not a citizen of U S 0 \n", "3 Native- Born in the United States 0 \n", "4 Native- Born in the United States 0 \n", "\n", " fill inc questionnaire for veteran's admin veterans benefits \\\n", "0 Not in universe 2 \n", "1 Not in universe 2 \n", "2 Not in universe 2 \n", "3 Not in universe 0 \n", "4 Not in universe 0 \n", "\n", " weeks worked in year year income \n", "0 0 95 - 50000. \n", "1 52 94 - 50000. \n", "2 0 95 - 50000. \n", "3 0 94 - 50000. \n", "4 0 94 - 50000. 
\n", "\n", "[5 rows x 42 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s3 = boto3.client(\"s3\")\n", "s3.download_file(\n", " \"sagemaker-sample-data-{}\".format(region),\n", " \"processing/census/census-income.csv\",\n", " \"data/census-income.csv\",\n", ")\n", "df = pd.read_csv(\"data/census-income.csv\")\n", "df.to_csv(\"data/dataset.csv\")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "7f2500ae", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "3467a4fb", "metadata": {}, "source": [ "# Upload Input data and code to S3" ] }, { "cell_type": "code", "execution_count": 9, "id": "7c701c4b", "metadata": {}, "outputs": [], "source": [ "s3.upload_file('data/dataset.csv',bucket,'data/sample/census/dataset.csv', )" ] }, { "cell_type": "code", "execution_count": 10, "id": "85289c2f", "metadata": {}, "outputs": [], "source": [ "s3.upload_file('preprocess.py',bucket,'code-repo/sagemaker-process-code/preprocess.py')\n", "s3.upload_file('sagemaker_entry_point.py',bucket,'code-repo/sagemaker-process-code/sagemaker_entry_point.py')\n", "s3.upload_file('utils.py',bucket,'code-repo/sagemaker-process-code/utils.py')\n", "s3.upload_file('row_function_multiproc.py', bucket, 'code-repo/sagemaker-process-code/row_function_multiproc.py')" ] }, { "cell_type": "markdown", "id": "d67df95f", "metadata": {}, "source": [ "# Run SageMaker Processing Job" ] }, { "cell_type": "code", "execution_count": null, "id": "60f74de6", "metadata": {}, "outputs": [], "source": [ "# pip install -U sagemaker" ] }, { "cell_type": "code", "execution_count": 5, "id": "df331c36", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "job_name: sagemaker-on-aws-batch-test-sm-processing-683cd9c1-8c44-413a-9\n", "\n", "Job Name: sagemaker-on-aws-batch-test-sm-processing-683cd9c1-8c44-413a-9\n", "Inputs: [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': 
{'S3Uri': 's3://sagemaker-to-batch/data/sample/census/dataset.csv', 'LocalPath': '/opt/ml/processing/input/one', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-to-batch/code-repo/sagemaker-process-code', 'LocalPath': '/opt/ml/processing/input/lib', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-to-batch/code-repo/sagemaker-process-code/sagemaker_entry_point.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", "Outputs: [{'OutputName': 'output-1', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-to-batch/output-data/sample/census/train', 'LocalPath': '/opt/ml/processing/output/data/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'output-2', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-to-batch/output-data/sample/census/validation', 'LocalPath': '/opt/ml/processing/output/data/validation', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'output-3', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-to-batch/output-data/sample/census/test', 'LocalPath': '/opt/ml/processing/output/data/test', 'S3UploadMode': 'EndOfJob'}}]\n", ".........................\u001b[34mReceived arguments Namespace(input_code='/opt/ml/processing/input/lib', input_data_one='/opt/ml/processing/input/one', output_data_test=PosixPath('/opt/ml/processing/output/data/test'), output_data_train=PosixPath('/opt/ml/processing/output/data/train'), output_data_validation=PosixPath('/opt/ml/processing/output/data/validation'), output_metrics=PosixPath('/opt/ml/processing/output/metrics'), train_test_split=0.2, 
validation_flag=True)\u001b[0m\n", "\u001b[34mCollecting annoy==1.17.0\n", " Downloading annoy-1.17.0.tar.gz (646 kB)\n", " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 646.2/646.2 kB 47.2 MB/s eta 0:00:00\n", " Preparing metadata (setup.py): started\n", " Preparing metadata (setup.py): finished with status 'done'\u001b[0m\n", "\u001b[34mBuilding wheels for collected packages: annoy\n", " Building wheel for annoy (setup.py): started\u001b[0m\n", "\u001b[34m Building wheel for annoy (setup.py): finished with status 'done'\n", " Created wheel for annoy: filename=annoy-1.17.0-cp38-cp38-linux_x86_64.whl size=394524 sha256=2ca843d44f9c6094753ae7dbf2d11e221d888c739fafe0fb2d3c3d639c3e3dbf\n", " Stored in directory: /root/.cache/pip/wheels/77/01/de/4421524f9997a25dfa7291121565d12ef514154945e80e907a\u001b[0m\n", "\u001b[34mSuccessfully built annoy\u001b[0m\n", "\u001b[34mInstalling collected packages: annoy\u001b[0m\n", "\u001b[34mSuccessfully installed annoy-1.17.0\u001b[0m\n", "\u001b[34mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n", "\u001b[34mShape of data is: (199523, 43)\u001b[0m\n", "\u001b[34mShape of train set is: (159618, 43)\u001b[0m\n", "\u001b[34mShape of test set is: (39905, 43)\u001b[0m\n", "\u001b[34mShape of train set is: (127694, 43)\u001b[0m\n", "\u001b[34mShape of test set is: (31924, 43)\u001b[0m\n", "\u001b[34mSuccessfully created directory /opt/ml/processing/output/data/train\u001b[0m\n", "\u001b[34mSuccessfully created directory /opt/ml/processing/output/data/validation\u001b[0m\n", "\u001b[34mSuccessfully created directory /opt/ml/processing/output/data/test\u001b[0m\n", "\u001b[34mSuccessfully created directory /opt/ml/processing/output/metrics\u001b[0m\n", "\u001b[34mWrote train.csv files successfully to /opt/ml/processing/output/data/train\u001b[0m\n", "\u001b[34mWrote validation.csv files successfully to /opt/ml/processing/output/data/validation\u001b[0m\n", "\u001b[34mWrote test.csv files successfully to /opt/ml/processing/output/data/test\u001b[0m\n", "\u001b[34mWrote preprocess_metrics.csv files successfully to /opt/ml/processing/output/metrics\u001b[0m\n", "\u001b[34mTime taken for Preprocessing completed = 4.29 secs\u001b[0m\n", "\n" ] } ], "source": [ "job_name = 'sagemaker-on-aws-batch-test-sm-processing-{}'.format(uuid.uuid4())[:62]\n", "print('job_name: ',job_name)\n", "\n", "sklearn_processor = SKLearnProcessor(\n", " framework_version=\"1.0-1\", \n", " role=role, \n", " instance_type=\"ml.m5.xlarge\", \n", " instance_count=1\n", ")\n", "\n", "sklearn_processor.run(\n", " job_name=job_name,\n", " code=f\"s3://{bucket}/code-repo/sagemaker-process-code/sagemaker_entry_point.py\",\n", " arguments = [\"--train-test-split\", \"0.2\",\n", " \"--validation-flag\",\"true\"\n", " ], \n", " inputs=[ProcessingInput(source=f\"s3://{bucket}/data/sample/census/dataset.csv\", \n", " destination=\"/opt/ml/processing/input/one\"\n", " ),\n", " 
ProcessingInput(source=f\"s3://{bucket}/code-repo/sagemaker-process-code\", \n", " destination=\"/opt/ml/processing/input/lib\"\n", " )\n", " ],\n", " outputs=[\n", " ProcessingOutput(source=\"/opt/ml/processing/output/data/train\",\n", " destination=f\"s3://{bucket}/output-data/sample/census/train\",\n", " ),\n", " ProcessingOutput(source=\"/opt/ml/processing/output/data/validation\",\n", " destination=f\"s3://{bucket}/output-data/sample/census/validation\",\n", " ),\n", " ProcessingOutput(source=\"/opt/ml/processing/output/data/test\",\n", " destination=f\"s3://{bucket}/output-data/sample/census/test\",\n", " ),\n", " ],\n", ")" ] }, { "cell_type": "markdown", "id": "1e165065", "metadata": {}, "source": [ "-----" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.8.9 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.9" }, "vscode": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" } } }, "nbformat": 4, "nbformat_minor": 5 }