{ "cells": [ { "cell_type": "markdown", "id": "e4d27ec7", "metadata": {}, "source": [ "# TEST - Run the preprocessing step on SageMaker Processing\n", "This notebook runs `preprocess.py` as a SageMaker Processing job on the standard scikit-learn container (`SKLearnProcessor`) using the default API.\n", "\n", "#### Dataset\n", "We will use the census dataset from `sagemaker-examples` for this demo. If you wish to test with another dataset, you will need to modify the logic within `preprocess.py`.\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "33aae58b", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import utils\n", "import boto3\n", "import sagemaker\n", "import uuid\n", "from sagemaker.sklearn.processing import SKLearnProcessor\n", "from sagemaker.processing import ProcessingInput, ProcessingOutput" ] }, { "cell_type": "code", "execution_count": 2, "id": "76e92a03", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "role: arn:aws:iam::ACCOUNT_ID:role/ROLE_NAME\n", "region: ap-southeast-1\n", "bucket: sagemaker-to-batch\n" ] } ], "source": [ "bucket = 'REPLACE ME'\n", "region = sagemaker.Session().boto_region_name\n", "\n", "# create a new role with only the permissions for S3FullAccess and SageMakerFullAccess, and paste the role ARN here\n", "role = \"REPLACE ME\"\n", "\n", "# PRINT\n", "print('role:',role)\n", "print('region:', region)\n", "print('bucket:', bucket)" ] }, { "cell_type": "markdown", "id": "fded8640", "metadata": {}, "source": [ "# Download dataset locally" ] }, { "cell_type": "code", "execution_count": 3, "id": "25311f34", "metadata": {}, "outputs": [], "source": [ "utils.mkpath_if_not_exist('data')" ] }, { "cell_type": "code", "execution_count": 4, "id": "d79ec1ee", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | age | \n", "class of worker | \n", "detailed industry recode | \n", "detailed occupation recode | \n", "education | \n", "wage per hour | \n", "enroll in edu inst last wk | \n", "marital stat | \n", "major industry code | \n", "major occupation code | \n", "... | \n", "country of birth father | \n", "country of birth mother | \n", "country of birth self | \n", "citizenship | \n", "own business or self employed | \n", "fill inc questionnaire for veteran's admin | \n", "veterans benefits | \n", "weeks worked in year | \n", "year | \n", "income | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "73 | \n", "Not in universe | \n", "0 | \n", "0 | \n", "High school graduate | \n", "0 | \n", "Not in universe | \n", "Widowed | \n", "Not in universe or children | \n", "Not in universe | \n", "... | \n", "United-States | \n", "United-States | \n", "United-States | \n", "Native- Born in the United States | \n", "0 | \n", "Not in universe | \n", "2 | \n", "0 | \n", "95 | \n", "- 50000. | \n", "
1 | \n", "58 | \n", "Self-employed-not incorporated | \n", "4 | \n", "34 | \n", "Some college but no degree | \n", "0 | \n", "Not in universe | \n", "Divorced | \n", "Construction | \n", "Precision production craft & repair | \n", "... | \n", "United-States | \n", "United-States | \n", "United-States | \n", "Native- Born in the United States | \n", "0 | \n", "Not in universe | \n", "2 | \n", "52 | \n", "94 | \n", "- 50000. | \n", "
2 | \n", "18 | \n", "Not in universe | \n", "0 | \n", "0 | \n", "10th grade | \n", "0 | \n", "High school | \n", "Never married | \n", "Not in universe or children | \n", "Not in universe | \n", "... | \n", "Vietnam | \n", "Vietnam | \n", "Vietnam | \n", "Foreign born- Not a citizen of U S | \n", "0 | \n", "Not in universe | \n", "2 | \n", "0 | \n", "95 | \n", "- 50000. | \n", "
3 | \n", "9 | \n", "Not in universe | \n", "0 | \n", "0 | \n", "Children | \n", "0 | \n", "Not in universe | \n", "Never married | \n", "Not in universe or children | \n", "Not in universe | \n", "... | \n", "United-States | \n", "United-States | \n", "United-States | \n", "Native- Born in the United States | \n", "0 | \n", "Not in universe | \n", "0 | \n", "0 | \n", "94 | \n", "- 50000. | \n", "
4 | \n", "10 | \n", "Not in universe | \n", "0 | \n", "0 | \n", "Children | \n", "0 | \n", "Not in universe | \n", "Never married | \n", "Not in universe or children | \n", "Not in universe | \n", "... | \n", "United-States | \n", "United-States | \n", "United-States | \n", "Native- Born in the United States | \n", "0 | \n", "Not in universe | \n", "0 | \n", "0 | \n", "94 | \n", "- 50000. | \n", "
5 rows × 42 columns
\n", "