{ "cells": [ { "cell_type": "markdown", "id": "2521fea6", "metadata": {}, "source": [ "# TEST - Run the preprocessing step on AWS Batch\n", "This notebook will run preprocess.py on the AWS Batch container using the default API.\n", "\n", "#### Dataset\n", "We will use the census dataset from `sagemaker-examples` for this demo. If you wish to test with another dataset, you will need to modify the logic within preprocess.py.\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "e54db787", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import utils\n", "import boto3\n", "import json\n", "import sagemaker\n", "import uuid" ] }, { "cell_type": "code", "execution_count": 2, "id": "9a701fc6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "region: ap-southeast-1\n", "bucket: sagemaker-to-batch\n", "account 239577782971\n" ] } ], "source": [ "# Change `bucket` to the name of the S3 bucket that was created.\n", "bucket = 'REPLACE ME'\n", "\n", "# Fail fast: the placeholder is not a legal S3 bucket name (space, upper-case letters),\n", "# so stop here with a clear message rather than carrying it into the rest of the notebook.\n", "if bucket == 'REPLACE ME':\n", "    raise ValueError('Set `bucket` to the name of the S3 bucket that was created before running this notebook.')\n", "\n", "# Region is read from the default SageMaker session; the account ID from the STS caller identity.\n", "region = sagemaker.Session().boto_region_name\n", "account = boto3.client('sts').get_caller_identity().get('Account')\n", "\n", "# Show the resolved configuration.\n", "print('region:', region)\n", "print('bucket:', bucket)\n", "print('account:', account)" ] }, { "cell_type": "markdown", "id": "83367ac7", "metadata": {}, "source": [ "# Download dataset locally" ] }, { "cell_type": "code", "execution_count": 3, "id": "61936011", "metadata": {}, "outputs": [], "source": [ "utils.mkpath_if_not_exist('data')" ] }, { "cell_type": "code", "execution_count": 4, "id": "b72b8cd0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | age | \n", "class of worker | \n", "detailed industry recode | \n", "detailed occupation recode | \n", "education | \n", "wage per hour | \n", "enroll in edu inst last wk | \n", "marital stat | \n", "major industry code | \n", "major occupation code | \n", "... | \n", "country of birth father | \n", "country of birth mother | \n", "country of birth self | \n", "citizenship | \n", "own business or self employed | \n", "fill inc questionnaire for veteran's admin | \n", "veterans benefits | \n", "weeks worked in year | \n", "year | \n", "income | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "73 | \n", "Not in universe | \n", "0 | \n", "0 | \n", "High school graduate | \n", "0 | \n", "Not in universe | \n", "Widowed | \n", "Not in universe or children | \n", "Not in universe | \n", "... | \n", "United-States | \n", "United-States | \n", "United-States | \n", "Native- Born in the United States | \n", "0 | \n", "Not in universe | \n", "2 | \n", "0 | \n", "95 | \n", "- 50000. | \n", "
1 | \n", "58 | \n", "Self-employed-not incorporated | \n", "4 | \n", "34 | \n", "Some college but no degree | \n", "0 | \n", "Not in universe | \n", "Divorced | \n", "Construction | \n", "Precision production craft & repair | \n", "... | \n", "United-States | \n", "United-States | \n", "United-States | \n", "Native- Born in the United States | \n", "0 | \n", "Not in universe | \n", "2 | \n", "52 | \n", "94 | \n", "- 50000. | \n", "
2 | \n", "18 | \n", "Not in universe | \n", "0 | \n", "0 | \n", "10th grade | \n", "0 | \n", "High school | \n", "Never married | \n", "Not in universe or children | \n", "Not in universe | \n", "... | \n", "Vietnam | \n", "Vietnam | \n", "Vietnam | \n", "Foreign born- Not a citizen of U S | \n", "0 | \n", "Not in universe | \n", "2 | \n", "0 | \n", "95 | \n", "- 50000. | \n", "
3 | \n", "9 | \n", "Not in universe | \n", "0 | \n", "0 | \n", "Children | \n", "0 | \n", "Not in universe | \n", "Never married | \n", "Not in universe or children | \n", "Not in universe | \n", "... | \n", "United-States | \n", "United-States | \n", "United-States | \n", "Native- Born in the United States | \n", "0 | \n", "Not in universe | \n", "0 | \n", "0 | \n", "94 | \n", "- 50000. | \n", "
4 | \n", "10 | \n", "Not in universe | \n", "0 | \n", "0 | \n", "Children | \n", "0 | \n", "Not in universe | \n", "Never married | \n", "Not in universe or children | \n", "Not in universe | \n", "... | \n", "United-States | \n", "United-States | \n", "United-States | \n", "Native- Born in the United States | \n", "0 | \n", "Not in universe | \n", "0 | \n", "0 | \n", "94 | \n", "- 50000. | \n", "
5 rows × 42 columns
\n", "