# TEST - Run the preprocessing step on SageMaker Processing
This notebook runs `preprocess.py` as a SageMaker Processing job on the standard SKLearn processing container (`SKLearnProcessor`), using the default SageMaker Python SDK API.

#### Dataset
We will use the census dataset from `sagemaker-examples` for this demo. If you wish to test with another dataset, you will need to modify the logic within preprocess.py.



In [1]:
import pandas as pd
import utils
import boto3
import sagemaker
import uuid
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

In [2]:
# S3 bucket used by this notebook for the input data, the processing code and the job outputs.
bucket = 'REPLACE ME'
region = sagemaker.Session().boto_region_name

# Create a new IAM role with only the AmazonS3FullAccess and AmazonSageMakerFullAccess
# managed policies attached, and paste the role ARN here.
role = "REPLACE ME"

# Fail fast when a placeholder was left untouched; otherwise the mistake only
# surfaces later as a less obvious S3 / SageMaker API error.
_unset_settings = [
    name
    for name, value in (('bucket', bucket), ('role', role))
    if value.strip() == 'REPLACE ME'
]
if _unset_settings:
    raise ValueError(
        'Replace the placeholder value for: {}'.format(', '.join(_unset_settings))
    )

# PRINT
print('role:',role)
print('region:', region)
print('bucket:', bucket)

role: arn:aws:iam::239577782971:role/andreac
region: ap-southeast-1
bucket: sagemaker-to-batch


# Download dataset locally

In [3]:
# Prepare the local 'data' directory that the download step writes into.
# NOTE(review): presumably this creates the directory only when it is missing — confirm in utils.py.
utils.mkpath_if_not_exist('data')

In [4]:
# Fetch the census sample from the region-specific public sample-data bucket.
sample_data_bucket = "sagemaker-sample-data-{}".format(region)

s3 = boto3.client("s3")
s3.download_file(
    Bucket=sample_data_bucket,
    Key="processing/census/census-income.csv",
    Filename="data/census-income.csv",
)

# Re-save the file under the name used by the upload step.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is written
# as an extra unnamed first column — confirm preprocess.py expects that column.
df = pd.read_csv("data/census-income.csv")
df.to_csv("data/dataset.csv")

# Preview the first rows.
df.head()

Unnamed: 0,age,class of worker,detailed industry recode,detailed occupation recode,education,wage per hour,enroll in edu inst last wk,marital stat,major industry code,major occupation code,...,country of birth father,country of birth mother,country of birth self,citizenship,own business or self employed,fill inc questionnaire for veteran's admin,veterans benefits,weeks worked in year,year,income
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000.
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000.
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.


# Upload input data and code to S3

In [9]:
# Stage the locally prepared dataset in the bucket; the processing job reads it from this key.
s3.upload_file(
    Filename='data/dataset.csv',
    Bucket=bucket,
    Key='data/sample/census/dataset.csv',
)

In [10]:
# Upload every script under one shared S3 key prefix, keeping each local file name as the
# last key component.
code_key_prefix = 'code-repo/sagemaker-process-code'
for script_name in (
    'preprocess.py',
    'sagemaker_entry_point.py',
    'utils.py',
    'row_function_multiproc.py',
):
    s3.upload_file(script_name, bucket, f'{code_key_prefix}/{script_name}')

# Run SageMaker Processing Job

In [None]:
# Optional: upgrade the SageMaker Python SDK if needed. Uncomment the line below,
# run this cell once, then restart the kernel before continuing.
# %pip install -U sagemaker

In [5]:
# A random UUID suffix keeps the job name distinct between runs; the combined
# string is cut to 62 characters.
job_name = 'sagemaker-on-aws-batch-test-sm-processing-{}'.format(uuid.uuid4())[:62]
print('job_name: ',job_name)

sklearn_processor = SKLearnProcessor(
    framework_version="1.0-1",
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
)

# S3 and in-container locations shared by the inputs and outputs below.
code_s3_uri = f"s3://{bucket}/code-repo/sagemaker-process-code"
output_s3_uri = f"s3://{bucket}/output-data/sample/census"
container_output_dir = "/opt/ml/processing/output/data"

# Input 1 is the dataset; input 2 is the whole code prefix, mounted under
# /opt/ml/processing/input/lib inside the container.
processing_inputs = [
    ProcessingInput(
        source=f"s3://{bucket}/data/sample/census/dataset.csv",
        destination="/opt/ml/processing/input/one",
    ),
    ProcessingInput(
        source=code_s3_uri,
        destination="/opt/ml/processing/input/lib",
    ),
]

# One output per split, declared in the order train, validation, test.
processing_outputs = [
    ProcessingOutput(
        source=f"{container_output_dir}/{split}",
        destination=f"{output_s3_uri}/{split}",
    )
    for split in ("train", "validation", "test")
]

sklearn_processor.run(
    job_name=job_name,
    code=f"{code_s3_uri}/sagemaker_entry_point.py",
    arguments=["--train-test-split", "0.2", "--validation-flag", "true"],
    inputs=processing_inputs,
    outputs=processing_outputs,
)

job_name:  sagemaker-on-aws-batch-test-sm-processing-683cd9c1-8c44-413a-9

Job Name:  sagemaker-on-aws-batch-test-sm-processing-683cd9c1-8c44-413a-9
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-to-batch/data/sample/census/dataset.csv', 'LocalPath': '/opt/ml/processing/input/one', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-to-batch/code-repo/sagemaker-process-code', 'LocalPath': '/opt/ml/processing/input/lib', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-to-batch/code-repo/sagemaker-process-code/sagemaker_entry_point.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3Da

-----