In [None]:
from sagemaker import AutoML

# This script expects an input of a CSV file with a header.
# It reads that input file, saves the header as a variable, strips it out,
# and then uploads the header-less CSV back to S3 for the AutoPilot
# system to process as an input.

# You only need to modify these three lines to customize to your env

# --- User-supplied settings -------------------------------------------------
autopilot_experiment_name = "<YOUR_EXPERIMENT_NAME>"
holdout_sample_bucket = '<YOUR_HOLDOUT_SAMPLE_BUCKET_NAME>'
holdout_sample_key = '<YOUR_HOLDOUT_SAMPLE_KEY_NAME>'

# --- Derived S3 locations (no need to edit) ---------------------------------
# Full URI of the original holdout CSV (the one that still has its header).
holdout_sample_path = f's3://{holdout_sample_bucket}/{holdout_sample_key}'

# The header-less copy sits next to the original, with a ".no_header" suffix.
holdout_sample_no_header_key = f'{holdout_sample_key}.no_header'
holdout_sample_no_header_full_path = f'{holdout_sample_path}.no_header'

In [None]:
def process_csv_file(holdout_sample_bucket, holdout_sample_key, holdout_sample_no_header_key):
    """Split the header off a CSV stored in S3 and upload the header-less body.

    Args:
        holdout_sample_bucket: Name of the S3 bucket holding the CSV.
        holdout_sample_key: Key of the source CSV (first line is the header).
        holdout_sample_no_header_key: Key, in the same bucket, that the
            header-less CSV body is written to.

    Returns:
        The list of header column names with 'Prediction' and
        'PredictionProb' appended.

    Raises:
        ValueError: If the source object has no header line at all.
    """
    import boto3

    s3_resource = boto3.resource('s3')

    # copy the file locally

    print("Copying file locally...", end='')
    obj = s3_resource.Object(holdout_sample_bucket, holdout_sample_key)
    # 'utf-8-sig' decodes plain UTF-8 too, but drops a leading BOM so it does
    # not end up glued to the first column name.
    temp_file = obj.get()['Body'].read().decode('utf-8-sig')
    print("done.")

    # Split once into header line and remaining body. partition() yields an
    # empty body when the file has no newline (header-only file) instead of
    # raising IndexError the way split(...)[1] would.
    header_line, _, headerless_csv_body = temp_file.partition('\n')

    # Windows line endings leave a '\r' on the header line; strip it so the
    # last column name stays clean.
    header_line = header_line.rstrip('\r')
    if not header_line.strip():
        raise ValueError(
            f"s3://{holdout_sample_bucket}/{holdout_sample_key} has no header line"
        )

    # grab and save the header

    csv_header = header_line.split(',')
    print(f'Retrieved header: {csv_header}')

    # we'll append these additional columns to the header for later
    csv_header += ['Prediction', 'PredictionProb']

    # upload back to S3 headerless

    print('Writing headerless output CSV...', end='')
    headerless_csv_object = s3_resource.Object(holdout_sample_bucket, holdout_sample_no_header_key)
    headerless_csv_object.put(Body=headerless_csv_body)
    print("done.")

    return csv_header

In [None]:
# Strip the header from the holdout CSV in S3 and keep the column names
# (plus the two prediction columns) for labelling the transform output later.
csv_header = process_csv_file(
    holdout_sample_bucket=holdout_sample_bucket,
    holdout_sample_key=holdout_sample_key,
    holdout_sample_no_header_key=holdout_sample_no_header_key,
)

In [None]:
# Attach to the existing Autopilot job named in the configuration cell.
automl = AutoML.attach(auto_ml_job_name=autopilot_experiment_name)

# Optional candidate override read by the model-creation cell below;
# None keeps the job's best candidate. Uncomment the second line to
# request a specific candidate by name.
candidate_name = None
# candidate_name = "<YOUR_CANDIDATE_NAME>"

In [None]:
# Look up the job description and its candidates (best objective value first).
automl_experiment = automl.describe_auto_ml_job()
all_candidates = automl.list_candidates(sort_by='FinalObjectiveMetricValue',
                                        sort_order='Descending',
                                        max_results=100)
best_candidate = automl_experiment['BestCandidate']
best_candidate_name = best_candidate['CandidateName']
inference_response_keys = ["predicted_label", "probability"]

# Default to the job's best candidate; switch to the requested one only when
# `candidate_name` is set AND matches one of the listed candidates.
selected_candidate = best_candidate
if candidate_name is not None:
    requested_candidate = next(
        (candidate for candidate in all_candidates
         if candidate['CandidateName'] == candidate_name),
        None,
    )
    if requested_candidate is None:
        # Fall back rather than abort, but say so loudly: every metric below
        # would otherwise silently describe a different model than requested.
        print(f"WARNING: candidate '{candidate_name}' was not found among the "
              f"{len(all_candidates)} listed candidates; using the best candidate "
              f"'{best_candidate_name}' instead.")
    else:
        selected_candidate = requested_candidate

# Create the model exactly once, for whichever candidate was selected.
print(f"Creating model from candidate: {selected_candidate['CandidateName']}")
model = automl.create_model(name=selected_candidate['CandidateName'],
                            candidate=selected_candidate,
                            inference_response_keys=inference_response_keys)
    

In [None]:
import IPython
from IPython.display import HTML
from urllib.parse import urlparse
import boto3
s3 = boto3.resource('s3')

# Build the S3 URI of the best candidate's report:
#   <ModelInsights location>/<candidate name>/report.pdf
best_candidate_name = best_candidate['CandidateName']
model_artifact_loc = best_candidate['CandidateProperties']['CandidateArtifactLocations']['ModelInsights']
best_model_artifact_loc = f'{model_artifact_loc}/{best_candidate_name}'
model_insights_pdf_report = f'{best_model_artifact_loc}/report.pdf'
model_insights_pdf_report_uri = urlparse(model_insights_pdf_report)

# netloc is the bucket; the path minus its leading "/" is the object key.
report_bucket = model_insights_pdf_report_uri.netloc
report_key = model_insights_pdf_report_uri.path[1:]

content_object = s3.Object(report_bucket, report_key)
file_content = content_object.get()['Body'].read()

print(report_bucket)
IPython.display.display_pdf(file_content, raw=True)

In [None]:
# Batch-transform configuration: results are written under the holdout
# sample's own S3 path as CSV, reassembled line by line.
transformer = model.transformer(
    instance_count=1,
    instance_type='ml.m5.xlarge',
    assemble_with='Line',
    output_path=holdout_sample_path,
    accept="text/csv",
)

# Run the header-less CSV through the model without blocking (wait=False).
# join_source="Input" asks for each input record to be joined with its
# prediction in the output.
# NOTE(review): input_filter is a hard-coded column slice, presumably matching
# this dataset's feature columns; adjust it for other data.
transformer.transform(
    data=holdout_sample_no_header_full_path,
    split_type='Line',
    content_type='text/csv',
    input_filter='$[0:7]',
    join_source="Input",
    wait=False,
)

print(f"Starting transform job {transformer._current_job_name}")

In [None]:
import s3fs
from time import sleep
import pandas as pd

# Reuse the SageMaker client of the session that launched the job, so the
# status lookup always targets the job's own region instead of a hard-coded one.
sm = transformer.sagemaker_session.sagemaker_client
transform_job_name = transformer._current_job_name

# Poll until the job leaves 'InProgress'. The status is refreshed after each
# sleep, so the loop exits as soon as a finished state is seen.
desc = sm.describe_transform_job(TransformJobName=transform_job_name)
while desc['TransformJobStatus']=='InProgress':
    print(desc['TransformJobName']+' is in progress')
    sleep(5)
    desc = sm.describe_transform_job(TransformJobName=transform_job_name)

print(f"{desc['TransformJobStatus']}")

# Stop here on anything but success: continuing would hit an undefined
# `output_df`, or display a stale one left over from an earlier run.
if desc['TransformJobStatus']!='Completed':
    raise RuntimeError(
        f"{desc['TransformJobName']} ended with status "
        f"{desc['TransformJobStatus']}: "
        f"{desc.get('FailureReason', 'no failure reason reported')}"
    )

path = desc['TransformOutput']['S3OutputPath']
s3 = s3fs.S3FileSystem(anon=False)

# get data file names (rstrip avoids a double slash if the path ends with '/')
filenames = s3.glob(path.rstrip('/') + "/*.out")
if not filenames:
    raise FileNotFoundError(f"No '*.out' transform results found under {path}")

# One frame per result file; the files have no header row.
dfs = [pd.read_csv('s3://'+filename, header=None) for filename in filenames]
output_df = pd.concat(dfs, ignore_index=True)

output_df

In [None]:
# Label the joined transform output: original input columns followed by the
# two prediction columns appended to `csv_header`.
output_df.columns = csv_header

# The transform input had its header stripped before upload, so row 0 is
# normally real data and must be kept. Drop it only when it is a stray copy of
# the header. The check also makes this cell safe to re-run: it never removes
# more than that one header row.
input_column_count = len(csv_header) - 2
first_row_is_header = (
    len(output_df) > 0
    and output_df.iloc[0, :input_column_count].astype(str).tolist()
    == csv_header[:input_column_count]
)
if first_row_is_header:
    output_df = output_df.iloc[1:, :]

output_df



In [None]:
# Let pandas pick the best-fitting dtype for every column, then list them.
df = output_df.convert_dtypes()
df.dtypes

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

# Confusion matrix of actual vs. predicted labels on the holdout data.
# confusion_matrix puts true labels on the rows and predictions on the columns.
cm = confusion_matrix(output_df['Outcome'].astype(str).astype(int), output_df['Prediction'])
f = sns.heatmap(cm, annot=True, fmt='d')

# Heatmap rows are drawn along the y-axis and columns along the x-axis, so
# label them accordingly; without labels the plot is ambiguous.
f.set(xlabel='Predicted label',
      ylabel='Actual label',
      title='Holdout/Test Data - Confusion matrix')
plt.show()

In [None]:
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay


# ROC curve and AUC for the holdout data, using the predicted probability as score.
fpr, tpr, thresholds = metrics.roc_curve(output_df['Outcome'].astype(str).astype(int), output_df['PredictionProb'])
roc_auc = metrics.auc(fpr, tpr)

# Named `roc_display` (like `pr_display` in the next cell) rather than
# `display`, which would shadow IPython's display() for the rest of the
# kernel session.
roc_display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,estimator_name='Holdout/Test Data - ROC curve')
roc_display.plot()
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay

# Both metrics below use the same label and score series, so build them once.
holdout_labels = output_df['Outcome'].astype(str).astype(int)
holdout_scores = output_df['PredictionProb']

# Precision-recall curve plus its average-precision summary.
prec, recall, _ = precision_recall_curve(holdout_labels, holdout_scores)
average_precision = metrics.average_precision_score(holdout_labels, holdout_scores)

pr_display = PrecisionRecallDisplay(
    precision=prec,
    recall=recall,
    average_precision=average_precision,
    estimator_name='Holdout/Test Data - AUPRC curve',
)
pr_display.plot()
plt.show()