# Tune a scikit-learn model in SageMaker and track it with MLflow

## Setup environment

In [None]:
import sagemaker
import pandas as pd
from sklearn.datasets import load_boston
from sagemaker.sklearn.estimator import SKLearn
from sklearn.model_selection import train_test_split
from sagemaker.tuner import IntegerParameter, HyperparameterTuner

# SageMaker session: used below to resolve the default bucket and upload data.
sess = sagemaker.Session()
# IAM role handed to the estimator for the training jobs.
# NOTE(review): get_execution_role() presumably needs to run inside a
# SageMaker notebook/Studio environment -- confirm for local use.
role = sagemaker.get_execution_role()
# S3 bucket that receives the training and test CSV files.
bucket = sess.default_bucket()

# URI of your remote MLflow tracking server.
# It is forwarded to the training jobs as the 'tracking_uri' hyperparameter,
# so fill it in before running the tuning cells.
tracking_uri = ''

## Prepare data
We load a dataset from scikit-learn, split it into training and test sets, and upload both to S3.

In [None]:
# Boston housing regression dataset.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs against an older scikit-learn release.
data = load_boston()

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=42,
)

# One DataFrame per split: the named feature columns plus a 'target' column.
trainX = pd.DataFrame(X_train, columns=data.feature_names).assign(target=y_train)
testX = pd.DataFrame(X_test, columns=data.feature_names).assign(target=y_test)

# Written to the working directory so the next cell can upload them to S3.
# to_csv keeps its default index=True, so the row index is written as an
# extra unnamed first column -- presumably train.py selects columns by name;
# verify before changing this.
trainX.to_csv('boston_train.csv')
testX.to_csv('boston_test.csv')

In [None]:
# Upload both CSV files under the same S3 key prefix; SageMaker reads its
# training data from S3. The returned values are passed to tuner.fit() as the
# 'train' and 'test' channels.
train_path, test_path = (
    sess.upload_data(path=csv_file, bucket=bucket, key_prefix='sagemaker/sklearncontainer')
    for csv_file in ('boston_train.csv', 'boston_test.csv')
)

## Tune

In [None]:
# Static hyperparameters handed to the estimator (the same for every training job).
hyperparameters = {
    # MLflow tracking server the training script should log to.
    'tracking_uri': tracking_uri,
    # MLflow experiment name -- presumably created/selected by train.py; verify.
    'experiment_name': 'boston-housing',
    # Space-separated names of the feature columns in the uploaded CSV files.
    'features': 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',
    # Name of the label column added during data preparation.
    'target': 'target',
}

# Metric scraped from the training job logs: the first capture group of the
# regex is the numeric value reported after "AE-at-50th-percentile: ".
metric_definitions = [
    {
        'Name': 'median-AE',
        'Regex': r"AE-at-50th-percentile: ([0-9.]+).*$",
    },
]

In [None]:
# scikit-learn estimator: runs source_dir/train.py on a single ml.m5.xlarge
# instance, with the static hyperparameters and the log-scraped metric
# definitions attached.
estimator = SKLearn(
    # Training code
    entry_point='train.py',
    source_dir='source_dir',
    # Container version
    framework_version='1.0-1',
    py_version='py3',
    # Execution environment
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    # Job configuration
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
)

# Integer search space explored by the tuner.
# NOTE(review): the hyphenated names presumably match the command-line
# arguments parsed by train.py -- confirm against the script.
hyperparameter_ranges = {
    'n-estimators': IntegerParameter(50, 200),
    'min-samples-leaf': IntegerParameter(1, 10),
}

# Tuning objective: minimize the 'median-AE' metric (the name must match an
# entry in the metric definitions).
objective_metric_name, objective_type = 'median-AE', 'Minimize'

In [None]:
# Tuning job: at most 20 training jobs in total, 2 running at a time,
# optimizing objective_metric_name in the direction given by objective_type.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=20,
    max_parallel_jobs=2,
    base_tuning_job_name='mlflow',
)

# Start tuning; the 'train' and 'test' channels point at the CSV files uploaded to S3.
tuner.fit(inputs={'train': train_path, 'test': test_path})