# Train a scikit-learn model in SageMaker and track it with MLflow

## Set up the environment

In [None]:
!pip install -q --upgrade pip
!pip install -q --upgrade sagemaker==2.117.0

In [None]:
import sagemaker
import pandas as pd
from sklearn.datasets import load_boston
from sagemaker.sklearn.estimator import SKLearn
from sklearn.model_selection import train_test_split

# URI of your remote MLflow tracking server -- fill this in before training.
tracking_uri = ''

# SageMaker session, the execution role attached to this notebook, and the
# session's default S3 bucket (used below as the upload destination).
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()

## Prepare data
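We load the Boston housing dataset from scikit-learn, split it into training (75%) and test (25%) sets, write each set to a CSV file, and upload both files to S3.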

In [None]:
# Load the Boston housing dataset shipped with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell requires scikit-learn < 1.2 -- confirm the kernel's version.
data = load_boston()


def _with_target(features, target):
    """Build a DataFrame from `features`, named by the dataset's feature names, plus a 'target' column."""
    frame = pd.DataFrame(features, columns=data.feature_names)
    frame['target'] = target
    return frame


# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=42,
)

trainX = _with_target(X_train, y_train)
testX = _with_target(X_test, y_test)

# Write both splits to local CSV files (to_csv also writes the row index by default).
trainX.to_csv('boston_train.csv')
testX.to_csv('boston_test.csv')

In [None]:
# Upload both CSV files to S3 under a common key prefix; the training job reads
# its input from these S3 locations. Files are uploaded in order: train, then test.
s3_prefix = 'sagemaker/sklearncontainer'
train_path, test_path = (
    sess.upload_data(path=csv_file, bucket=bucket, key_prefix=s3_prefix)
    for csv_file in ('boston_train.csv', 'boston_test.csv')
)

## Train

In [None]:
# Hyperparameters handed to the estimator, grouped by concern and merged in
# the order: tracking, model, dataset columns.
# NOTE(review): the key names presumably match the arguments train.py expects -- verify against the script.
tracking_settings = {
    'tracking_uri': tracking_uri,
    'experiment_name': 'boston-housing',
}
model_settings = {
    'n-estimators': 100,
    'min-samples-leaf': 3,
}
column_settings = {
    'features': 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',
    'target': 'target',
}
hyperparameters = {**tracking_settings, **model_settings, **column_settings}

# Metric named 'median-AE': the regex captures the number that follows
# "AE-at-50th-percentile: " in a log line.
metric_definitions = [
    {
        'Name': 'median-AE',
        'Regex': "AE-at-50th-percentile: ([0-9.]+).*$",
    },
]

# Estimator configuration: entry script train.py inside source_dir, run on a
# single ml.m5.xlarge instance with framework version 1.0-1.
estimator_config = dict(
    entry_point='train.py',
    source_dir='source_dir',
    role=role,
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    framework_version='1.0-1',
    base_job_name='mlflow',
)
estimator = SKLearn(**estimator_config)

In [None]:
# Start training, passing the uploaded S3 locations as the 'train' and 'test' inputs.
input_channels = {'train': train_path, 'test': test_path}
estimator.fit(input_channels)