{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train a Scikit-Learn model in SageMaker and track with MLFlow"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup Environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q --upgrade pip\n",
    "!pip install -q --upgrade sagemaker==2.117.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sagemaker\n",
    "import pandas as pd\n",
    "from sklearn.datasets import load_boston\n",
    "from sagemaker.sklearn.estimator import SKLearn\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "sess = sagemaker.Session()\n",
    "role = sagemaker.get_execution_role()\n",
    "bucket = sess.default_bucket()\n",
    "\n",
    "# uri of your remote mlflow server\n",
    "tracking_uri = '<YOUR MLFLOW SERVER URI>' "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare data\n",
    "We load a dataset from sklearn, split it and send it to S3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we use the Boston housing dataset \n",
    "data = load_boston()\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=42)\n",
    "\n",
    "trainX = pd.DataFrame(X_train, columns=data.feature_names)\n",
    "trainX['target'] = y_train\n",
    "\n",
    "testX = pd.DataFrame(X_test, columns=data.feature_names)\n",
    "testX['target'] = y_test\n",
    "\n",
    "trainX.to_csv('boston_train.csv')\n",
    "testX.to_csv('boston_test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# send data to S3. SageMaker will take training data from s3\n",
    "train_path = sess.upload_data(path='boston_train.csv', bucket=bucket, key_prefix='sagemaker/sklearncontainer')\n",
    "test_path = sess.upload_data(path='boston_test.csv', bucket=bucket, key_prefix='sagemaker/sklearncontainer')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hyperparameters = {\n",
    "    'tracking_uri': tracking_uri,\n",
    "    'experiment_name': 'boston-housing',\n",
    "    'n-estimators': 100,\n",
    "    'min-samples-leaf': 3,\n",
    "    'features': 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',\n",
    "    'target': 'target'\n",
    "}\n",
    "\n",
    "metric_definitions = [{'Name': 'median-AE', 'Regex': \"AE-at-50th-percentile: ([0-9.]+).*$\"}]\n",
    "\n",
    "estimator = SKLearn(\n",
    "    entry_point='train.py',\n",
    "    source_dir='source_dir',\n",
    "    role=role,\n",
    "    metric_definitions=metric_definitions,\n",
    "    hyperparameters=hyperparameters,\n",
    "    instance_count=1,\n",
    "    instance_type='ml.m5.xlarge',\n",
    "    framework_version='1.0-1',\n",
    "    base_job_name='mlflow',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "estimator.fit({'train':train_path, 'test': test_path})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  },
  "vscode": {
   "interpreter": {
    "hash": "3b41de70bedc0e302a3aeb58a0c77b854f2e56c8930e61a4aaa3340c96b01f1d"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}