{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train a Scikit-Learn model in SageMaker and track with MLflow"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup Environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use %pip (not !pip) so packages are installed into the environment of the running kernel.\n",
    "%pip install -q --upgrade pip\n",
    "%pip install -q --upgrade sagemaker==2.117.0\n",
    "# load_boston was removed in scikit-learn 1.2, so keep the local version below that.\n",
    "%pip install -q \"scikit-learn<1.2\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import sagemaker\n",
    "from sagemaker.sklearn.estimator import SKLearn\n",
    "from sklearn.datasets import load_boston\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "sess = sagemaker.Session()\n",
    "role = sagemaker.get_execution_role()\n",
    "bucket = sess.default_bucket()\n",
    "\n",
    "# URI of your remote MLflow tracking server; fill this in before launching the training job\n",
    "tracking_uri = ''"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare data\n",
    "We load a dataset from sklearn, split it and send it to S3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We use the Boston housing dataset\n",
    "data = load_boston()\n",
    "\n",
    "# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible\n",
    "X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=42)\n",
    "\n",
    "trainX = pd.DataFrame(X_train, columns=data.feature_names)\n",
    "trainX['target'] = y_train\n",
    "\n",
    "testX = pd.DataFrame(X_test, columns=data.feature_names)\n",
    "testX['target'] = y_test\n",
    "\n",
    "# NOTE(review): to_csv also writes the DataFrame index as an unnamed first column -- confirm train.py tolerates it\n",
    "trainX.to_csv('boston_train.csv')\n",
    "testX.to_csv('boston_test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Send data to S3. SageMaker will take training data from S3\n",
    "train_path = sess.upload_data(path='boston_train.csv', bucket=bucket, key_prefix='sagemaker/sklearncontainer')\n",
    "test_path = sess.upload_data(path='boston_test.csv', bucket=bucket, key_prefix='sagemaker/sklearncontainer')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Settings handed to the estimator as hyperparameters; 'features' is a space-separated list of column names\n",
    "hyperparameters = {\n",
    "    'tracking_uri': tracking_uri,\n",
    "    'experiment_name': 'boston-housing',\n",
    "    'n-estimators': 100,\n",
    "    'min-samples-leaf': 3,\n",
    "    'features': 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',\n",
    "    'target': 'target'\n",
    "}\n",
    "\n",
    "# The regex captures the numeric value reported after 'AE-at-50th-percentile: ' as the 'median-AE' metric\n",
    "metric_definitions = [{'Name': 'median-AE', 'Regex': \"AE-at-50th-percentile: ([0-9.]+).*$\"}]\n",
    "\n",
    "estimator = SKLearn(\n",
    "    entry_point='train.py',\n",
    "    source_dir='source_dir',\n",
    "    role=role,\n",
    "    metric_definitions=metric_definitions,\n",
    "    hyperparameters=hyperparameters,\n",
    "    instance_count=1,\n",
    "    instance_type='ml.m5.xlarge',\n",
    "    framework_version='1.0-1',\n",
    "    base_job_name='mlflow',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "estimator.fit({'train':train_path, 'test': test_path})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  },
  "vscode": {
   "interpreter": {
    "hash": "3b41de70bedc0e302a3aeb58a0c77b854f2e56c8930e61a4aaa3340c96b01f1d"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}