{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tune a Scikit-Learn model in SageMaker and track with MLflow\n",
    "\n",
    "This notebook splits the Boston housing data into train/test sets, uploads both to S3 and launches a SageMaker hyperparameter tuning job. The MLflow tracking URI and experiment name are passed to the training script (`source_dir/train.py`) as hyperparameters."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sagemaker\n",
    "from sagemaker.sklearn.estimator import SKLearn\n",
    "from sagemaker.tuner import HyperparameterTuner, IntegerParameter\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "try:\n",
    "    from sklearn.datasets import load_boston\n",
    "except ImportError:\n",
    "    # load_boston was removed in scikit-learn 1.2; the data cell then reads the original file instead\n",
    "    load_boston = None\n",
    "\n",
    "sess = sagemaker.Session()\n",
    "role = sagemaker.get_execution_role()\n",
    "bucket = sess.default_bucket()\n",
    "\n",
    "# key prefix under which the CSV files are stored in the default bucket\n",
    "S3_PREFIX = 'sagemaker/sklearncontainer'\n",
    "\n",
    "# URI of your remote MLflow server (must be filled in before running the Tune section)\n",
    "tracking_uri = ''"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare data\n",
    "We load the Boston housing dataset, split it into train and test sets and send both to S3.\n",
    "\n",
    "`load_boston` is used when the installed scikit-learn still provides it; otherwise the data is parsed from the original source file, following the recipe given in scikit-learn's removal notice."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "BOSTON_URL = 'http://lib.stat.cmu.edu/datasets/boston'\n",
    "BOSTON_FEATURES = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']\n",
    "\n",
    "\n",
    "def load_boston_arrays():\n",
    "    \"\"\"Return the Boston housing data as a (features, target) pair of arrays.\n",
    "\n",
    "    Uses sklearn's `load_boston` when it was importable; otherwise downloads and\n",
    "    parses the original file from BOSTON_URL (requires network access).\n",
    "    \"\"\"\n",
    "    if load_boston is not None:\n",
    "        boston = load_boston()\n",
    "        return boston.data, boston.target\n",
    "    # In the raw file every record is spread over two consecutive rows\n",
    "    raw = pd.read_csv(BOSTON_URL, sep=r'\\s+', skiprows=22, header=None).values\n",
    "    return np.hstack([raw[::2, :], raw[1::2, :2]]), raw[1::2, 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features, target = load_boston_arrays()\n",
    "assert features.shape[1] == len(BOSTON_FEATURES), 'unexpected number of feature columns'\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=42)\n",
    "\n",
    "trainX = pd.DataFrame(X_train, columns=BOSTON_FEATURES)\n",
    "trainX['target'] = y_train\n",
    "\n",
    "testX = pd.DataFrame(X_test, columns=BOSTON_FEATURES)\n",
    "testX['target'] = y_test\n",
    "\n",
    "trainX.to_csv('boston_train.csv')\n",
    "testX.to_csv('boston_test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# send data to S3. SageMaker will take training data from S3\n",
    "train_path = sess.upload_data(path='boston_train.csv', bucket=bucket, key_prefix=S3_PREFIX)\n",
    "test_path = sess.upload_data(path='boston_test.csv', bucket=bucket, key_prefix=S3_PREFIX)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tune"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fail fast: an empty URI would otherwise be handed to every training job of the tuning run\n",
    "if not tracking_uri:\n",
    "    raise ValueError('Set `tracking_uri` in the setup cell to the URI of your MLflow server before tuning.')\n",
    "\n",
    "# static hyperparameters passed unchanged to every training job\n",
    "hyperparameters = {\n",
    "    'tracking_uri': tracking_uri,\n",
    "    'experiment_name': 'boston-housing',\n",
    "    'features': ' '.join(BOSTON_FEATURES),\n",
    "    'target': 'target'\n",
    "}\n",
    "\n",
    "# regex SageMaker applies to the training logs to extract the objective metric\n",
    "metric_definitions = [{'Name': 'median-AE', 'Regex': \"AE-at-50th-percentile: ([0-9.]+).*$\"}]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator = SKLearn(\n",
    "    entry_point='train.py',\n",
    "    source_dir='source_dir',\n",
    "    role=role,\n",
    "    instance_count=1,\n",
    "    instance_type='ml.m5.xlarge',\n",
    "    hyperparameters=hyperparameters,\n",
    "    metric_definitions=metric_definitions,\n",
    "    framework_version='1.0-1',\n",
    "    py_version='py3'\n",
    ")\n",
    "\n",
    "# search space explored by the tuner\n",
    "hyperparameter_ranges = {\n",
    "    'n-estimators': IntegerParameter(50, 200),\n",
    "    'min-samples-leaf': IntegerParameter(1, 10)\n",
    "}\n",
    "\n",
    "objective_metric_name = 'median-AE'\n",
    "objective_type = 'Minimize'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tuner = HyperparameterTuner(\n",
    "    estimator,\n",
    "    objective_metric_name,\n",
    "    hyperparameter_ranges,\n",
    "    metric_definitions=metric_definitions,\n",
    "    max_jobs=20,\n",
    "    max_parallel_jobs=2,\n",
    "    objective_type=objective_type,\n",
    "    base_tuning_job_name='mlflow'\n",
    ")\n",
    "\n",
    "tuner.fit({'train': train_path, 'test': test_path})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  },
  "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.",
  "vscode": {
   "interpreter": {
    "hash": "3b41de70bedc0e302a3aeb58a0c77b854f2e56c8930e61a4aaa3340c96b01f1d"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}