{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Configuration Management" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook is designed to be run with the `Python 3 (Data Science)` kernel." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Input & Output Configuration" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the following notebooks, we will use an S3 bucket to store raw data, processed data, features, and trained models. Therefore we retrieve the configuration, which we built and stored earlier in the Parameter Store." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Imports" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import boto3\n", "\n", "session = boto3.Session()\n", "ssm = session.client('ssm')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bucket = ssm.get_parameter(Name=\"/aik/data-bucket\")[\"Parameter\"][\"Value\"]\n", "bucket" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now that we have the base bucket, we can prepare the various S3 prefixes that will be used for data processing and model training later on." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# download URL for the example data set\n", "download_url = \"https://www.kaggle.com/lastsummer/ipinyou/download\"\n", "\n", "# destination where we store the raw data\n", "raw_data = \"s3://\" + bucket + \"/raw/ipinyou-data\"\n", "# a subset of the raw data to speed up processing and training during development\n", "bid_source = \"s3://\" + bucket + \"/raw/ipinyou-data/training1st/bid.20130311.txt.bz2\"\n", "imp_source = \"s3://\" + bucket + \"/raw/ipinyou-data/training1st/imp.20130311.txt.bz2\"\n", "\n", "# output destinations for data processing\n", "output_train = \"s3://\" + bucket + \"/processed/sample/train\"\n", "output_test = \"s3://\" + bucket + \"/processed/sample/test\"\n", "output_verify = \"s3://\" + bucket + \"/processed/sample/valid\"\n", "output_transformed = \"s3://\" + bucket + \"/processed/sample/transformed\"\n", "\n", "# model artifacts and inference inputs\n", "pipelineModelArtifactPath = \"s3://\" + bucket + \"/pipeline-model/model.zip\"\n", "inference_data = \"s3://\" + bucket + \"/pipeline-model/inference-data/\"\n", "inference_schema = \"s3://\" + bucket + \"/pipeline-model/pipeline-schema.json\"\n", "binary_model = \"s3://\" + bucket + \"/binary-model/xgboost.bin\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Store Parameters in the Systems Manager Parameter Store"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The AWS Systems Manager Parameter Store is a good way to store these parameters centrally so that the following notebooks can read them back." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ssm.put_parameter(Name=\"/aik/download_url\", Value=download_url, Type=\"String\", Overwrite=True)\n", "ssm.put_parameter(Name=\"/aik/raw_data\", Value=raw_data, Type=\"String\", Overwrite=True)\n", "ssm.put_parameter(Name=\"/aik/bid_source\", Value=bid_source, Type=\"String\", Overwrite=True)\n", "ssm.put_parameter(Name=\"/aik/imp_source\", Value=imp_source, Type=\"String\", Overwrite=True)\n", "ssm.put_parameter(Name=\"/aik/output_train\", Value=output_train, Type=\"String\", Overwrite=True)\n", "ssm.put_parameter(Name=\"/aik/output_test\", Value=output_test, Type=\"String\", Overwrite=True)\n", "ssm.put_parameter(Name=\"/aik/output_verify\", Value=output_verify, Type=\"String\", Overwrite=True)\n", "ssm.put_parameter(Name=\"/aik/output_transformed\", Value=output_transformed, Type=\"String\", Overwrite=True)\n", "ssm.put_parameter(Name=\"/aik/pipelineModelArtifactPath\", Value=pipelineModelArtifactPath, Type=\"String\", Overwrite=True)\n", "ssm.put_parameter(Name=\"/aik/inference_data\", Value=inference_data, Type=\"String\", Overwrite=True)\n", "ssm.put_parameter(Name=\"/aik/xgboost/path\", Value=binary_model, Type=\"String\", Overwrite=True)\n", "ssm.put_parameter(Name=\"/aik/pipelineModelArtifactSchemaPath\", Value=inference_schema, Type=\"String\", Overwrite=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read Parameters from the Systems Manager Parameter Store" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bucket = ssm.get_parameter(Name=\"/aik/data-bucket\")[\"Parameter\"][\"Value\"]\n", "download_url = ssm.get_parameter(Name=\"/aik/download_url\")[\"Parameter\"][\"Value\"]\n", "raw_data = ssm.get_parameter(Name=\"/aik/raw_data\")[\"Parameter\"][\"Value\"]\n", "bid_source = ssm.get_parameter(Name=\"/aik/bid_source\")[\"Parameter\"][\"Value\"]\n", "imp_source = ssm.get_parameter(Name=\"/aik/imp_source\")[\"Parameter\"][\"Value\"]\n", "output_train = ssm.get_parameter(Name=\"/aik/output_train\")[\"Parameter\"][\"Value\"]\n", "output_test = ssm.get_parameter(Name=\"/aik/output_test\")[\"Parameter\"][\"Value\"]\n", "output_verify = ssm.get_parameter(Name=\"/aik/output_verify\")[\"Parameter\"][\"Value\"]\n", "output_transformed = ssm.get_parameter(Name=\"/aik/output_transformed\")[\"Parameter\"][\"Value\"]\n", "pipelineModelArtifactPath = ssm.get_parameter(Name=\"/aik/pipelineModelArtifactPath\")[\"Parameter\"][\"Value\"]\n", "inference_data = ssm.get_parameter(Name=\"/aik/inference_data\")[\"Parameter\"][\"Value\"]\n", "binary_model = ssm.get_parameter(Name=\"/aik/xgboost/path\")[\"Parameter\"][\"Value\"]\n", "inference_schema = ssm.get_parameter(Name=\"/aik/pipelineModelArtifactSchemaPath\")[\"Parameter\"][\"Value\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Print current configuration" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f'bucket={bucket}')\n", "print(f'download_url={download_url}')\n", "print(f'raw_data={raw_data}')\n", "print(f'bid_source={bid_source}')\n", "print(f'imp_source={imp_source}')\n", "print(f'output_train={output_train}')\n", 
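"# inference and model artifact locations read back from the Parameter Store\n", "print(f'inference_data={inference_data}')\n", "print(f'binary_model={binary_model}')\n", "print(f'inference_schema={inference_schema}')\n", 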
"print(f'output_verify={output_verify}')\n", "print(f'output_test={output_test}')\n", "print(f'output_transformed={output_transformed}')\n", "print(f'pipelineModelArtifactPath={pipelineModelArtifactPath}')" ] } ], "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "Python 3 (Data Science)", "language": "python", "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:eu-west-1:470317259841:image/datascience-1.0" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 4 }