{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Scientist - Raw Data\n",
    "***\n",
    "*This notebook should work well with the Python 3 (Data Science) kernel in SageMaker Studio*\n",
    "***\n",
    "\n",
    "For the demonstration workflow, you'll download synthetically generated data and upload it to the SageMaker Studio default S3 bucket."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Environment setup\n",
    "Import libraries, set up logging, and define a few variables."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import requests\n",
    "import sagemaker\n",
    "\n",
    "from pathlib import Path\n",
    "from urllib import parse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set up a logger"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name the logger after the running module (pass the variable __name__, not the literal string).\n",
    "logger = logging.getLogger(__name__)\n",
    "logger.setLevel(logging.INFO)\n",
    "\n",
    "# Attach the stream handler only once: re-running this cell would otherwise\n",
    "# stack another handler and print every log line multiple times.\n",
    "if not logger.handlers:\n",
    "    logger.addHandler(logging.StreamHandler())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define SageMaker and Boto3 sessions and a few additional parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sagemaker_session = sagemaker.Session()\n",
    "sagemaker_client = sagemaker_session.sagemaker_client\n",
    "\n",
    "# Reuse the Boto3 session and region held by the SageMaker session.\n",
    "boto_session = sagemaker_session.boto_session\n",
    "region = sagemaker_session.boto_region_name\n",
    "role = sagemaker.get_execution_role()\n",
    "\n",
    "s3_uploader = sagemaker.s3.S3Uploader\n",
    "\n",
    "# Upload destination: the session's default bucket, under a fixed key prefix.\n",
    "bucket = sagemaker_session.default_bucket()\n",
    "prefix = \"mlops-demo\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Download\n",
    "The inputs for building our model and workflow are two tables of insurance data: a claims table and a customers table."
]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "base_url = \"https://raw.githubusercontent.com/aws/amazon-sagemaker-examples/main/end_to_end/fraud_detection/data/\"\n",
    "file_list = [\"claims.csv\", \"customers.csv\"]\n",
    "feature_eng_base_path = Path(\"feature_engineering\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "local_path = Path(\"data\")\n",
    "local_path.mkdir(exist_ok=True)\n",
    "\n",
    "for source_name in file_list:\n",
    "    file_url = base_url + source_name\n",
    "    parsed_url = parse.urlparse(file_url)\n",
    "    file_name = Path(parsed_url.path).name\n",
    "    file_path = local_path / file_name\n",
    "    # Issue the request before opening the target file, and fail fast on HTTP\n",
    "    # errors, so an error page is never saved as CSV data.\n",
    "    with requests.get(file_url, stream=True, timeout=30) as r:\n",
    "        r.raise_for_status()\n",
    "        with file_path.open(\"wb\") as f:\n",
    "            # iter_content() defaults to 1-byte chunks; stream in larger blocks.\n",
    "            for chunk in r.iter_content(chunk_size=8192):\n",
    "                f.write(chunk)\n",
    "    logger.info(f\"Retrieved {file_url}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Upload to S3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload the local data directory; the returned S3 URI is the base for the per-file URIs below.\n",
    "data_uri_prefix = s3_uploader.upload(local_path.as_posix(), f\"s3://{bucket}/{prefix}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "claims_uri = data_uri_prefix + \"/claims.csv\"\n",
    "customers_uri = data_uri_prefix + \"/customers.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the URIs with IPython's %store magic.\n",
    "%store claims_uri\n",
    "%store customers_uri"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (Data Science)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:ap-southeast-1:492261229750:image/datascience-1.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer":
"ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 4 }