{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Partition data\n",
    "**This notebook partitions the `orders` data into monthly batches, which the other notebooks in this module use for batch ingestion into the feature store.**\n",
    "\n",
    "**Note:** Please set the kernel to `Python 3 (Data Science)` and the instance type to `ml.t3.medium`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "\n",
    "## Contents\n",
    "\n",
    "1. [Setup](#Setup)\n",
    "1. [Load data](#Load-data)\n",
    "1. [Groupby and partition](#Groupby-and-partition)\n",
    "1. [Copy partitions from local to S3](#Copy-partitions-from-local-to-S3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "\n",
    "import pandas as pd\n",
    "import sagemaker"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Essentials"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Session, bucket and key prefix used when copying the partitions to S3\n",
    "sagemaker_session = sagemaker.Session()\n",
    "default_bucket = sagemaker_session.default_bucket()\n",
    "prefix = 'sagemaker-feature-store'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Read `orders` data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('../data/raw/orders.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Derive the `year_month` partition key\n",
    "\n",
    "Each order is tagged with the year and month of `purchased_on`, joined by a hyphen (for example `2020-1`; the month is not zero-padded)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the column once instead of calling pd.to_datetime twice per row\n",
    "purchased_on = pd.to_datetime(df['purchased_on'])\n",
    "\n",
    "# Format per timestamp so the key stays '<year>-<month>' without zero padding\n",
    "df['year_month'] = purchased_on.map(lambda ts: f'{ts.year}-{ts.month}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Groupby and partition\n",
    "Group the `orders` dataframe by `year_month` and write each group to its own partition directory as `partition.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped = df.groupby('year_month')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for name, group in grouped:\n",
    "    partition_dir = f'../data/partitions/{name}'\n",
    "    # Recreate the directory so a re-run never mixes old and new files\n",
    "    if os.path.exists(partition_dir):\n",
    "        shutil.rmtree(partition_dir)\n",
    "    os.makedirs(partition_dir)\n",
    "    # `year_month` is only the grouping key, so it is left out of the CSV\n",
    "    partition_df = group.drop('year_month', axis=1)\n",
    "    partition_df.to_csv(f'{partition_dir}/partition.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Copy partitions from local to S3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copies the local partition tree to s3://<default_bucket>/<prefix>/partitions/\n",
    "!aws s3 cp ../data/partitions/ s3://{default_bucket}/{prefix}/partitions/ --recursive"
   ]
  }
 ],
 "metadata": {
  "instance_type": "ml.t3.medium",
  "kernelspec": {
   "display_name": "Python 3 (Data Science)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}