{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "ffd6067e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: You are using pip version 21.1.2; however, version 21.1.3 is available.\n", "You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "# Use the %pip magic (not !pip) so the package is installed into the running kernel's environment.\n", "%pip install varname --quiet" ] }, { "cell_type": "code", "execution_count": 2, "id": "7ee1750d", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from varname import nameof" ] }, { "cell_type": "markdown", "id": "5fe951a6", "metadata": {}, "source": [ "## Download the original sample dataset" ] }, { "cell_type": "code", "execution_count": 3, "id": "7606296b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "download: s3://sagemaker-sample-files/datasets/tabular/synthetic/churn.txt to ./churn.txt\n" ] } ], "source": [ "!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/synthetic/churn.txt ./" ] }, { "cell_type": "code", "execution_count": 4, "id": "fa03f4fc", "metadata": {}, "outputs": [], "source": [ "churn = pd.read_csv(\"./churn.txt\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "49fc099a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StateAccount LengthArea CodePhoneInt'l PlanVMail PlanVMail MessageDay MinsDay CallsDay Charge...Eve CallsEve ChargeNight MinsNight CallsNight ChargeIntl MinsIntl CallsIntl ChargeCustServ CallsChurn?
0PA163806403-2562noyes3008.16220437.579174...46.5086394.0657591005.1116244.92816065.6732033True.
1SC15836158-8416yesno010.01899344.226289...09.9725927.1410402006.4361883.22174862.5597498False.
2MO131777896-6253noyes3004.70849034.768160...34.5667155.3632351005.1424517.13902326.2541574False.
3WY75878817-5729yesyes7001.26873432.567642...52.3336243.7735864503.8144132.24577961.0806926False.
4WY146878450-4942yesno02.69617735.908916...33.6704083.7516732502.7968126.90554547.1343436True.
\n", "

5 rows × 21 columns

\n", "
" ], "text/plain": [ " State Account Length Area Code Phone Int'l Plan VMail Plan \\\n", "0 PA 163 806 403-2562 no yes \n", "1 SC 15 836 158-8416 yes no \n", "2 MO 131 777 896-6253 no yes \n", "3 WY 75 878 817-5729 yes yes \n", "4 WY 146 878 450-4942 yes no \n", "\n", " VMail Message Day Mins Day Calls Day Charge ... Eve Calls \\\n", "0 300 8.162204 3 7.579174 ... 4 \n", "1 0 10.018993 4 4.226289 ... 0 \n", "2 300 4.708490 3 4.768160 ... 3 \n", "3 700 1.268734 3 2.567642 ... 5 \n", "4 0 2.696177 3 5.908916 ... 3 \n", "\n", " Eve Charge Night Mins Night Calls Night Charge Intl Mins Intl Calls \\\n", "0 6.508639 4.065759 100 5.111624 4.928160 6 \n", "1 9.972592 7.141040 200 6.436188 3.221748 6 \n", "2 4.566715 5.363235 100 5.142451 7.139023 2 \n", "3 2.333624 3.773586 450 3.814413 2.245779 6 \n", "4 3.670408 3.751673 250 2.796812 6.905545 4 \n", "\n", " Intl Charge CustServ Calls Churn? \n", "0 5.673203 3 True. \n", "1 2.559749 8 False. \n", "2 6.254157 4 False. \n", "3 1.080692 6 False. \n", "4 7.134343 6 True. 
\n", "\n", "[5 rows x 21 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "churn.head()" ] }, { "cell_type": "markdown", "id": "4c99f447", "metadata": {}, "source": [ "## Split the dataset into training, inference and holdout subsets" ] }, { "cell_type": "code", "execution_count": 6, "id": "82b8c5cf", "metadata": {}, "outputs": [], "source": [ "total_len = len(churn)" ] }, { "cell_type": "code", "execution_count": 7, "id": "06735d24", "metadata": {}, "outputs": [], "source": [ "#ratio: 0.6 training, 0.2 inference, 0.2 holdout\n", "training = churn.iloc[:int(total_len*0.6)]\n", "inference = churn.iloc[int(total_len*0.6):int(total_len*0.8)]\n", "holdout = churn.iloc[int(total_len*0.8):]" ] }, { "cell_type": "code", "execution_count": 8, "id": "10ffe614", "metadata": {}, "outputs": [], "source": [ "#filter by State\n", "states = ['MD','RI']" ] }, { "cell_type": "code", "execution_count": 9, "id": "371ae83f", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "# Write one CSV per (split, state) to ./data/<split>/raw/<state>/<state>.csv.\n", "# The split name is paired with its frame explicitly: calling nameof() inside the\n", "# loop cannot recover which of training/inference/holdout is being processed.\n", "datasets = {\"training\": training, \"inference\": inference, \"holdout\": holdout}\n", "\n", "for name, dataset in datasets.items():\n", "    for state in states:\n", "        out_dir = f\"./data/{name}/raw/{state}\"\n", "        # Create the target folder first so to_csv has an existing directory to write into.\n", "        os.makedirs(out_dir, exist_ok=True)\n", "        dataset[dataset.State == state].to_csv(f\"{out_dir}/{state}.csv\", index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "conda_python3", "language": "python", "name": "conda_python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" } }, "nbformat": 4, "nbformat_minor": 5 }