{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "
# How to use smallmatter.ds.cdf()

\n", "\n", "This notebook shows how to use the `smallmatter.ds.cdf()` function to quickly inspect and visualize the relationship\n", "among categorical columns: is it 1:1, 1:*N*, and the distribution of *N*." ] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "%config InlineBackend.figure_format = 'retina'\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from IPython.display import display, Markdown\n",
    "from typing import Optional\n",
    "\n",
    "from smallmatter.ds import cdf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Utility functions\n",
    "\n",
    "Let's define several helper functions for the remainder of this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def head_cdf_results(*args):\n",
    "    \"\"\"Display the first rows of each cdf analysis result.\n",
    "\n",
    "    Args:\n",
    "        *args: dataframes; ``.head()`` of each one is displayed, in the given order.\n",
    "    \"\"\"\n",
    "    for cdf_df in args:\n",
    "        display(cdf_df.head())\n",
    "\n",
    "\n",
    "def synthetic_df(np_seed: Optional[int] = None) -> pd.DataFrame:\n",
    "    \"\"\"Get a dataframe where each random SKU has 1+ uoms, stock rooms, and replenishment dates.\n",
    "\n",
    "    Args:\n",
    "        np_seed (Optional[int]): when given an ``int`` (``0`` included), fix numpy's global\n",
    "            random seed to this number (default: None, which leaves the seed untouched).\n",
    "\n",
    "    Returns:\n",
    "        pd.DataFrame: a dataframe with these columns:\n",
    "            ``['sku', 'uom', 'stock_room', 'replen_date', 'qty']``. All values are strings,\n",
    "            because the rows are assembled in a numpy ``U25`` array.\n",
    "    \"\"\"\n",
    "    # Fix random seed (if requested). Test against None rather than truthiness,\n",
    "    # otherwise a seed of 0 would be silently ignored.\n",
    "    if np_seed is not None:\n",
    "        np.random.seed(np_seed)\n",
    "\n",
    "    # Pre-defined settings: counts & ratios\n",
    "    sku_cnt = 100\n",
    "    uoms = ['box', 'case', 'pack', 'each']\n",
    "    stock_room_cnt = 8\n",
    "    uom_ratios = (1.0, 0.4, 0.2, 0.05)\n",
    "    stock_room_ratios = (1.0, 0.5, 0.3, 0.3, 0.1, 0.1, 0.5, 0.1)\n",
    "\n",
    "    # Generate sku and stock_room.\n",
    "    skus = [f'item-{i:02d}' for i in range(sku_cnt)]\n",
    "    stock_rooms = [f'stock-room-{i:02d}' for i in range(stock_room_cnt)]\n",
    "\n",
    "    # Generate sku x uom: each uom is assigned to a random subset of skus,\n",
    "    # sized by the uom's ratio.\n",
    "    arrs = []\n",
    "    for uom, ratio in zip(uoms, uom_ratios):\n",
    "        rows = int(ratio * sku_cnt)\n",
    "        a = np.empty((rows, 2), dtype=\"U25\")\n",
    "        a[:, 0] = np.random.choice(skus, size=rows, replace=False)\n",
    "        a[:, 1] = uom\n",
    "        arrs.append(a)\n",
    "    a = np.vstack(arrs)\n",
    "\n",
    "    # Generate (sku, uom) x stock_room: each stock room holds a random subset\n",
    "    # of the (sku, uom) pairs, sized by the stock room's ratio.\n",
    "    sku_uom_cnt = a.shape[0]\n",
    "    arrs = []\n",
    "    for stock_room, ratio in zip(stock_rooms, stock_room_ratios):\n",
    "        rows = int(ratio * sku_uom_cnt)\n",
    "        b = np.empty((rows, 3), dtype=\"U25\")\n",
    "        sku_uom_indices = np.random.choice(sku_uom_cnt, rows, replace=False)\n",
    "        b[:, :2] = a[sku_uom_indices]\n",
    "        b[:, 2] = stock_room\n",
    "        arrs.append(b)\n",
    "    b = np.vstack(arrs)\n",
    "    del a\n",
    "\n",
    "    # Generate (sku, uom, stock_room) x replen_date: every weekly date gets a\n",
    "    # random 80% of the (sku, uom, stock_room) triples.\n",
    "    arrs = []\n",
    "    sku_uom_sr_cnt = b.shape[0]\n",
    "    rows = int(0.8 * sku_uom_sr_cnt)\n",
    "    for ts in pd.date_range('2020-01-01', '2020-12-31', freq='W'):\n",
    "        c = np.empty((rows, 5), dtype=\"U25\")\n",
    "        sku_uom_sr_indices = np.random.choice(sku_uom_sr_cnt, rows, replace=False)\n",
    "        c[:, :3] = b[sku_uom_sr_indices]\n",
    "        c[:, 3] = str(ts.date())\n",
    "        arrs.append(c)\n",
    "    c = np.vstack(arrs)\n",
    "    del b\n",
    "\n",
    "    # Add quantity to every (sku, uom, stock_room, replen_date).\n",
    "    # Quantities are clipped to a minimum of 10.\n",
    "    c[:, 4] = np.clip(\n",
    "        np.random.normal(10000, 8000, size=c.shape[0]).astype(int),\n",
    "        a_min=10,\n",
    "        a_max=None,\n",
    "    )\n",
    "\n",
    "    # Randomly drop 35% to simulate a dataframe that does NOT record qty=0.\n",
    "    indices = np.random.choice(c.shape[0], int(0.65 * c.shape[0]), replace=False)\n",
    "    return pd.DataFrame(c[indices], columns=['sku', 'uom', 'stock_room', 'replen_date', 'qty'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate reports\n",
    "\n",
    "Run the next cell, then observe the generated files located under sub-directory `./reports`."
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_dir = 'reports'\n",
    "\n",
    "# Create a sample, synthetic dataframe. A fixed seed keeps the generated\n",
    "# reports identical across Restart Kernel -> Run All.\n",
    "df = synthetic_df(np_seed=42)\n",
    "\n",
    "# Generate timeseries-length reports, each timeseries corresponds to (sku, uom).\n",
    "display(Markdown('**Length of timeseries where timeseries=(sku, uom)**'))\n",
    "cdf_tslen = cdf(\n",
    "    df,\n",
    "    cdf_count_name='sku#uom_cnt',\n",
    "    by=['sku', 'uom'],\n",
    "    agg_funcs={'replen_date': 'nunique'},\n",
    ")\n",
    "cdf_tslen.save_reports(output_dir)\n",
    "head_cdf_results(*cdf_tslen.cdf.values())\n",
    "\n",
    "# Generate reports for each item\n",
    "display(Markdown('---\\n**How many sku per stockroom, uom?**'))\n",
    "cdf_by_sku = cdf(\n",
    "    df,\n",
    "    cdf_count_name='sku_cnt',\n",
    "    by=['sku'],\n",
    "    agg_funcs={\n",
    "        'stock_room': 'nunique',\n",
    "        'uom': 'nunique',\n",
    "    },\n",
    ")\n",
    "cdf_by_sku.save_reports(output_dir)\n",
    "head_cdf_results(*cdf_by_sku.cdf.values())\n",
    "\n",
    "# Generate reports for each stock room.\n",
    "# The count name follows the '<by>_cnt' pattern used by the calls above.\n",
    "display(Markdown('---\\n**How many stockroom per sku, uom?**'))\n",
    "cdf_by_sr = cdf(\n",
    "    df,\n",
    "    cdf_count_name='stock_room_cnt',\n",
    "    by=['stock_room'],\n",
    "    agg_funcs={\n",
    "        'sku': 'nunique',\n",
    "        'uom': 'nunique',\n",
    "    },\n",
    ")\n",
    "cdf_by_sr.save_reports(output_dir)\n",
    "head_cdf_results(*cdf_by_sr.cdf.values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  },
  "toc-autonumbering": true
 },
 "nbformat": 4,
 "nbformat_minor": 4
}