{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div style=\"font-size:200%;font-weight:bold\">How to use <code>smallmatter.ds.cdf()</code></div><br>\n",
    "\n",
    "This notebook shows how to use the `smallmatter.ds.cdf()` function to quickly inspect and visualize the relationship\n",
    "among categorical columns: is it 1:1, 1:*N*, and the distribution of *N*."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "%config InlineBackend.figure_format = 'retina'\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from IPython.display import display, Markdown\n",
    "from typing import Optional\n",
    "\n",
    "from smallmatter.ds import cdf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Utility functions\n",
    "\n",
    "Let's define several helper functions for the remainder of this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def head_cdf_results(*args):\n",
    "    \"\"\"Show the head of cdf analysis.\n",
    "\n",
    "    Args:\n",
    "        *args: dataframes\n",
    "    \"\"\"\n",
    "    for cdf_df in args:\n",
    "        display(cdf_df.head())\n",
    "\n",
    "\n",
    "def synthetic_df(np_seed: Optional[int] = None) -> pd.DataFrame:\n",
    "    \"\"\"Get a dataframe where each random SKU has 1+ uoms, stock rooms, and replenishment dates.\n",
    "\n",
    "    Args:\n",
    "        np_seed (Optional[int]): when given an ``int``, fix random seed to this\n",
    "        number (default: None).\n",
    "\n",
    "    Returns:\n",
    "        pd.DataFrame: a dataframe with these columns:\n",
    "            ``['sku', 'uom', 'stock_room', 'replen_date', qty']``.\n",
    "    \"\"\"\n",
    "    # Fix random seed (if requested)\n",
    "    if np_seed:\n",
    "        np.random.seed(np_seed)\n",
    "\n",
    "    # Pre-defined settings: counts & ratios\n",
    "    sku_cnt = 100\n",
    "    uoms = ['box', 'case', 'pack', 'each']\n",
    "    stock_room_cnt = 8\n",
    "    uom_ratios = (1.0, 0.4, 0.2, 0.05)\n",
    "    stock_room_ratios = (1.0, 0.5, 0.3, 0.3, 0.1, 0.1, 0.5, 0.1)\n",
    "\n",
    "    # Generate sku and stock_room.\n",
    "    skus = [f'item-{i:02d}' for i in range(sku_cnt)]\n",
    "    stock_rooms = [f'stock-room-{i:02d}' for i in range(stock_room_cnt)]\n",
    "\n",
    "    # Generate sku x uom.\n",
    "    arrs = []\n",
    "    for uom, ratio in zip(uoms, uom_ratios):\n",
    "        rows = int(ratio * sku_cnt)\n",
    "        a = np.empty((rows, 2), dtype=\"U25\")\n",
    "        a[:, 0] = np.random.choice(skus, size=rows, replace=False)\n",
    "        a[:, 1] = uom\n",
    "        arrs.append(a)\n",
    "    a = np.vstack(arrs)\n",
    "\n",
    "    # Generate (sku, uom) x stock_room.\n",
    "    sku_uom_cnt = a.shape[0]\n",
    "    arrs = []\n",
    "    for stock_room, ratio in zip(stock_rooms, stock_room_ratios):\n",
    "        rows = int(ratio * sku_uom_cnt)\n",
    "        b = np.empty((rows, 3), dtype=\"U25\")\n",
    "        sku_uom_indices = np.random.choice(sku_uom_cnt, rows, replace=False)\n",
    "        b[:, :2] = a[sku_uom_indices]\n",
    "        b[:, 2] = stock_room\n",
    "        arrs.append(b)\n",
    "    b = np.vstack(arrs)\n",
    "    del a\n",
    "\n",
    "    # Generate (sku, uom, stock_room) x replen_date\n",
    "    arrs = []\n",
    "    sku_uom_sr_cnt = b.shape[0]\n",
    "    rows = int(0.8 * sku_uom_sr_cnt)\n",
    "    for ts in pd.date_range('2020-01-01', '2020-12-31', freq='W'):\n",
    "        c = np.empty((rows, 5), dtype=\"U25\")\n",
    "        sku_uom_sr_indices = np.random.choice(sku_uom_sr_cnt, rows, replace=False)\n",
    "        c[:, :3] = b[sku_uom_sr_indices]\n",
    "        c[:, 3] = str(ts.date())\n",
    "        arrs.append(c)\n",
    "    c = np.vstack(arrs)\n",
    "    del b\n",
    "\n",
    "    # Add quantity to every (sku, uom, stock_room, replen_date).\n",
    "    c[:, 4] = np.clip(\n",
    "        np.random.normal(10000, 8000, size=c.shape[0]).astype(int),\n",
    "        a_min=10,\n",
    "        a_max=None,\n",
    "    )\n",
    "\n",
    "    # Randomly drop 35% to simulate a dataframe that does NOT record qty=0.\n",
    "    indices = np.random.choice(c.shape[0], int(0.65 * c.shape[0]), replace=False)\n",
    "    return pd.DataFrame(c[indices], columns=['sku', 'uom', 'stock_room', 'replen_date', 'qty'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate reports\n",
    "\n",
    "Run the next cell, then observed the generated files located under sub-directory `./reports`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_dir = 'reports'\n",
    "\n",
    "# Create a sample, synthetic dataframe\n",
    "df = synthetic_df()\n",
    "\n",
    "# Generate timeseries-length reports, each timeseries corresponds to (sku, uom).\n",
    "display(Markdown('**Length of timeseries where timeseries=(sku, uom)**'))\n",
    "cdf_tslen = cdf(\n",
    "    df,\n",
    "    cdf_count_name='sku#uom_cnt',\n",
    "    by=['sku', 'uom'],\n",
    "    agg_funcs={'replen_date': 'nunique'},\n",
    ")\n",
    "cdf_tslen.save_reports(output_dir)\n",
    "head_cdf_results(*cdf_tslen.cdf.values())\n",
    "\n",
    "# Generate reports for each item\n",
    "display(Markdown('---\\n**How many sku per stockroom, uom?**'))\n",
    "cdf_by_sku = cdf(\n",
    "    df,\n",
    "    cdf_count_name='sku_cnt',\n",
    "    by=['sku'],\n",
    "    agg_funcs={\n",
    "        'stock_room': 'nunique',\n",
    "        'uom': 'nunique',\n",
    "    },\n",
    ")\n",
    "cdf_by_sku.save_reports(output_dir)\n",
    "head_cdf_results(*cdf_by_sku.cdf.values())\n",
    "\n",
    "# Generate reports for each stock room\n",
    "display(Markdown('---\\n**How many stockroom per catalog, uom?**'))\n",
    "cdf_by_sr = cdf(\n",
    "    df,\n",
    "    cdf_count_name='cost_center_cnt',\n",
    "    by=['stock_room'],\n",
    "    agg_funcs={\n",
    "        'sku': 'nunique',\n",
    "        'uom': 'nunique',\n",
    "    },\n",
    ")\n",
    "cdf_by_sr.save_reports(output_dir)\n",
    "head_cdf_results(*cdf_by_sr.cdf.values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  },
  "toc-autonumbering": true
 },
 "nbformat": 4,
 "nbformat_minor": 4
}