<div style="font-size:200%;font-weight:bold">How to use <code>smallmatter.ds.cdf()</code></div><br>

This notebook shows how to use the `smallmatter.ds.cdf()` function to quickly inspect and visualize the relationship
among categorical columns: whether it is 1:1 or 1:*N*, and, in the latter case, how *N* is distributed.

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
from IPython.display import display, Markdown
from typing import Optional

from smallmatter.ds import cdf

# Utility functions

Let's define several helper functions for the remainder of this notebook.

In [None]:
def head_cdf_results(*args):
    """Render the first rows of every given cdf result.

    Each positional argument is previewed with its ``head()`` method and the
    preview is handed to ``display()``, in the order the arguments were given.

    Args:
        *args: dataframes
    """
    # The generator is lazy, so each head() is computed right before it is shown.
    previews = (frame.head() for frame in args)
    for preview in previews:
        display(preview)


def synthetic_df(np_seed: Optional[int] = None) -> pd.DataFrame:
    """Get a dataframe where each random SKU has 1+ uoms, stock rooms, and replenishment dates.

    The frame is built in stages: (sku, uom) pairs, then
    (sku, uom, stock_room) triples, then a weekly replenishment date (over
    2020) for a random sample of those triples, then a random quantity per
    row. Finally, 35% of the rows are dropped at random.

    Every column -- ``qty`` included -- holds strings, because the rows are
    assembled in a NumPy unicode array before being wrapped in a dataframe.

    Args:
        np_seed (Optional[int]): when given an ``int`` (``0`` included), seed
            NumPy's global random generator with this number so the result is
            reproducible (default: None, i.e. the generator is not re-seeded).

    Returns:
        pd.DataFrame: a dataframe with these columns:
            ``['sku', 'uom', 'stock_room', 'replen_date', 'qty']``.
    """
    # Fix random seed (if requested). Compare against None explicitly: a plain
    # truthiness test would silently ignore the perfectly valid seed 0.
    if np_seed is not None:
        np.random.seed(np_seed)

    # Pre-defined settings: counts & ratios
    sku_cnt = 100
    uoms = ['box', 'case', 'pack', 'each']
    stock_room_cnt = 8
    uom_ratios = (1.0, 0.4, 0.2, 0.05)
    stock_room_ratios = (1.0, 0.5, 0.3, 0.3, 0.1, 0.1, 0.5, 0.1)

    # Generate sku and stock_room.
    skus = [f'item-{i:02d}' for i in range(sku_cnt)]
    stock_rooms = [f'stock-room-{i:02d}' for i in range(stock_room_cnt)]

    # Generate sku x uom: each uom is assigned to a random fraction of the skus.
    chunks = []
    for uom, ratio in zip(uoms, uom_ratios):
        rows = int(ratio * sku_cnt)
        chunk = np.empty((rows, 2), dtype="U25")
        chunk[:, 0] = np.random.choice(skus, size=rows, replace=False)
        chunk[:, 1] = uom
        chunks.append(chunk)
    sku_uom = np.vstack(chunks)

    # Generate (sku, uom) x stock_room: each stock room holds a random fraction
    # of the (sku, uom) pairs.
    sku_uom_cnt = sku_uom.shape[0]
    chunks = []
    for stock_room, ratio in zip(stock_rooms, stock_room_ratios):
        rows = int(ratio * sku_uom_cnt)
        chunk = np.empty((rows, 3), dtype="U25")
        sku_uom_indices = np.random.choice(sku_uom_cnt, rows, replace=False)
        chunk[:, :2] = sku_uom[sku_uom_indices]
        chunk[:, 2] = stock_room
        chunks.append(chunk)
    sku_uom_sr = np.vstack(chunks)
    del sku_uom

    # Generate (sku, uom, stock_room) x replen_date: on every weekly date, a
    # random 80% of the triples gets a row. Column 4 (qty) is filled in later.
    chunks = []
    sku_uom_sr_cnt = sku_uom_sr.shape[0]
    rows = int(0.8 * sku_uom_sr_cnt)
    for ts in pd.date_range('2020-01-01', '2020-12-31', freq='W'):
        chunk = np.empty((rows, 5), dtype="U25")
        sku_uom_sr_indices = np.random.choice(sku_uom_sr_cnt, rows, replace=False)
        chunk[:, :3] = sku_uom_sr[sku_uom_sr_indices]
        chunk[:, 3] = str(ts.date())
        chunks.append(chunk)
    records = np.vstack(chunks)
    del sku_uom_sr

    # Add quantity to every (sku, uom, stock_room, replen_date). Draws are
    # truncated to int and floored at 10; assigning into the unicode array
    # stores them as strings.
    records[:, 4] = np.clip(
        np.random.normal(10000, 8000, size=records.shape[0]).astype(int),
        a_min=10,
        a_max=None,
    )

    # Randomly drop 35% to simulate a dataframe that does NOT record qty=0.
    kept = np.random.choice(records.shape[0], int(0.65 * records.shape[0]), replace=False)
    return pd.DataFrame(records[kept], columns=['sku', 'uom', 'stock_room', 'replen_date', 'qty'])

# Generate reports

Run the next cell, then inspect the generated files located under the sub-directory `./reports`.

In [None]:
output_dir = 'reports'

# Create a sample, synthetic dataframe. The seed is fixed so that the tables
# shown below and the saved reports are the same on every
# "Restart Kernel -> Run All".
df = synthetic_df(np_seed=42)

# Generate timeseries-length reports, each timeseries corresponds to (sku, uom).
# The length of a timeseries is its number of distinct replenishment dates.
display(Markdown('**Length of timeseries where timeseries=(sku, uom)**'))
cdf_tslen = cdf(
    df,
    cdf_count_name='sku#uom_cnt',
    by=['sku', 'uom'],
    agg_funcs={'replen_date': 'nunique'},
)
cdf_tslen.save_reports(output_dir)
head_cdf_results(*cdf_tslen.cdf.values())

# Generate reports for each item: distinct stock rooms and uoms per sku.
display(Markdown('---\n**How many sku per stockroom, uom?**'))
cdf_by_sku = cdf(
    df,
    cdf_count_name='sku_cnt',
    by=['sku'],
    agg_funcs={
        'stock_room': 'nunique',
        'uom': 'nunique',
    },
)
cdf_by_sku.save_reports(output_dir)
head_cdf_results(*cdf_by_sku.cdf.values())

# Generate reports for each stock room: distinct skus and uoms per stock room.
# The count name and heading use this dataset's own terms ("stock_room",
# "sku"), matching the '<by>_cnt' naming of the two analyses above.
display(Markdown('---\n**How many stockroom per sku, uom?**'))
cdf_by_sr = cdf(
    df,
    cdf_count_name='stock_room_cnt',
    by=['stock_room'],
    agg_funcs={
        'sku': 'nunique',
        'uom': 'nunique',
    },
)
cdf_by_sr.save_reports(output_dir)
head_cdf_results(*cdf_by_sr.cdf.values())