# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
"""Helper utilities for analysing synthetic datasets during generation"""
# Python Built-Ins:
from typing import Dict, List, Optional, Union

# External Dependencies:
from matplotlib import pyplot as plt
import pandas as pd


def agg_df_features(
    df: pd.DataFrame,
    config: Dict[str, Dict[str, Union[str, List[str]]]],
    drop_aggregated: bool = True,
) -> None:
    """Aggregate features of a DataFrame (in-place) by string concatenation

    Parameters
    ----------
    df :
        The DataFrame to be modified *IN PLACE*
    config :
        A dictionary of configurations keyed by target (aggregated) field name. Each
        aggregation config is itself a dict containing: `features` (the list of input column
        names) and optionally `sep` (the string separator to be used when concatenating
        fields).
    drop_aggregated :
        Set `False` to keep the affected original features in the DataFrame. By default
        (`True`), these will be dropped so only the aggregated features (and any columns not
        affected by aggregation) will remain.
    """
    aggregated_features = set()
    for aggname, cfg in config.items():
        df_col_names = cfg["features"]
        if len(df_col_names) > 1:
            lead_col = df[df_col_names[0]]
            aggd_col = lead_col.str.cat(df[df_col_names[1:]], sep=cfg.get("sep"))
            df.loc[:, aggname] = aggd_col
        else:
            df.loc[:, aggname] = df[df_col_names[0]]
        aggregated_features.update(df_col_names)
    if drop_aggregated:
        df.drop(columns=aggregated_features, inplace=True)
    return  # Return None to clarify that modification is in-place.


def log_log_plot(
    value_counts: pd.Series,
    xlabel: Optional[str] = None,
    ylabel: Optional[str] = None,
    title: Optional[str] = None,
    xnorm: bool = False,
    **kwargs,
) -> plt.Axes:
    """Generate a log-log frequency analysis

    These log-log plots help you answer questions like "how much of my dataset is at least
    [Y]". For example, "how many of my items have at least N days with sales" or "how many
    of my items sold at least N units". These are useful for characterising *sparsity* in
    datasets - to understand what proportion of items/groups meets the typical data density
    for a model to work well.

    The output is a line chart with negative slope, proceeding from
    ({total number of items in the dataset}, {minimum value}) to
    ({number of items that exactly equal the maximum value}, {maximum value}).

    Parameters
    ----------
    value_counts :
        pandas `value_counts` result for the underlying list you want to characterize.
    xlabel :
        Optional X axis label for the chart
    ylabel :
        Optional Y axis label for the chart
    title :
        Optional title for the chart
    xnorm :
        Set `True` to display the X axis as a 0-1 proportion of the dataset, which is useful
        for comparing datasets of different sizes. Default `False` displays the absolute
        number of items.
    **kwargs :
        Any additional keyword args are passed through to the pyplot `plot()` function.

    Returns
    -------
    ax :
        pyplot Axes for the generated graph.
    """
    # Produce a reverse-sorted index of values (e.g. total sales, record counts), mapped to
    # the cumulative number of items meeting or exceeding each one:
    counts_cumsum = value_counts.sort_index(ascending=False).cumsum()
    if xnorm:
        counts_cumsum /= max(counts_cumsum)
    ax = plt.gca()
    ax.plot(counts_cumsum, counts_cumsum.index, **kwargs)
    ax.set_xscale("log")
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    ax.set_yscale("log")
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    if title is not None:
        ax.set_title(title)
    ax.grid(axis="both", which="minor")
    return ax
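

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): a minimal
# demo of both helpers. The column names (`city`, `state`, `units_sold`) and
# all values below are hypothetical, chosen purely to show the call patterns.
if __name__ == "__main__":
    demo_df = pd.DataFrame(
        {
            "city": ["Seattle", "Boston", "Seattle"],
            "state": ["WA", "MA", "WA"],
            "units_sold": [3, 1, 3],
        }
    )

    # Concatenate `city` and `state` into a single `location` feature; the
    # original columns are dropped because `drop_aggregated` defaults to True:
    agg_df_features(demo_df, {"location": {"features": ["city", "state"], "sep": ", "}})
    print(demo_df)

    # Characterise the sparsity of `units_sold` with a log-log frequency plot
    # (x axis = number of items, y axis = units sold "at least"):
    log_log_plot(
        demo_df["units_sold"].value_counts(),
        xlabel="Number of items",
        ylabel="Units sold (at least)",
        title="Sales sparsity (demo)",
    )
    plt.show()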