import argparse
import ast
import json
import logging
import os
import pathlib
import warnings

import boto3
import joblib
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from hts import HTSRegressor

warnings.filterwarnings("ignore")

logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

s3 = boto3.client("s3")


# Aggregate data by region and Quantity
def get_region_columns(df, region):
    """Return the column names of ``df`` that contain ``region``.

    NOTE: this is a plain substring test (``region in col``), so a region
    whose name is contained in another column name (e.g. "West" and
    "South_West Virginia") will match that column as well. Callers that
    need an exact parent/child relation should use the hierarchy returned
    by ``prepare_data`` instead.

    Args:
        df: DataFrame whose column labels are searched.
        region: Substring to look for in each column label.

    Returns:
        List of matching column labels, in ``df.columns`` order.
    """
    return [col for col in df.columns if region in col]


def prepare_data(df_raw, item="mens_clothing"):
    """Build the hierarchy and the wide daily table for one item.

    The rows of ``df_raw`` whose ``item`` column equals ``item`` are pivoted
    into one column per ``"<region>_<state>"`` pair (values taken from
    ``quantity``, indexed by ``date``). One aggregate column per region and
    a ``"total"`` column are then appended, and the table is resampled to
    daily frequency by summation.

    Args:
        df_raw: DataFrame with at least the columns ``item``, ``region``,
            ``state``, ``date`` and ``quantity``.
        item: Value of the ``item`` column to keep. Defaults to
            ``"mens_clothing"``, the value that was previously hard-coded.

    Returns:
        A tuple ``(hierarchy, product_bottom_level, region_states)`` where
        ``hierarchy`` maps each parent node (``"total"`` or a region) to the
        list of its children, ``product_bottom_level`` is the daily wide
        DataFrame, and ``region_states`` is the array of unique
        ``"<region>_<state>"`` labels.

    Raises:
        ValueError: If no row of ``df_raw`` matches ``item``.
    """
    print('******************* Prepare Data **********************')

    # Copy the selection so the helper column added below is written to an
    # independent frame rather than to a slice of df_raw.
    product = df_raw[df_raw["item"] == item].copy()
    print(product.head())
    if product.empty:
        raise ValueError(f"no rows found for item {item!r}")

    # Vectorised equivalent of f"{region}_{state}" evaluated per row.
    product["region_state"] = (
        product["region"].astype(str) + "_" + product["state"].astype(str)
    )
    region_states = product["region_state"].unique()

    # (region, region_state) edges; groupby returns its keys sorted.
    grouped_sections = product.groupby(["region", "region_state"])
    edges_hierarchy = list(grouped_sections.groups.keys())

    # "total" is the root node and every region hangs directly below it.
    regions = product["region"].unique().tolist()
    root_node = "total"
    root_edges = [(root_node, region) for region in regions]
    root_edges += edges_hierarchy

    # The hierarchy is a dictionary in which each key is a column (node) and
    # the value is the list of that node's children.
    hierarchy = {}
    for parent, child in root_edges:
        hierarchy.setdefault(parent, []).append(child)

    product_bottom_level = product.pivot(
        index="date", columns="region_state", values="quantity"
    )

    # Sum exactly the children recorded in the hierarchy. Matching columns by
    # substring would also pick up columns of other regions (or previously
    # added region aggregates) whose name merely contains this region's name.
    for region in regions:
        product_bottom_level[region] = product_bottom_level[hierarchy[region]].sum(axis=1)
    product_bottom_level["total"] = product_bottom_level[regions].sum(axis=1)

    product_bottom_level.index = pd.to_datetime(product_bottom_level.index)
    product_bottom_level = product_bottom_level.resample("D").sum()

    print('******************* End Prepare Data **********************')
    return hierarchy, product_bottom_level, region_states