#!/usr/bin/env python3

import os
import pandas as pd
from glob import glob
import argparse

# Log the disk usage of everything under /opt/ml so the container's file
# layout shows up in the job output (useful when debugging missing inputs).
os.system("du -a /opt/ml")

# Locate the training CSV. glob() returns matches in arbitrary order, so sort
# them to make the choice deterministic when more than one file is present,
# and fail with a descriptive error instead of a bare IndexError when none is.
_TRAIN_CSV_CANDIDATES = sorted(glob("/opt/ml/processing/input_train/*.csv"))
if not _TRAIN_CSV_CANDIDATES:
    raise FileNotFoundError("no CSV file found in /opt/ml/processing/input_train")
SRC_TS = _TRAIN_CSV_CANDIDATES[0]
print(SRC_TS)

# Destinations of the target and related time series written by this script.
DST_TRAIN_TS = "/opt/ml/processing/target/target.csv"
DST_RELATED_TS = "/opt/ml/processing/related/related.csv"


def create_dataframes(forecast_horizon, source_train_ts):
    """Create the target and related dataframe in a suitable format for Amazon Forecast.

    The source CSV is read with its first column as a datetime index, summed
    into hourly buckets and divided by 4, then trimmed to its last 408 hourly
    rows (2 weeks + 3 days). The related time series spans that whole window;
    the target time series stops ``forecast_horizon`` rows before its end.

    Parameters:
        forecast_horizon (int): number of time units you want to forecast; must be >= 0
        source_train_ts (str): location of train.csv; must contain an ``MT_001`` column

    Returns:
        target_df (pd.DataFrame): target dataframe in Forecast format
            (columns: item_id, timestamp, target_value)
        rts_df (pd.DataFrame): related dataframe in Forecast format
            (columns: item_id, timestamp, workingday)

    Raises:
        ValueError: if forecast_horizon is negative
    """
    if forecast_horizon < 0:
        raise ValueError(f"forecast_horizon must be non-negative, got {forecast_horizon}")

    df = pd.read_csv(source_train_ts, index_col=0, parse_dates=True)

    # An explicit one-hour Timedelta is equivalent to the "H" alias but is not
    # affected by pandas' deprecation of the upper-case frequency aliases.
    # NOTE(review): the division by 4 presumably averages four 15-minute
    # readings per hour -- confirm against the source data.
    df = df.resample(pd.Timedelta(hours=1)).sum() / 4

    # Name the index before resetting it so the resulting column is called
    # "timestamp" whether or not the CSV gave its index column a header.
    df = df.rename_axis("timestamp").reset_index()
    df = df.rename(columns={"MT_001": "target_value"})

    # Use roughly 2.5 weeks (2 weeks + 3 days) of hourly data to train Amazon
    # Forecast. This is to save costs in generating the forecast.
    window_hours = 2 * 7 * 24 + 3 * 24
    df = df.iloc[-window_hours:].copy()

    df["target_value"] = df["target_value"].astype("float")
    # Monday-Friday (weekday 0-4) -> 1.0, Saturday/Sunday -> 0.0.
    df["workingday"] = (df["timestamp"].dt.weekday < 5).astype("float")
    df["item_id"] = "client_1"

    # Slice by an explicit row count: ``[:-forecast_horizon]`` would yield an
    # empty frame for a horizon of 0. Clamp at 0 so a horizon longer than the
    # window still produces an empty target rather than wrapping around.
    n_target_rows = max(len(df) - forecast_horizon, 0)
    target_df = df[["item_id", "timestamp", "target_value"]].iloc[:n_target_rows]
    rts_df = df[["item_id", "timestamp", "workingday"]]

    return target_df, rts_df


if __name__ == "__main__":
    # Script entry point: build the Forecast-formatted dataframes from the
    # training CSV, validate them, and write both to their output locations.
    parser = argparse.ArgumentParser()
    # Parse the horizon as an integer directly and require it, so a missing or
    # non-numeric value is reported by argparse rather than failing later.
    parser.add_argument("--forecast_horizon", type=int, required=True)
    args = parser.parse_args()

    forecast_horizon = args.forecast_horizon
    target_df, rts_df = create_dataframes(forecast_horizon, SRC_TS)

    print(f"{len(target_df)} + {forecast_horizon} = {len(rts_df)}")

    # Check equivalent lengths of dataframes. If no equivalence, a predictor cannot be created.
    # Explicit raises are used instead of assert so the checks survive `python -O`.
    if len(target_df) + forecast_horizon != len(rts_df):
        raise ValueError("length doesn't match")

    # Check that the related timeseries is not missing entries. If it is, a predictor cannot be created.
    # The expected index is every hour between the first and last timestamp, inclusive.
    expected_timestamps = pd.date_range(
        start=rts_df["timestamp"].iloc[0],
        end=rts_df["timestamp"].iloc[-1],
        freq=pd.Timedelta(hours=1),
    )
    if len(rts_df) != len(expected_timestamps):
        raise ValueError("missing entries in the related time series")

    # Make sure the output directories exist before writing into them.
    for destination in (DST_TRAIN_TS, DST_RELATED_TS):
        os.makedirs(os.path.dirname(destination), exist_ok=True)

    # Writing both dataframes to a csv file (no header row, no index column).
    target_df.to_csv(
        path_or_buf=DST_TRAIN_TS,
        header=False,
        index=False,
    )

    rts_df.to_csv(
        path_or_buf=DST_RELATED_TS,
        header=False,
        index=False,
    )