# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: MIT-0 """Helper utilities for evaluating and comparing forecasts""" # Python Built-Ins: from datetime import datetime import re # External Dependencies: import pandas as pd def filter_to_period( df: pd.DataFrame, period_start: datetime, period_end: datetime, timestamp_col_name: str = "timestamp", ) -> pd.DataFrame: """Filter a DataFrame to a particular date/time range (including start, excluding end) Handles string, datetime, or period type timestamp columns, and checks that the result is not empty. """ n_raw = len(df) if timestamp_col_name not in df.columns: raise ValueError(f"Timestamp column '{timestamp_col_name}' not found in DataFrame") timestamp_dtype = df[timestamp_col_name].dtype if pd.api.types.is_datetime64_any_dtype(timestamp_dtype): # DataFrame timestamp column is already parsed as datetimes result = df[ (df[timestamp_col_name] >= period_start) & (df[timestamp_col_name] < period_end) ] elif pd.api.types.is_period_dtype(timestamp_dtype): # DataFrame timestamp column is a 'period' which works a bit differently in Pandas: start = pd.Period(period_start, freq=timestamp_dtype.freq) end = pd.Period(period_end, freq=timestamp_dtype.freq) result = df[(df[timestamp_col_name] >= start) & (df[timestamp_col_name] < end)] else: # Assume DF timestamp column is strings, but need to check format N_TS_FORMAT_SAMPLES = 5 # TODO: Ignore null values ts_tests = df[timestamp_col_name].iloc[:N_TS_FORMAT_SAMPLES] ts_tests_lens = tuple(len(txt) for txt in ts_tests) if len(set(ts_tests_lens)) != 1: raise ValueError( "Sampled '%s' values from dataframe have inconsistent length: Cannot infer " "date/time format (min %s to max %s)" % ( timestamp_col_name, min(ts_tests_lens), max(ts_tests_lens), ) ) if all(map(lambda txt: re.match("\d{4}-\d{2}-\d{2}T", txt), ts_tests)): # ISO format e.g. 2000-01-01T12:00:00 (maybe with timezone) result = df[ (df[timestamp_col_name] >= period_start.isoformat()) & (df[timestamp_col_name] < period_end.isoformat()) ] elif all(map(lambda txt: re.match("\d{4}-\d{2}-\d{2} ", txt), ts_tests)): # ISO-like format with space instead of T e.g. 2000-01-01 12:00:00 # ISO format e.g. 2000-01-01T12:00:00 (maybe with timezone) result = df[ (df[timestamp_col_name] >= period_start.isoformat().replace("T", " ")) & (df[timestamp_col_name] < period_end.isoformat().replace("T", " ")) ] elif (ts_tests_lens[0] < 11) and all( map(lambda txt: re.match("\d{4}(-\d{2}){0,2}", txt), ts_tests) ): # Truncated ISO-like date e.g. just YYYY-MM or even YYYY. start_trunc = period_start.isoformat()[: ts_tests_lens[0]] end_trunc = period_end.isoformat()[: ts_tests_lens[0]] print( "DataFrame has truncated timestamps: Filtering from %s to %s" % (start_trunc, end_trunc) ) result = df[ (df[timestamp_col_name] >= start_trunc) & (df[timestamp_col_name] < end_trunc) ] else: raise ValueError( "Couldn't infer date/time format of column '%s' for filtering. Got samples: %s" % (timestamp_col_name, ts_tests) ) n_result = len(result) if n_result == 0: raise ValueError( "Filtering dataframe by %s <= [%s] < %s got no results. Check your filter expression" % (period_start, timestamp_col_name, period_end) ) print(f"Time period filter kept {n_result} out of {n_raw} records") return result.copy()