{ "cells": [ { "cell_type": "markdown", "id": "3d4fab9e", "metadata": {}, "source": [ "# Notebook for updating dataset timestamps\n", "Amazon Fraud Detector only retains 18 months of data for ingested events. This notebook provides functions to shift dataset timestamps to the most recent months. " ] }, { "cell_type": "code", "execution_count": 1, "id": "e3f51928", "metadata": { "ExecuteTime": { "end_time": "2023-03-07T15:05:13.794536Z", "start_time": "2023-03-07T15:05:13.422274Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "from datetime import datetime, timezone, timedelta\n", "import glob\n", "import zipfile\n", "import os\n", "import glob\n", "\n", "def update_timestamp(file):\n", " # Input: \n", " # file: file_path to csv \n", "\n", " df = pd.read_csv(file,\n", " dtype='object',\n", " keep_default_na=False,\n", " na_values='')\n", "\n", " df['EVENT_TIMESTAMP'] = pd.to_datetime(df['EVENT_TIMESTAMP'])\n", " min_dt = min(df['EVENT_TIMESTAMP']).replace(tzinfo=None)\n", " max_dt = max(df['EVENT_TIMESTAMP']).replace(tzinfo=None)\n", " \n", " if 'LABEL_TIMESTAMP' in df.columns:\n", " df['LABEL_TIMESTAMP'] = pd.to_datetime(df['LABEL_TIMESTAMP'])\n", " min_dt = min(min_dt, df['LABEL_TIMESTAMP'].min().replace(tzinfo=None))\n", " max_dt = max(max_dt, df['LABEL_TIMESTAMP'].max().replace(tzinfo=None))\n", " \n", " print('Orignal dates')\n", " print(min_dt, max_dt)\n", " \n", " tz_info = max_dt.tzinfo\n", "\n", " assert max_dt-min_dt