# Notebook for updating dataset timestamps
Amazon Fraud Detector only retains 18 months of data for ingested events. This notebook provides a function that shifts dataset timestamps forward so that they fall within the most recent months.

In [1]:
import pandas as pd
from datetime import datetime, timezone, timedelta
import glob
import zipfile
import os
import glob

def update_timestamp(file):
    """Shift the timestamps of a CSV file so the newest one is one day before now (UTC).

    The file is rewritten in place. ``EVENT_TIMESTAMP`` (required) and
    ``LABEL_TIMESTAMP`` (optional) are moved forward by the same offset, so the
    spacing between rows and between the two columns is preserved, and are
    written back as ``%Y-%m-%dT%H:%M:%SZ`` strings.

    Timestamps are interpreted in UTC: offset-aware values are converted to UTC
    and values without an offset are taken to already be UTC, which matches the
    ``Z`` suffix used on output.

    Input:
        file: file_path to csv

    Returns:
        'SUCCESS' once the file has been rewritten.

    Raises:
        KeyError: the CSV has no ``EVENT_TIMESTAMP`` column.
        ValueError: no timestamp in the file could be used.
        AssertionError: the data spans 547 days or more.
    """
    # Longest span (in days) the dataset may cover; kept from the original check.
    max_span_days = 547

    # Every column is read as text; empty cells become missing values.
    df = pd.read_csv(file,
                dtype='object',
                keep_default_na=False,
                na_values='')

    timestamp_columns = ['EVENT_TIMESTAMP']
    if 'LABEL_TIMESTAMP' in df.columns:
        timestamp_columns.append('LABEL_TIMESTAMP')

    # utc=True normalises everything to UTC so the 'Z' written below is truthful.
    for column in timestamp_columns:
        df[column] = pd.to_datetime(df[column], utc=True)

    # Series.min()/max() skip missing values (e.g. rows without a label timestamp).
    lows = [df[column].min() for column in timestamp_columns]
    highs = [df[column].max() for column in timestamp_columns]
    lows = [value for value in lows if not pd.isna(value)]
    highs = [value for value in highs if not pd.isna(value)]
    if not lows or not highs:
        raise ValueError('no valid timestamps found in {}'.format(file))
    min_dt = min(lows)
    max_dt = max(highs)

    print('Orignal dates')
    print(min_dt.tz_localize(None), max_dt.tz_localize(None))

    assert max_dt - min_dt < timedelta(days=max_span_days), \
        'timestamps span {} days or more'.format(max_span_days)

    # Compare against the current time in UTC (not the machine's local clock) so
    # the newest timestamp lands exactly one day before "now" in UTC.
    time_diff = datetime.now(timezone.utc) - max_dt - timedelta(days=1)

    df['EVENT_TIMESTAMP'] = df['EVENT_TIMESTAMP'] + time_diff
    print('Updated dates')
    print(df['EVENT_TIMESTAMP'].min(), df['EVENT_TIMESTAMP'].max())
    if 'LABEL_TIMESTAMP' in df.columns:
        df['LABEL_TIMESTAMP'] = df['LABEL_TIMESTAMP'] + time_diff
        print(df['LABEL_TIMESTAMP'].min(), df['LABEL_TIMESTAMP'].max())

    # Sub-second precision introduced by the shift is dropped by this format.
    for column in timestamp_columns:
        df[column] = df[column].dt.strftime("%Y-%m-%dT%H:%M:%SZ")

    df.to_csv(file,index=False)
    return 'SUCCESS'



### Under data folder

In [2]:
# Plain CSV files in the data folder are updated in place.
for csv_name in ('registration_data_20K_full.csv',
                 'registration_data_20K_minimum.csv',
                 'transaction_data_100K_full.csv'):
    update_timestamp(csv_name)

# The ATO dataset is stored zipped: extract it, update the CSV, re-zip it,
# then remove the extracted copy.
with zipfile.ZipFile("ato_data_800K_full.csv.zip","r") as zip_ref:
    zip_ref.extractall(".")
update_timestamp('ato_data_800K_full.csv')
# The context manager closes the archive before the source CSV is deleted;
# an unclosed ZipFile is not guaranteed to have written its final records.
with zipfile.ZipFile('ato_data_800K_full.csv.zip', mode='w') as zip_out:
    zip_out.write("ato_data_800K_full.csv", compress_type=zipfile.ZIP_DEFLATED)
os.remove('ato_data_800K_full.csv')

Orignal dates
2021-09-01 13:05:39 2022-09-01 17:51:02
Updated dates
2022-03-06 10:19:54.806105+00:00 2023-03-06 15:05:17.806105+00:00
Orignal dates
2021-09-01 12:38:19 2022-09-01 17:51:19
Updated dates
2022-03-06 09:52:18.135153+00:00 2023-03-06 15:05:18.135153+00:00
Orignal dates
2022-05-05 21:51:22 2022-09-01 21:51:22
Updated dates
2022-11-07 15:05:19.543791+00:00 2023-03-06 15:05:19.543791+00:00
2022-11-07 15:05:19.543791+00:00 2023-03-06 15:05:19.543791+00:00
Orignal dates
2022-03-04 07:30:48 2022-09-01 21:51:47
Updated dates
2022-09-06 00:44:29.041042+00:00 2023-03-06 00:43:47.041042+00:00
2022-09-09 10:00:42.041042+00:00 2023-03-06 15:05:28.041042+00:00


### Under demo_scripts/data folder

In [3]:
# glob returns paths in arbitrary order; sort so files are processed (and logged)
# in the same order on every run.
files = sorted(glob.glob('../demo_scripts/data/*'))

In [4]:
# Shift the timestamps of every CSV found in the demo data folder,
# printing the path before and a separator after each file.
csv_paths = [path for path in files if path.endswith('.csv')]
for csv_path in csv_paths:
    print(csv_path)
    update_timestamp(csv_path)
    print('====')

../demo_scripts/data/Abuse_FreeTrialReferralAbuse_100k.csv
Orignal dates
2021-10-05 11:42:36 2022-10-05 17:19:26
Updated dates
2022-03-06 09:28:54.710904+00:00 2023-03-06 15:02:05.710904+00:00
2023-03-06 15:05:44.710904+00:00 2023-03-06 15:05:44.710904+00:00
====
../demo_scripts/data/Advertisement_AdClickFraud_20k.csv
Orignal dates
2021-10-05 12:16:42 2022-10-05 17:19:29
Updated dates
2022-03-06 10:02:59.665429+00:00 2023-03-06 14:25:51.665429+00:00
2023-03-06 15:05:46.665429+00:00 2023-03-06 15:05:46.665429+00:00
====
../demo_scripts/data/ContentModeration_FakeReviews_100k.csv
Orignal dates
2021-10-05 11:33:49 2022-10-05 17:19:29
Updated dates
2022-03-06 09:20:07.627030+00:00 2023-03-06 15:00:48.627030+00:00
2023-03-06 15:05:47.627030+00:00 2023-03-06 15:05:47.627030+00:00
====
../demo_scripts/data/Insurance_FraudulentAutoInsuranceClaims_100k.csv
Orignal dates
2021-10-05 11:43:38 2022-10-05 17:19:31
Updated dates
2022-03-06 09:29:57.185178+00:00 2023-03-06 15:03:23.185178+00:00
2023-0