Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

SPDX-License-Identifier: Apache-2.0


# Prepare Financial Fraud dataset for dynamic graph model (TADDY)

The TADDY model is an anomaly detection model that detects anomalous edges in dynamic (changing over time) graphs. It learns edge embeddings that combine spatial information (neighboring nodes and edges of the graph) with temporal information. A fully connected layer then classifies the embeddings as anomaly/not anomaly.

The model expects graph snapshots with labeled edges, so this notebook prepares the BankSim dataset for the TADDY modeling framework.

## Table of Contents
1. Process raw transaction data
 * Get edge and node list to build graph from the raw transaction data. Each transaction can be represented as an edge sourced from the customer node to the merchant node. 
 * Deduplicate the data. We keep only the most recent transaction for each (customer, merchant) pair, so each pair is classified exactly once, based on its most recent interaction. 
 * Create and save raw node names/ids (str) to node indexes mapping. These indexes will be used to formulate graphs represented as sparse adjacency matrix during training. Namely, the indexes created here will determine their position in the adjacency matrix. Hence, we checked several times in the notebook to make sure the indexes are correctly aligned. 
 * Save the labels for each edge with the correct order. 
 * Train and test graph snapshots split. Earlier snapshots are used for training and later snapshots are used for testing. 
 
2. Save all the processed data 
 * Source nodes of edges are stored as row indexes 
 * Target nodes of edges are stored as col indexes
 * Node indexes of all edges are stored in a sparse matrix (list of list as headtail)

In [None]:
import sys 
import os

In [None]:
# Add the repository's `src` directory to the import search path.
# Presumably this is where the `anomaly_detection_spatial_temporal_data`
# package imported further down lives — verify against the repo layout.
sys.path.append('../../src/')

In [None]:
import pandas as pd
import numpy as np
import pickle

from anomaly_detection_spatial_temporal_data.utils import ensure_directory

# Load raw data

In [None]:
# Location of the raw transaction table (CSV).
raw_data_path = '../../data/01_raw/financial_fraud/bs140513_032310.csv'

# Load every transaction into a DataFrame; later cells read the columns
# step, customer, merchant, category, amount and fraud from it.
raw_trans_data = pd.read_csv(raw_data_path)

# Display (rows, columns) as a quick sanity check on the load.
raw_trans_data.shape

In [None]:
# Location of the second raw CSV shipped with the dataset (the "NET" file).
raw_net_data_path = '../../data/01_raw/financial_fraud/bsNET140513_032310.csv'

# NOTE(review): raw_net_trans_data is loaded here but not referenced by any
# later cell visible in this notebook — confirm whether it is still needed.
raw_net_trans_data = pd.read_csv(raw_net_data_path)

# Display (rows, columns) as a quick sanity check on the load.
raw_net_trans_data.shape

# Process edge data for dynamic graph model 
## Customer can be treated as source node and merchant can be treated as target node 

In [None]:
# Select the transaction columns carried forward as the edge table:
# one row per transaction, with customer as source and merchant as target.
edges = raw_trans_data[['step','customer','merchant','category','amount','fraud']]

In [None]:
# Drop self loops, i.e. rows whose customer id and merchant id coincide.
is_self_loop = edges.customer == edges.merchant
edges = edges.loc[~is_self_loop]

# Display the remaining (rows, columns).
edges.shape

### check duplicated (customer, merchant) pairs 

In [None]:
# Number of transactions recorded for every (customer, merchant) pair.
# The original author noted 47132 unique pairs for this dataset.
customer_merchant_trans_count = (
    edges
    .groupby(by=['customer', 'merchant'])
    .agg({'step': 'count'})
)

In [None]:
# Sum of the fraud column per (customer, merchant) pair, i.e. how many of the
# pair's transactions were flagged (assuming fraud is a 0/1 flag — confirm).
customer_merchant_trans_fraud = edges.groupby(by=['customer','merchant']).agg({'fraud':'sum'})

In [None]:
# Display the column index of the aggregated frame (a single `fraud` column).
customer_merchant_trans_fraud.columns

### Observation: 1065 (customer, merchant) pairs were flagged as fraud more than once

In [None]:
# Display the pairs whose summed fraud value exceeds 1.
customer_merchant_trans_fraud.loc[customer_merchant_trans_fraud.fraud>1]

### Observation: 1108 (customer, merchant) pairs had changing labels

In [None]:
# Mean fraud value per (customer, merchant) pair. Assuming fraud is a 0/1 flag,
# a mean of exactly 0 or 1 means all of the pair's transactions share one label.
customer_merchant_trans_fraud_consistency = (
    edges
    .groupby(by=['customer', 'merchant'])
    .agg({'fraud': 'mean'})
)
customer_merchant_trans_fraud_consistency

In [None]:
# Display the pairs whose mean fraud value is neither 0 nor 1, i.e. pairs whose
# transactions do not all carry the same label.
pair_mean_fraud = customer_merchant_trans_fraud_consistency.fraud
customer_merchant_trans_fraud_consistency.loc[
    ~((pair_mean_fraud == 1) | (pair_mean_fraud == 0))
]

# Dedupe (customer, merchant) pair, only keep the last transaction (the latest)

In [None]:
# Keep a single row per (customer, merchant) pair: the last occurrence in row order.
# NOTE(review): "last" equals "latest" only if the rows are ordered by `step`;
# nothing in this notebook sorts them — confirm the raw CSV is time-ordered.
edges_deduped = edges.drop_duplicates(subset=['customer','merchant'], keep='last', )

In [None]:
# Display (rows, columns) after deduplication: one row per (customer, merchant) pair.
edges_deduped.shape

In [None]:
# Two-column array of raw ids, one row per deduplicated edge: [customer, merchant].
edges_array = np.array(edges_deduped[['customer','merchant']])

### convert str ids to int indexes 

In [None]:
# `vertexs` holds the sorted distinct node ids (customers and merchants together);
# `edges_1d` holds, for every entry of edges_array, the position of its id in `vertexs`.
vertexs, edges_1d = np.unique(edges_array, return_inverse=True)

In [None]:
# vertexs, len(vertexs)

### save str ids to int indexes mapping

In [None]:
# Map each raw node id to its position in `vertexs`. The entries of `vertexs`
# come from np.unique and are therefore distinct, so every id gets one index.
vertex_to_id = {node_name: node_idx for node_idx, node_name in enumerate(vertexs)}

In [None]:
# Tabular form of the mapping: raw node id in column "name", integer index in "idx".
vertex_to_id_df = pd.DataFrame.from_dict(vertex_to_id, orient='index', columns=['idx'])
# Move the ids out of the row index into a regular column ...
vertex_to_id_df = vertex_to_id_df.reset_index()
# ... and give that column its final name.
vertex_to_id_df = vertex_to_id_df.rename(columns={"index": "name"})

#### save id to index mapping


In [None]:
# Destination of the raw-id -> index lookup table.
vertex_to_id_file_path = "../../data/02_intermediate/financial_fraud/node_id.csv"

# Project helper; presumably creates the parent directory when it is missing
# (its implementation is not shown in this notebook).
ensure_directory(vertex_to_id_file_path)

# Write to the same path variable that was handed to ensure_directory, so the
# prepared directory and the written file cannot drift apart.
vertex_to_id_df.to_csv(vertex_to_id_file_path, index=False)

In [None]:
# Arrange the node indexes as one row per edge: [source index, target index].
edges_idx = np.reshape(edges_1d, [-1, 2])

In [None]:
# Display the index pairs together with the number of edges.
edges_idx, len(edges_idx)

### Check whether the node indexes for the top 3 edge list records are correct 
It's critical that the indexes are correctly aligned with both the raw data and the indexes in the graph (represented as a sparse matrix).

In [None]:
### manually checking the node ids for the node indexes
# (vertexs[3317], vertexs[4148]), (vertexs[2363], vertexs[4154]),(vertexs[3396], vertexs[4127]), (vertexs[3304], vertexs[4130])

In [None]:
### consistent with the raw data 
# edges_deduped.head(3)

In [None]:
# print('vertex:', len(vertexs), 'edge:', len(edges_idx))

# Find labels for the edge

In [None]:
from tqdm import tqdm

In [None]:
# Build the (source index, target index, fraud label) table for the deduplicated
# edges, one row per edge in the row order of edges_deduped.
# The id lookup is done column-wise with Series.map instead of a row-by-row
# iterrows loop, which avoids a Python-level iteration over every edge.
source_idx = edges_deduped['customer'].map(vertex_to_id)
target_idx = edges_deduped['merchant'].map(vertex_to_id)

# Series.map yields NaN for an id that is absent from the mapping. Fail loudly,
# as the plain dict lookup did, rather than casting NaN to an arbitrary integer.
if source_idx.isna().any() or target_idx.isna().any():
    raise KeyError("edge endpoint missing from vertex_to_id")

edge_label_arr = np.column_stack(
    (
        source_idx.to_numpy(),
        target_idx.to_numpy(),
        edges_deduped['fraud'].to_numpy(),
    )
).astype(np.int32)

In [None]:
# Display the array shape: (number of deduplicated edges, 3).
edge_label_arr.shape

In [None]:
# DataFrame view of edge_label_arr with named columns: source index, target index, fraud label.
edge_label_postprocessed_df = pd.DataFrame(edge_label_arr, columns=['source','target','label'])

In [None]:
# Display the first rows for a visual check.
edge_label_postprocessed_df.head()

In [None]:
# Destinations for the labelled edge list: a CSV table and a NumPy archive.
edge_label_df_file_path = "../../data/02_intermediate/financial_fraud/edge_label.csv"
edge_list_arr_file_path = "../../data/02_intermediate/financial_fraud/edge_list.npz"

# Project helper; presumably creates the parent directories when they are
# missing (its implementation is not shown in this notebook).
ensure_directory(edge_label_df_file_path)
ensure_directory(edge_list_arr_file_path)

# The CSV path was prepared above but nothing was ever written to it; persist
# the labelled edge table (source, target, label) so edge_label.csv exists.
edge_label_postprocessed_df.to_csv(edge_label_df_file_path, index=False)

In [None]:
# Store the (source, target, label) array in an .npz archive under the key "data".
with open(edge_list_arr_file_path, mode="wb") as f:
 np.savez(f,data=edge_label_arr)

### check again the processed data are consistent with the raw data 

In [None]:
# (vertexs[edge_label_arr[0][0]], vertexs[edge_label_arr[0][1]])

In [None]:
# edges_deduped.loc[(edges_deduped.customer ==vertexs[edge_label_arr[0][0]] )& (edges_deduped.merchant ==vertexs[edge_label_arr[0][1]])]

In [None]:
# Check the fraud ratio: share of edges per label value (fractions, not counts).
edge_label_postprocessed_df['label'].value_counts(normalize=True)

# Split train/test data and generate data for graph dataloader 

In [None]:
# Display (rows, columns) of the deduplicated edge table again before splitting.
edges_deduped.shape

In [None]:
# Display the first deduplicated edges for a visual check.
edges_deduped.head()

In [None]:
# Graph dimensions: m counts the labelled edges, n counts the indexed nodes.
m = edge_label_arr.shape[0]
n = vertex_to_id_df.shape[0]

print(f"Number of edges: {m}, Number of nodes: {n}")

In [None]:
# Fraction of the edges (in row order) assigned to the training split.
train_per = 0.5

# Number of training edges, rounded down.
train_num = int(np.floor(train_per * m))

# The leading train_num rows form the training set, the remainder the test set.
train, test = edge_label_arr[:train_num, :], edge_label_arr[train_num:, :]

In [None]:
# Display the shapes of both splits.
train.shape, test.shape

# Build graph in the format of a sparse matrix with edge list 
Again, it's critical that the indexes are correctly aligned with both the raw data and the indexes in the graph (represented as a sparse matrix).

In [None]:
from scipy.sparse import csr_matrix,coo_matrix,eye

In [None]:
# Sparse n x n adjacency of the training edges: a value of 1 is placed at
# (source index, target index) for every training row. csr_matrix sums values
# that land on the same coordinate, should any pair repeat.
train_edge_values = np.ones([np.size(train, 0)], dtype=np.int32)
train_edge_coords = (train[:, 0], train[:, 1])
train_mat = csr_matrix((train_edge_values, train_edge_coords), shape=(n, n))

In [None]:
# Display the matrix shape, expected to be (n, n).
train_mat.shape

In [None]:
# Add the transpose so that every edge is stored in both directions (undirected graph).
train_mat = train_mat + train_mat.transpose() #enforce symmetry 

#### check edgelist id with the sparse matrix idx

In [None]:
# train_mat[3317,4148], train_mat[4148,3317]

In [None]:
# train_mat[86,4145], train_mat[4145,86] #being 0 because this edge is in the test set 

In [None]:
# Add the transpose and the identity (one diagonal entry per node), then convert to LIL.
# NOTE(review): train_mat was already symmetrised earlier in this notebook, so
# adding the transpose again doubles the off-diagonal values. That is harmless
# only if consumers use the sparsity pattern (e.g. `.rows`) and not the values — confirm.
train_mat = (train_mat + train_mat.transpose() + eye(n)).tolil() #Convert to List of Lists format

In [None]:
# LIL `rows` attribute: for every matrix row, the list of column indexes that hold
# a stored entry, i.e. the neighbours of each node plus the node itself (diagonal).
headtail = train_mat.rows #store the indexes of edges

In [None]:
# headtail

In [None]:
# Number of stored entries in each row of the adjacency structure; this count
# includes the diagonal entry contributed by the identity matrix.
degrees = np.array(list(map(len, headtail)))

# Creating snapshots of graphs for the dataloader of TADDY model

In [None]:
# Number of edges placed in each graph snapshot.
snap_size=5000

In [None]:
# Number of snapshots per split: len / snap_size rounded to the nearest integer
# (int(x + 0.5) rounds half up for non-negative x).
# NOTE(review): when the remainder is below snap_size / 2 the trailing edges fall
# outside every snapshot; otherwise the last snapshot is shorter than snap_size.
train_size = int(len(train) / snap_size + 0.5) #making slices of snapshots
test_size = int(len(test) / snap_size + 0.5)

In [None]:
# Display the number of training and test snapshots.
train_size, test_size

In [None]:
# Per-snapshot containers shared by the training and test snapshots:
# source indexes, target indexes, edge weights and labels.
rows, cols, weis, labs = [], [], [], []

for snap_no in range(train_size):
    # Consecutive block of snap_size training edges (shorter at the tail).
    snapshot = train[snap_no * snap_size:(snap_no + 1) * snap_size]

    src_idx = snapshot[:, 0].astype(np.int32)  # source nodes -> row indexes
    dst_idx = snapshot[:, 1].astype(np.int32)  # target nodes -> col indexes
    edge_lab = snapshot[:, 2].astype(np.int32)  # labels
    # every edge weight is set to 1 in this experiment
    edge_wei = np.ones(src_idx.shape, dtype=np.int32)

    rows.append(src_idx)
    cols.append(dst_idx)
    weis.append(edge_wei)
    labs.append(edge_lab)

In [None]:
# Append the test snapshots after the training ones, using the same layout:
# source indexes, target indexes, unit weights and labels per snapshot.
for snap_no in range(test_size):
    # Consecutive block of snap_size test edges (shorter at the tail).
    snapshot = test[snap_no * snap_size:(snap_no + 1) * snap_size]

    src_idx = snapshot[:, 0].astype(np.int32)
    dst_idx = snapshot[:, 1].astype(np.int32)
    edge_lab = snapshot[:, 2].astype(np.int32)
    edge_wei = np.ones(src_idx.shape, dtype=np.int32)

    rows.append(src_idx)
    cols.append(dst_idx)
    weis.append(edge_wei)
    labs.append(edge_lab)

In [None]:
# len(rows), rows[0].shape

In [None]:
# rows[0]

In [None]:
# len(cols), cols[0].shape

In [None]:
# cols[0]

In [None]:
# len(labs), labs[0].shape

In [None]:
# labs[0]

### save all intermediate graph data

In [None]:
# Destination of the pickled bundle.
train_test_data_file_path = '../../data/03_primary/financial_fraud/training_data.pkl'
ensure_directory(train_test_data_file_path)

# Bundle, in this fixed order: per-snapshot source indexes, target indexes,
# labels and weights, the adjacency row lists, the snapshot counts of both
# splits, the node count and the edge count.
train_test_data = (
    rows,
    cols,
    labs,
    weis,
    headtail,
    train_size,
    test_size,
    n,
    m,
)

with open(train_test_data_file_path, mode='wb') as pkl_out:
    pickle.dump(train_test_data, pkl_out)

# References

Edgar Alonso Lopez-Rojas and Stefan Axelsson. 2014. BANKSIM: A BANK PAYMENTS SIMULATOR FOR FRAUD DETECTION RESEARCH.

Yixin Liu, Shirui Pan, Yu Guang Wang, Fei Xiong, Liang Wang, Qingfeng Chen, and Vincent CS Lee. 2021. Anomaly Detection in Dynamic Graphs via Transformer.