Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

SPDX-License-Identifier: Apache-2.0

# Set up Financial Fraud dataloader, model training and inference 

## Table of Contents
1. Load processed graph data (in notebook 1.1) into a data dict for the data loader for model training 
* The data dictionary is defined in the referenced TADDY modeling framework to easily fetch relevant data during training 
2. Load the model training and data sampling configurations 
* Use Eigenvalue decomposition based on the adjacency matrix for substructure sampling. Nodes are sampled across multiple snapshots for the edge of interest based on a defined time window. 
3. Pass the data dict in step (1) to the model 
4. Train the model 
5. Apply model inference on the specific snapshot

In [None]:
import sys 
import os

In [None]:
# Put the repository root and its src/ folder on the import path so the
# project package can be imported from this notebook.
sys.path.extend(['../../', '../../src/'])

In [None]:
import pickle
import numpy as np
import torch
import yaml
from anomaly_detection_spatial_temporal_data.model.model_config import TaddyConfig
from anomaly_detection_spatial_temporal_data.utils import ensure_directory
from anomaly_detection_spatial_temporal_data.model.dynamic_graph import Taddy

# Load processed graph data in notebook 1.1

In [None]:
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by the preprocessing notebook.
training_data_path = "../../data/03_primary/financial_fraud/training_data.pkl"
with open(training_data_path, 'rb') as pkl_handle:
    data = pickle.load(pkl_handle)

In [None]:
rows, cols, labels, weights, headtail, train_size, test_size, nb_nodes, nb_edges = data

In [None]:
# Length of each entry of `headtail`, stored under 'degrees' in the data dict.
degrees = np.array(list(map(len, headtail)))

# Snapshots are numbered 0..num_snap-1; the first `train_size` of them are
# used for training and the remainder for testing.
num_snap = test_size + train_size
all_snapshots = list(range(num_snap))
snap_train = all_snapshots[:train_size]
snap_test = all_snapshots[train_size:]

# Convert every snapshot's label sequence into an int64 tensor.
labels = [torch.LongTensor(snapshot_labels) for snapshot_labels in labels]

In [None]:
# Identity mapping from node index to node id for nodes 0..nb_nodes-1.
index_id_map = {node: node for node in range(nb_nodes)}
# Array of the same node indices, in the same order.
idx = np.array(list(index_id_map))

# Set model training and data sampling configuration and create data dictionary

### load the model training hyperparameters

In [None]:
train_config_file = '../../conf/base/parameters/taddy.yml'

# Load the training hyperparameters. A malformed file is reported and then
# re-raised: swallowing the error would leave `train_config` undefined and the
# failure would only surface later as an unrelated NameError.
with open(train_config_file, "r") as stream:
    try:
        train_config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)
        raise

print(train_config)

### load the data sampling parameters
 * window size is the number of snapshots looked back during node sampling 
 * neighbor number is the number of neighbors to sample close to the source and target nodes of the edge of interest 

In [None]:
# Sampling options are stored under the 'data_load_options' key of the config.
data_loader_config = train_config['data_load_options']
# Pickle file used by get_adjs to cache its per-snapshot matrices.
eigen_file_name = "../../data/05_model_input/financial_fraud/eigen_tmp.pkl"

data_loader_config

### Define the helper functions used for node sampling. The purpose of each function is explained in its docstring.

In [None]:
import scipy.sparse as sp
from numpy.linalg import inv

def normalize(mx):
    """Scale each row of ``mx`` so that it sums to one.

    Rows whose sum is zero stay all-zero: their reciprocal is infinite and is
    replaced by 0 before scaling.

    Args:
        mx: 2-D matrix (scipy sparse matrix or dense ndarray) exposing ``sum(1)``.

    Returns:
        The product ``diag(1 / row_sum) . mx``.
    """
    row_totals = np.array(mx.sum(1))
    inv_totals = np.power(row_totals, -1).flatten()
    # 1/0 yields inf for empty rows; map those scale factors to zero.
    inv_totals[np.isinf(inv_totals)] = 0.
    return sp.diags(inv_totals).dot(mx)

def normalize_adj(adj):
    """Apply symmetric degree normalization to an adjacency matrix.

    With ``D`` the diagonal matrix of row sums, the result is
    ``D^-1/2 . adj^T . D^-1/2`` (equal to ``D^-1/2 . adj . D^-1/2`` when
    ``adj`` is symmetric). Rows with zero sum get a scale factor of 0.

    Args:
        adj: Adjacency matrix in any form accepted by ``scipy.sparse.coo_matrix``.

    Returns:
        The normalized matrix in scipy COO format.
    """
    adj = sp.coo_matrix(adj)
    degree = np.array(adj.sum(1))
    inv_sqrt_degree = np.power(degree, -0.5).flatten()
    # Zero-degree rows produce inf; neutralize them.
    inv_sqrt_degree[np.isinf(inv_sqrt_degree)] = 0.
    scaling = sp.diags(inv_sqrt_degree)
    column_scaled = adj.dot(scaling)
    return column_scaled.transpose().dot(scaling).tocoo()

def adj_normalize(mx):
    """Symmetrically normalize a sparse matrix by its row sums.

    With ``D`` the diagonal matrix of row sums, returns
    ``D^-1/2 . mx . D^-1/2``. Rows with zero sum get a scale factor of 0.

    Args:
        mx: scipy sparse matrix.

    Returns:
        The normalized sparse matrix.
    """
    degree = np.array(mx.sum(1))
    inv_sqrt_degree = np.power(degree, -0.5).flatten()
    # Zero-degree rows produce inf; neutralize them.
    inv_sqrt_degree[np.isinf(inv_sqrt_degree)] = 0.
    scaling = sp.diags(inv_sqrt_degree)
    return scaling.dot(mx).dot(scaling)

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a float32 torch sparse COO tensor.

    Args:
        sparse_mx: scipy sparse matrix (any format providing ``tocoo()``).

    Returns:
        A torch sparse COO tensor with the same shape and non-zero entries,
        stored as float32 values with int64 indices.
    """
    coo = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((coo.row, coo.col)).astype(np.int64))
    values = torch.from_numpy(coo.data)
    # torch.sparse.FloatTensor is a deprecated legacy constructor;
    # torch.sparse_coo_tensor is the supported way to build the same tensor.
    return torch.sparse_coo_tensor(indices, values, torch.Size(coo.shape))

def preprocess_adj(adj):
    """Symmetrize, add self-loops, normalize and convert an adjacency matrix.

    Args:
        adj: scipy sparse adjacency matrix (square).

    Returns:
        The result of ``normalize_adj`` on the symmetrized matrix plus the
        identity, converted by ``sparse_mx_to_torch_sparse_tensor``.
    """
    # Where the transposed entry is strictly larger, it replaces the original
    # entry, so the result is the element-wise maximum of adj and adj.T.
    transpose_larger = adj.T > adj
    symmetric = adj + adj.T.multiply(transpose_larger) - adj.multiply(transpose_larger)
    with_self_loops = symmetric + sp.eye(symmetric.shape[0])
    return sparse_mx_to_torch_sparse_tensor(normalize_adj(with_self_loops))

def get_adjs(rows, cols, weights, nb_nodes, eigen_file_name, data_loader_config):
    """Build per-snapshot adjacency tensors and, optionally, the sampling matrices.

    For every snapshot a ``nb_nodes x nb_nodes`` CSR adjacency matrix is built
    from ``(weights[i], (rows[i], cols[i]))`` and passed through
    ``preprocess_adj``.

    When ``data_loader_config['compute_s']`` is truthy, a dense matrix
    ``c * inv(I - (1 - c) * adj_normalize(A))`` (``c = data_loader_config['c']``)
    is computed per snapshot, its diagonal is zeroed and it is row-normalized.
    These matrices are cached as a pickled list of CSR matrices in
    ``eigen_file_name``; if that file already exists it is loaded instead of
    recomputing. The cached list is used as-is and is not checked against the
    current number of snapshots.

    When ``compute_s`` is falsy the cache file is neither read nor written and
    the second return value is a list of ``None``, one per snapshot.

    Args:
        rows: Per-snapshot sequences of edge source indices.
        cols: Per-snapshot sequences of edge target indices.
        weights: Per-snapshot sequences of edge weights.
        nb_nodes: Number of nodes (side length of every adjacency matrix).
        eigen_file_name: Path of the pickle cache for the sampling matrices.
        data_loader_config: Mapping providing ``'compute_s'`` and, when that is
            truthy and the cache is missing, ``'c'``.

    Returns:
        Tuple ``(adjs, eigen_adjs)``: the list of preprocessed adjacency
        tensors and the list of dense sampling matrices (or ``None`` entries).
    """
    compute_s = data_loader_config['compute_s']
    # Only consult the cache when the matrices are actually requested;
    # otherwise an empty cache file would be written (and later loaded as if
    # it were valid), or cached matrices would be mixed with None entries.
    generate_eigen = bool(compute_s) and not os.path.exists(eigen_file_name)

    eigen_adjs = []
    eigen_adjs_sparse = []
    if generate_eigen:
        print('Generating eigen as: ' + eigen_file_name)
    elif compute_s:
        print('Loading eigen from: ' + eigen_file_name)
        # NOTE: unpickling can execute arbitrary code; the cache file must
        # come from a trusted location.
        with open(eigen_file_name, 'rb') as f:
            eigen_adjs_sparse = pickle.load(f)
        eigen_adjs = [np.array(cached.todense()) for cached in eigen_adjs_sparse]

    adjs = []
    for snap_rows, snap_cols, snap_weights in zip(rows, cols, weights):
        adj = sp.csr_matrix(
            (snap_weights, (snap_rows, snap_cols)),
            shape=(nb_nodes, nb_nodes),
            dtype=np.float32,
        )
        adjs.append(preprocess_adj(adj))

        if not compute_s:
            eigen_adjs.append(None)
        elif generate_eigen:
            c = data_loader_config['c']
            eigen_adj = c * inv(
                (sp.eye(adj.shape[0]) - (1 - c) * adj_normalize(adj)).toarray()
            )
            # Remove each node's weight on itself before row-normalizing.
            np.fill_diagonal(eigen_adj, 0.)
            eigen_adj = normalize(eigen_adj)
            eigen_adjs.append(eigen_adj)
            eigen_adjs_sparse.append(sp.csr_matrix(eigen_adj))

    if generate_eigen:
        with open(eigen_file_name, 'wb') as f:
            pickle.dump(eigen_adjs_sparse, f, pickle.HIGHEST_PROTOCOL)

    return adjs, eigen_adjs

In [None]:
# Prepare the location of the cache file before get_adjs may write to it
# (ensure_directory is a project helper; presumably it creates missing folders).
ensure_directory(eigen_file_name)

# One array per snapshot pairing each edge's row index with its column index.
edges = [np.vstack((rows[snap], cols[snap])).T for snap in range(num_snap)]

adjs, eigen_adjs = get_adjs(
    rows=rows,
    cols=cols,
    weights=weights,
    nb_nodes=nb_nodes,
    eigen_file_name=eigen_file_name,
    data_loader_config=data_loader_config,
)

### The data dictionary defined in TADDY modeling framework 
 * X is the node feature matrix. (We did not generate node features for this use case, so the model learns from the graph's structural information and its evolving pattern.) 
 * A is the adjacency matrix (a popular way to represent a graph)
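 * S is the node-sampling matrix derived from A (called the "eigen" result in the code): for each snapshot, `c * (I - (1 - c) * D^{-1/2} A D^{-1/2})^{-1}` with the diagonal zeroed and the rows normalized 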
 * degrees stores all the node degrees
 * other keys are self-explanatory: edges store edge list, y is the edge label, snap_train are snapshots for training and snap_test are snapshots for testing. num_snap is the total number of snapshots. 

In [None]:
# Bundle everything the model needs into a single dictionary.
data_dict = {
 # No node feature matrix is supplied.
 'X': None, 
 # Per-snapshot adjacency tensors (first return value of get_adjs).
 'A': adjs, 
 # Per-snapshot sampling matrices (second return value of get_adjs).
 'S': eigen_adjs, 
 # Identity mapping node index -> node id.
 'index_id_map': index_id_map, 
 # Per-snapshot edge arrays built from rows/cols.
 'edges': edges,
 # Per-snapshot label tensors.
 'y': labels, 
 # Array of node indices 0..nb_nodes-1.
 'idx': idx, 
 # Snapshot indices used for training.
 'snap_train': snap_train, 
 # Length of each `headtail` entry.
 'degrees': degrees,
 # Snapshot indices used for testing.
 'snap_test': snap_test, 
 # Total number of snapshots (train + test).
 'num_snap': num_snap}

In [None]:
# Display the configuration loaded from the YAML file for inspection.
train_config

In [None]:
#change save path for notebook
train_config['model_options']['save_directory'] = '../../data/07_model_output/financial_fraud' 

if not os.path.exists(train_config['model_options']['save_directory']):
 os.makedirs(train_config['model_options']['save_directory'])

In [None]:
# Wrap the 'model_options' section of the config in the project's TaddyConfig
# object and construct the Taddy model from the data dictionary and that config.
model_config = TaddyConfig(config=train_config['model_options'])
model_obj = Taddy(data_dict, model_config)

# Display the save directory held by the config object.
model_config.save_directory

# Train model

In [None]:
# Run the model; run() returns two values, the second of which is later passed
# to torch.load as the path of the saved model.
learned_result,save_model_path = model_obj.run()

# Model training result 

In [None]:
# Display the first value returned by model_obj.run().
learned_result

In [None]:
# Display the path returned by model_obj.run().
save_model_path

# Run inference on the specific snapshot 

### load trained model 

In [None]:
import torch
import transformers

In [None]:
# Load the object saved at save_model_path. torch.load unpickles the file, so
# only load files produced by this notebook.
# NOTE(review): recent PyTorch releases default to weights_only=True, which may
# refuse to load a fully pickled model object — confirm against the installed version.
model = torch.load(save_model_path)

In [None]:
# Snapshot to score; it is also used below to index `labels`.
# NOTE(review): hard-coded — presumably must be a valid snapshot index (< num_snap); confirm.
snap_num = 9

In [None]:
# Predictions of the loaded model for the chosen snapshot; used as the scores
# passed to roc_auc_score below.
pred = model.predict(snap_num)

In [None]:
from sklearn import metrics

# Area under the ROC curve for the chosen snapshot: true labels vs. predictions.
auc = metrics.roc_auc_score(labels[snap_num],pred)

# Display the resulting score.
auc

# References

Edgar Alonso Lopez-Rojas and Stefan Axelsson. 2014. BANKSIM: A BANK PAYMENTS SIMULATOR FOR FRAUD DETECTION RESEARCH.

Yixin Liu, Shirui Pan, Yu Guang Wang, Fei Xiong, Liang Wang, Qingfeng Chen, and Vincent CS Lee. 2021. Anomaly Detection in Dynamic Graphs via Transformer.