sagemaker_graph_fraud_detection/0000755000175000001440000000000014215133750020250 5ustar sagemaker-useruserssagemaker_graph_fraud_detection/requirements.txt0000644000175000001440000000004214214110624023522 0ustar sagemaker-useruserssagemaker==1.72.0 awscli>=1.18.140sagemaker_graph_fraud_detection/.ipynb_checkpoints/0000755000175000001440000000000014227356123024045 5ustar sagemaker-useruserssagemaker_graph_fraud_detection/.ipynb_checkpoints/__init__-checkpoint.py0000644000175000001440000000000014214110624030257 0ustar sagemaker-useruserssagemaker_graph_fraud_detection/.ipynb_checkpoints/config-checkpoint.py0000644000175000001440000000322014214110624027774 0ustar sagemaker-userusersimport json import os import boto3 import sagemaker from pathlib import Path def get_current_folder(global_variables): # if calling from a file if "__file__" in global_variables: current_file = Path(global_variables["__file__"]) current_folder = current_file.parent.resolve() # if calling from a notebook else: current_folder = Path(os.getcwd()) return current_folder region = boto3.session.Session().region_name account_id = boto3.client('sts').get_caller_identity().get('Account') default_bucket = sagemaker.session.Session(boto3.session.Session()).default_bucket() default_role = sagemaker.get_execution_role() cfn_stack_outputs = {} current_folder = get_current_folder(globals()) cfn_stack_outputs_filepath = Path(current_folder, '../stack_outputs.json').resolve() if os.path.exists(cfn_stack_outputs_filepath): with open(cfn_stack_outputs_filepath) as f: cfn_stack_outputs = json.load(f) aws_account = cfn_stack_outputs.get('AccountID', account_id) region_name = cfn_stack_outputs.get('AWSRegion', region) solution_name = cfn_stack_outputs.get('SolutionName') solution_upstream_bucket = cfn_stack_outputs.get('SolutionUpstreamS3Bucket') solution_prefix = cfn_stack_outputs.get('SolutionPrefix', 'sagemaker-soln-graph-fraud') solution_bucket = cfn_stack_outputs.get('SolutionS3Bucket', default_bucket) s3_data_prefix = cfn_stack_outputs.get('S3InputDataPrefix', 'raw-data') s3_processing_output = cfn_stack_outputs.get('S3ProcessingJobOutputPrefix', 'processed-data') s3_train_output = cfn_stack_outputs.get('S3TrainingJobOutputPrefix', 'training-output') role = cfn_stack_outputs.get('IamRole', default_role)sagemaker_graph_fraud_detection/.ipynb_checkpoints/requirements-checkpoint.in0000644000175000001440000000005314214110624031231 0ustar sagemaker-useruserssagemaker==1.72.0 awscli watchtower==1.0.0 sagemaker_graph_fraud_detection/.ipynb_checkpoints/requirements-checkpoint.txt0000644000175000001440000000004214214110624031440 0ustar sagemaker-useruserssagemaker==1.72.0 awscli>=1.18.140sagemaker_graph_fraud_detection/dgl_fraud_detection/0000755000175000001440000000000014217105162024233 5ustar sagemaker-useruserssagemaker_graph_fraud_detection/dgl_fraud_detection/train_dgl_mxnet_entry_point.py0000644000175000001440000003112514217105162032417 0ustar sagemaker-userusersimport os os.environ['DGLBACKEND'] = 'mxnet' import mxnet as mx from mxnet import nd, gluon, autograd import dgl import numpy as np import pandas as pd import time import logging import pickle import math from estimator_fns import * from graph import * from data import * from utils import * from model.mxnet import * from sampler import * def normalize(feature_matrix): mean = nd.mean(feature_matrix, axis=0) stdev = nd.sqrt(nd.sum((feature_matrix - mean)**2, axis=0)/feature_matrix.shape[0]) return (feature_matrix - mean) / stdev def get_dataloader(data_size, batch_size, 
mini_batch=True): batch_size = batch_size if mini_batch else data_size train_dataloader = gluon.data.BatchSampler(gluon.data.RandomSampler(data_size), batch_size, 'keep') test_dataloader = gluon.data.BatchSampler(gluon.data.SequentialSampler(data_size), batch_size, 'keep') return train_dataloader, test_dataloader def train(model, trainer, loss, features, labels, train_loader, test_loader, train_g, test_g, train_mask, valid_mask, test_mask, ctx, n_epochs, batch_size, output_dir, thresh, scale_pos_weight, compute_metrics=True, mini_batch=True): duration = [] for epoch in range(n_epochs): tic = time.time() loss_val = 0. for n, batch in enumerate(train_loader): # logging.info("Iteration: {:05d}".format(n)) node_flow, batch_nids = train_g.sample_block(nd.array(batch).astype('int64')) batch_indices = nd.array(batch, ctx=ctx) with autograd.record(): pred = model(node_flow, features[batch_nids.as_in_context(ctx)]) l = loss(pred, labels[batch_indices], mx.nd.expand_dims(scale_pos_weight*train_mask, 1)[batch_indices]) l = l.sum()/len(batch) l.backward() trainer.step(batch_size=1, ignore_stale_grad=True) loss_val += l.asscalar() # logging.info("Current loss {:04f}".format(loss_val/(n+1))) duration.append(time.time() - tic) train_metric, valid_metric = evaluate(model, train_g, features, labels, train_mask, valid_mask, ctx, batch_size, mini_batch) logging.info("Epoch {:05d} | Time(s) {:.4f} | Training Loss {:.4f} | Training F1 {:.4f} | Validation F1 {:.4f}".format( epoch, np.mean(duration), loss_val/(n+1), train_metric, valid_metric)) class_preds, pred_proba = get_model_class_predictions(model, test_g, test_loader, features, ctx, threshold=thresh) if compute_metrics: acc, f1, p, r, roc, pr, ap, cm = get_metrics(class_preds, pred_proba, labels, test_mask, output_dir) logging.info("Metrics") logging.info("""Confusion Matrix: {} f1: {:.4f}, precision: {:.4f}, recall: {:.4f}, acc: {:.4f}, roc: {:.4f}, pr: {:.4f}, ap: {:.4f} """.format(cm, f1, p, r, acc, roc, pr, ap)) return model, class_preds, pred_proba def evaluate(model, g, features, labels, train_mask, valid_mask, ctx, batch_size, mini_batch=True): train_f1, valid_f1 = mx.metric.F1(), mx.metric.F1() preds = [] batch_size = batch_size if mini_batch else features.shape[0] dataloader = gluon.data.BatchSampler(gluon.data.SequentialSampler(features.shape[0]), batch_size, 'keep') for batch in dataloader: node_flow, batch_nids = g.sample_block(nd.array(batch).astype('int64')) preds.append(model(node_flow, features[batch_nids.as_in_context(ctx)])) nd.waitall() # preds = nd.concat(*preds, dim=0).argmax(axis=1) preds = nd.concat(*preds, dim=0) train_mask = nd.array(np.where(train_mask.asnumpy()), ctx=ctx) valid_mask = nd.array(np.where(valid_mask.asnumpy()), ctx=ctx) train_f1.update(preds=nd.softmax(preds[train_mask], axis=1).reshape(-3, 0), labels=labels[train_mask].reshape(-1,)) valid_f1.update(preds=nd.softmax(preds[valid_mask], axis=1).reshape(-3, 0), labels=labels[valid_mask].reshape(-1,)) return train_f1.get()[1], valid_f1.get()[1] def get_model_predictions(model, g, dataloader, features, ctx): pred = [] for batch in dataloader: node_flow, batch_nids = g.sample_block(nd.array(batch).astype('int64')) pred.append(model(node_flow, features[batch_nids.as_in_context(ctx)])) nd.waitall() return nd.concat(*pred, dim=0) def get_model_class_predictions(model, g, datalaoder, features, ctx, threshold=None): unnormalized_preds = get_model_predictions(model, g, datalaoder, features, ctx) pred_proba = nd.softmax(unnormalized_preds)[:, 1].asnumpy().flatten() if not 
threshold: return unnormalized_preds.argmax(axis=1).asnumpy().flatten().astype(int), pred_proba return np.where(pred_proba > threshold, 1, 0), pred_proba def save_prediction(pred, pred_proba, id_to_node, training_dir, new_accounts, output_dir, predictions_file): prediction_query = read_masked_nodes(os.path.join(training_dir, new_accounts)) pred_indices = np.array([id_to_node[query] for query in prediction_query]) pd.DataFrame.from_dict({'target': prediction_query, 'pred_proba': pred_proba[pred_indices], 'pred': pred[pred_indices]}).to_csv(os.path.join(output_dir, predictions_file), index=False) def save_model(g, model, model_dir, hyperparams): model.save_parameters(os.path.join(model_dir, 'model.params')) with open(os.path.join(model_dir, 'model_hyperparams.pkl'), 'wb') as f: pickle.dump(hyperparams, f) with open(os.path.join(model_dir, 'graph.pkl'), 'wb') as f: pickle.dump(g, f) def get_model(g, hyperparams, in_feats, n_classes, ctx, model_dir=None): if model_dir: # load using saved model state with open(os.path.join(model_dir, 'model_hyperparams.pkl'), 'rb') as f: hyperparams = pickle.load(f) with open(os.path.join(model_dir, 'graph.pkl'), 'rb') as f: g = pickle.load(f) if hyperparams['heterogeneous']: model = HeteroRGCN(g, in_feats, hyperparams['n_hidden'], n_classes, hyperparams['n_layers'], hyperparams['embedding_size'], ctx) else: if hyperparams['model'] == 'gcn': model = GCN(g, in_feats, hyperparams['n_hidden'], n_classes, hyperparams['n_layers'], nd.relu, hyperparams['dropout']) elif hyperparams['model'] == 'graphsage': model = GraphSAGE(g, in_feats, hyperparams['n_hidden'], n_classes, hyperparams['n_layers'], nd.relu, hyperparams['dropout'], hyperparams['aggregator_type']) else: heads = ([hyperparams['num_heads']] * hyperparams['n_layers']) + [hyperparams['num_out_heads']] model = GAT(g, in_feats, hyperparams['n_hidden'], n_classes, hyperparams['n_layers'], heads, gluon.nn.Lambda(lambda data: nd.LeakyReLU(data, act_type='elu')), hyperparams['dropout'], hyperparams['attn_drop'], hyperparams['alpha'], hyperparams['residual']) if hyperparams['no_features']: model = NodeEmbeddingGNN(model, in_feats, hyperparams['embedding_size']) if model_dir: model.load_parameters(os.path.join(model_dir, 'model.params')) else: model.initialize(ctx=ctx) return model if __name__ == '__main__': logging = get_logger(__name__) logging.info('numpy version:{} MXNet version:{} DGL version:{}'.format(np.__version__, mx.__version__, dgl.__version__)) args = parse_args() args.edges = get_edgelists(args.edges, args.training_dir) g, features, id_to_node = construct_graph(args.training_dir, args.edges, args.nodes, args.target_ntype, args.heterogeneous) features = normalize(nd.array(features)) if args.heterogeneous: g.nodes['target'].data['features'] = features else: g.ndata['features'] = features logging.info("Getting labels") n_nodes = g.number_of_nodes('target') if args.heterogeneous else g.number_of_nodes() labels, train_mask, valid_mask, test_mask = get_labels( id_to_node, n_nodes, args.target_ntype, os.path.join(args.training_dir, args.labels), os.path.join(args.training_dir, args.validation_data), os.path.join(args.training_dir, args.new_accounts), ) logging.info("Got labels") labels = nd.array(labels).astype('float32') train_mask = nd.array(train_mask).astype('float32') valid_mask = nd.array(valid_mask).astype('float32') test_mask = nd.array(test_mask).astype('float32') n_nodes = sum([g.number_of_nodes(n_type) for n_type in g.ntypes]) if args.heterogeneous else g.number_of_nodes() n_edges = 
sum([g.number_of_edges(e_type) for e_type in g.etypes]) if args.heterogeneous else g.number_of_edges() logging.info("""----Data statistics------' #Nodes: {} #Edges: {} #Features Shape: {} #Labeled Train samples: {} #Unlabeled Test samples: {}""".format(n_nodes, n_edges, features.shape, train_mask.sum().asscalar(), test_mask.sum().asscalar())) if args.num_gpus: cuda = True ctx = mx.gpu(0) else: cuda = False ctx = mx.cpu(0) logging.info("Initializing Model") in_feats = args.embedding_size if args.no_features else features.shape[1] n_classes = 2 model = get_model(g, vars(args), in_feats, n_classes, ctx) logging.info("Initialized Model") if args.no_features: features = nd.array(g.nodes('target'), ctx) if args.heterogeneous else nd.array(g.nodes(), ctx) else: features = features.as_in_context(ctx) labels = labels.as_in_context(ctx) train_mask = train_mask.as_in_context(ctx) valid_mask = valid_mask.as_in_context(ctx) test_mask = test_mask.as_in_context(ctx) if not args.heterogeneous: # normalization degs = g.in_degrees().astype('float32') norm = mx.nd.power(degs, -0.5) if cuda: norm = norm.as_in_context(ctx) g.ndata['norm'] = mx.nd.expand_dims(norm, 1) if args.mini_batch: train_g = HeteroGraphNeighborSampler(g, 'target', args.n_layers, args.n_neighbors) if args.heterogeneous\ else NeighborSampler(g, args.n_layers, args.n_neighbors) test_g = HeteroGraphNeighborSampler(g, 'target', args.n_layers) if args.heterogeneous\ else NeighborSampler(g, args.n_layers) else: train_g, test_g = FullGraphSampler(g, args.n_layers), FullGraphSampler(g, args.n_layers) train_data, test_data = get_dataloader(features.shape[0], args.batch_size, args.mini_batch) loss = gluon.loss.SoftmaxCELoss() scale_pos_weight = nd.sqrt((train_mask.shape[0] - train_mask.sum()) / train_mask.sum()) logging.info(model) logging.info(model.collect_params()) trainer = gluon.Trainer(model.collect_params(), args.optimizer, {'learning_rate': args.lr, 'wd': args.weight_decay}) logging.info("Starting Model training") model, pred, pred_proba = train(model, trainer, loss, features, labels, train_data, test_data, train_g, test_g, train_mask, valid_mask, test_mask, ctx, args.n_epochs, args.batch_size, args.output_dir, args.threshold, scale_pos_weight, args.compute_metrics, args.mini_batch) logging.info("Finished Model training") logging.info("Saving model") save_model(g, model, args.model_dir, vars(args)) logging.info("Saving model predictions for new accounts") save_prediction(pred, pred_proba, id_to_node, args.training_dir, args.new_accounts, args.output_dir, args.predictions) sagemaker_graph_fraud_detection/dgl_fraud_detection/utils.py0000644000175000001440000000575614214110625025757 0ustar sagemaker-userusersimport os import pandas as pd import numpy as np from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score import networkx as nx import matplotlib.pyplot as plt def get_metrics(pred, pred_proba, labels, mask, out_dir): labels, mask = labels.asnumpy().flatten().astype(int), mask.asnumpy().flatten().astype(int) labels, pred, pred_proba = labels[np.where(mask)], pred[np.where(mask)], pred_proba[np.where(mask)] acc = ((pred == labels)).sum() / mask.sum() true_pos = (np.where(pred == 1, 1, 0) + np.where(labels == 1, 1, 0) > 1).sum() false_pos = (np.where(pred == 1, 1, 0) + np.where(labels == 0, 1, 0) > 1).sum() false_neg = (np.where(pred == 0, 1, 0) + np.where(labels == 1, 1, 0) > 1).sum() true_neg = (np.where(pred == 0, 1, 0) + np.where(labels == 0, 1, 0) > 1).sum() precision = true_pos/(true_pos + false_pos) 
if (true_pos + false_pos) > 0 else 0 recall = true_pos/(true_pos + false_neg) if (true_pos + false_neg) > 0 else 0 f1 = 2*(precision*recall)/(precision + recall) if (precision + recall) > 0 else 0 confusion_matrix = pd.DataFrame(np.array([[true_pos, false_pos], [false_neg, true_neg]]), columns=["labels positive", "labels negative"], index=["predicted positive", "predicted negative"]) ap = average_precision_score(labels, pred_proba) fpr, tpr, _ = roc_curve(labels, pred_proba) prc, rec, _ = precision_recall_curve(labels, pred_proba) roc_auc = auc(fpr, tpr) pr_auc = auc(rec, prc) save_roc_curve(fpr, tpr, roc_auc, os.path.join(out_dir, "roc_curve.png")) save_pr_curve(prc, rec, pr_auc, ap, os.path.join(out_dir, "pr_curve.png")) return acc, f1, precision, recall, roc_auc, pr_auc, ap, confusion_matrix def save_roc_curve(fpr, tpr, roc_auc, location): f = plt.figure() lw = 2 plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Model ROC curve') plt.legend(loc="lower right") f.savefig(location) def save_pr_curve(fpr, tpr, pr_auc, ap, location): f = plt.figure() lw = 2 plt.plot(fpr, tpr, color='darkorange', lw=lw, label='PR curve (area = %0.2f)' % pr_auc) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('Recall') plt.ylabel('Precision') plt.title('Model PR curve: AP={0:0.2f}'.format(ap)) plt.legend(loc="lower right") f.savefig(location) def save_graph_drawing(g, location): plt.figure(figsize=(12, 8)) node_colors = {node: 0.0 if 'user' in node else 0.5 for node in g.nodes()} nx.draw(g, node_size=10000, pos=nx.spring_layout(g), with_labels=True, font_size=14, node_color=list(node_colors.values()), font_color='white') plt.savefig(location, bbox_inches='tight') sagemaker_graph_fraud_detection/dgl_fraud_detection/requirements.txt0000644000175000001440000000000014214110624027501 0ustar sagemaker-useruserssagemaker_graph_fraud_detection/dgl_fraud_detection/.ipynb_checkpoints/0000755000175000001440000000000014227356176030042 5ustar sagemaker-useruserssagemaker_graph_fraud_detection/dgl_fraud_detection/.ipynb_checkpoints/sampler-checkpoint.py0000644000175000001440000000527514214110624034173 0ustar sagemaker-userusersimport dgl class HeteroGraphNeighborSampler: """Neighbor sampler on heterogeneous graphs Parameters ---------- g : DGLHeteroGraph Full graph category : str Category name of the seed nodes. nhops : int Number of hops to sample/number of layers in the node flow. fanout : int Fanout of each hop starting from the seed nodes. If a fanout is None, sample full neighbors. """ def __init__(self, g, category, nhops, fanout=None): self.g = g self.category = category self.fanouts = [fanout] * nhops def sample_block(self, seeds): blocks = [] seeds = {self.category: seeds} cur = seeds for fanout in self.fanouts: if fanout is None: frontier = dgl.in_subgraph(self.g, cur) else: frontier = dgl.sampling.sample_neighbors(self.g, cur, fanout) block = dgl.to_block(frontier, cur) cur = {} for ntype in block.srctypes: cur[ntype] = block.srcnodes[ntype].data[dgl.NID] blocks.insert(0, block) return blocks, cur[self.category] class NeighborSampler: """Neighbor sampler on homogenous graphs Parameters ---------- g : DGLGraph Full graph nhops : int Number of hops to sample/number of layers in the node flow. fanout : int Fanout of each hop starting from the seed nodes. 
If a fanout is None, sample full neighbors. """ def __init__(self, g, nhops, fanout=None): self.g = g self.fanouts = [fanout] * nhops self.nhops = nhops def sample_block(self, seeds): blocks = [] for fanout in self.fanouts: # For each seed node, sample ``fanout`` neighbors. if fanout is None: frontier = dgl.in_subgraph(self.g, seeds) else: frontier = dgl.sampling.sample_neighbors(self.g, seeds, fanout, replace=False) # Then we compact the frontier into a bipartite graph for message passing. block = dgl.to_block(frontier, seeds) # Obtain the seed nodes for next layer. seeds = block.srcdata[dgl.NID] blocks.insert(0, block) return blocks, blocks[0].srcdata[dgl.NID] class FullGraphSampler: """Does nothing and just returns the full graph Parameters ---------- g : DGLGraph Full graph nhops : int Number of hops to sample/number of layers in the node flow. """ def __init__(self, g, nhops): self.g = g self.nhops = nhops def sample_block(self, seeds): return [self.g] * self.nhops, seeds sagemaker_graph_fraud_detection/dgl_fraud_detection/.ipynb_checkpoints/__init__-checkpoint.py0000644000175000001440000000000014214110624034244 0ustar sagemaker-useruserssagemaker_graph_fraud_detection/dgl_fraud_detection/.ipynb_checkpoints/estimator_fns-checkpoint.py0000644000175000001440000000716314215134727035416 0ustar sagemaker-userusersimport os import argparse import logging def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('--training-dir', type=str, default=os.environ['SM_CHANNEL_TRAIN']) parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) parser.add_argument('--output-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR']) parser.add_argument('--nodes', type=str, default='features.csv') parser.add_argument('--target-ntype', type=str, default='NPI') parser.add_argument('--edges', type=str, default='relation*') parser.add_argument('--heterogeneous', type=lambda x: (str(x).lower() in ['true', '1', 'yes']), default=True, help='use hetero graph') parser.add_argument('--no-features', type=lambda x: (str(x).lower() in ['true', '1', 'yes']), default=False, help='do not use node features') parser.add_argument('--mini-batch', type=lambda x: (str(x).lower() in ['true', '1', 'yes']), default=True, help='use mini-batch training and sample graph') parser.add_argument('--labels', type=str, default='tags.csv') parser.add_argument('--validation-data', type=str, default='validation.csv') parser.add_argument('--new-accounts', type=str, default='test.csv') parser.add_argument('--predictions', type=str, default='preds.csv', help='file to save predictions on new-accounts') parser.add_argument('--compute-metrics', type=lambda x: (str(x).lower() in ['true', '1', 'yes']), default=True, help='compute evaluation metrics after training') parser.add_argument('--threshold', type=float, default=0, help='threshold for making predictions, default : argmax') parser.add_argument('--model', type=str, default='rgcn', help='gnn to use. 
options: gcn, graphsage, gat, gem') parser.add_argument('--num-gpus', type=int, default=1) parser.add_argument('--batch-size', type=int, default=500) parser.add_argument('--optimizer', type=str, default='adam') parser.add_argument('--lr', type=float, default=1e-2) parser.add_argument('--n-epochs', type=int, default=20) parser.add_argument('--n-neighbors', type=int, default=10, help='number of neighbors to sample') parser.add_argument('--n-hidden', type=int, default=16, help='number of hidden units') parser.add_argument('--n-layers', type=int, default=3, help='number of hidden layers') parser.add_argument('--weight-decay', type=float, default=5e-4, help='Weight for L2 loss') parser.add_argument('--dropout', type=float, default=0.2, help='dropout probability, for gat only features') parser.add_argument('--attn-drop', type=float, default=0.6, help='attention dropout for gat/gem') parser.add_argument('--num-heads', type=int, default=4, help='number of hidden attention heads for gat/gem') parser.add_argument('--num-out-heads', type=int, default=1, help='number of output attention heads for gat/gem') parser.add_argument('--residual', action="store_true", default=False, help='use residual connection for gat') parser.add_argument('--alpha', type=float, default=0.2, help='the negative slop of leaky relu') parser.add_argument('--aggregator-type', type=str, default="gcn", help="graphsage aggregator: mean/gcn/pool/lstm") parser.add_argument('--embedding-size', type=int, default=360, help="embedding size for node embedding") return parser.parse_args() def get_logger(name): logger = logging.getLogger(name) log_format = '%(asctime)s %(levelname)s %(name)s: %(message)s' logging.basicConfig(format=log_format, level=logging.INFO) logger.setLevel(logging.INFO) return loggersagemaker_graph_fraud_detection/dgl_fraud_detection/.ipynb_checkpoints/data-checkpoint.py0000644000175000001440000001744514214110624033443 0ustar sagemaker-userusersimport numpy as np import pandas as pd def get_features(id_to_node, node_features): """ :param id_to_node: dictionary mapping node names(id) to dgl node idx :param node_features: path to file containing node features :return: (np.ndarray, list) node feature matrix in order and new nodes not yet in the graph """ indices, features, new_nodes = [], [], [] max_node = max(id_to_node.values()) with open(node_features, "r") as fh: for line in fh: node_feats = line.strip().split(",") node_id = node_feats[0] feats = np.array(list(map(float, node_feats[1:]))) features.append(feats) if node_id not in id_to_node: max_node += 1 id_to_node[node_id] = max_node new_nodes.append(max_node) indices.append(id_to_node[node_id]) features = np.array(features).astype('float32') features = features[np.argsort(indices), :] return features, new_nodes def get_labels(id_to_node, n_nodes, target_node_type, labels_path, masked_nodes_path_valid, masked_nodes_path_test, additional_mask_rate=0): """ :param id_to_node: dictionary mapping node names(id) to dgl node idx :param n_nodes: number of user nodes in the graph :param target_node_type: column name for target node type :param labels_path: filepath containing labelled nodes :param masked_nodes_path: filepath containing list of nodes to be masked :param additional_mask_rate: additional_mask_rate: float for additional masking of nodes with labels during training :return: (list, list) train and test mask array """ node_to_id = {v: k for k, v in id_to_node.items()} user_to_label = 
pd.read_csv(labels_path).astype({target_node_type:str}).set_index(target_node_type) labels = user_to_label.loc[pd.Series(node_to_id)[np.arange(n_nodes)].values].values.flatten() masked_nodes_valid = read_masked_nodes(masked_nodes_path_valid) masked_nodes_test = read_masked_nodes(masked_nodes_path_test) train_mask, valid_mask, test_mask = _get_mask(id_to_node, node_to_id, n_nodes, masked_nodes_valid, masked_nodes_test, additional_mask_rate=additional_mask_rate) return labels, train_mask, valid_mask, test_mask def read_masked_nodes(masked_nodes_path): """ Returns a list of nodes extracted from the path passed in :param masked_nodes_path: filepath containing list of nodes to be masked i.e test users :return: list """ with open(masked_nodes_path, "r") as fh: masked_nodes = [line.strip() for line in fh] return masked_nodes def _get_mask(id_to_node, node_to_id, num_nodes, masked_nodes_valid, masked_nodes_test, additional_mask_rate): """ Returns the train and test mask arrays :param id_to_node: dictionary mapping node names(id) to dgl node idx :param node_to_id: dictionary mapping dgl node idx to node names(id) :param num_nodes: number of user/account nodes in the graph :param masked_nodes: list of nodes to be masked during training, nodes without labels :param additional_mask_rate: float for additional masking of nodes with labels during training :return: (list, list) train and test mask array """ train_mask = np.ones(num_nodes) valid_mask = np.zeros(num_nodes) test_mask = np.zeros(num_nodes) for node_id in masked_nodes_valid: train_mask[id_to_node[node_id]] = 0 valid_mask[id_to_node[node_id]] = 1 for node_id in masked_nodes_test: train_mask[id_to_node[node_id]] = 0 test_mask[id_to_node[node_id]] = 1 if additional_mask_rate and additional_mask_rate < 1: unmasked = np.array([idx for idx in range(num_nodes) if node_to_id[idx] not in masked_nodes]) yet_unmasked = np.random.permutation(unmasked)[:int(additional_mask_rate*num_nodes)] train_mask[yet_unmasked] = 0 return train_mask, valid_mask, test_mask def _get_node_idx(id_to_node, node_type, node_id, ptr): if node_type in id_to_node: if node_id in id_to_node[node_type]: node_idx = id_to_node[node_type][node_id] else: id_to_node[node_type][node_id] = ptr node_idx = ptr ptr += 1 else: id_to_node[node_type] = {} id_to_node[node_type][node_id] = ptr node_idx = ptr ptr += 1 return node_idx, id_to_node, ptr def parse_edgelist(edges, id_to_node, header=False, source_type='user', sink_type='user'): """ Parse an edgelist path file and return the edges as a list of tuple :param edges: path to comma separated file containing bipartite edges with header for edgetype :param id_to_node: dictionary containing mapping for node names(id) to dgl node indices :param header: boolean whether or not the file has a header row :param source_type: type of the source node in the edge. defaults to 'user' if no header :param sink_type: type of the sink node in the edge. defaults to 'user' if no header. :return: (list, dict) a list containing edges of a single relationship type as tuples and updated id_to_node dict. 
""" edge_list = [] source_pointer, sink_pointer = 0, 0 with open(edges, "r") as fh: for i, line in enumerate(fh): source, sink = line.strip().split(",") if i == 0: if header: source_type, sink_type = source, sink if source_type in id_to_node: source_pointer = max(id_to_node[source_type].values()) + 1 if sink_type in id_to_node: sink_pointer = max(id_to_node[sink_type].values()) + 1 continue source_node, id_to_node, source_pointer = _get_node_idx(id_to_node, source_type, source, source_pointer) if source_type == sink_type: sink_node, id_to_node, source_pointer = _get_node_idx(id_to_node, sink_type, sink, source_pointer) else: sink_node, id_to_node, sink_pointer = _get_node_idx(id_to_node, sink_type, sink, sink_pointer) edge_list.append((source_node, sink_node)) return edge_list, id_to_node, source_type, sink_type def read_edges(edges, nodes=None): """ Read edges and node features :param edges: path to comma separated file containing all edges :param nodes: path to comma separated file containing all nodes + features :return: (list, list, list, dict) sources, sinks, features and id_to_node dictionary containing mappings from node names(id) to dgl node indices """ node_pointer = 0 id_to_node = {} features = [] sources, sinks = [], [] if nodes is not None: with open(nodes, "r") as fh: for line in fh: node_feats = line.strip().split(",") node_id = node_feats[0] if node_id not in id_to_node: id_to_node[node_id] = node_pointer node_pointer += 1 if len(node_feats) > 1: feats = np.array(list(map(float, node_feats[1:]))) features.append(feats) with open(edges, "r") as fh: for line in fh: source, sink = line.strip().split(",") sources.append(id_to_node[source]) sinks.append(id_to_node[sink]) else: with open(edges, "r") as fh: for line in fh: source, sink = line.strip().split(",") if source not in id_to_node: id_to_node[source] = node_pointer node_pointer += 1 if sink not in id_to_node: id_to_node[sink] = node_pointer node_pointer += 1 sources.append(id_to_node[source]) sinks.append(id_to_node[sink]) return sources, sinks, features, id_to_node sagemaker_graph_fraud_detection/dgl_fraud_detection/.ipynb_checkpoints/graph-checkpoint.py0000644000175000001440000000560714214110624033630 0ustar sagemaker-userusersimport os import re import dgl import numpy as np from data import * def get_edgelists(edgelist_expression, directory): if "," in edgelist_expression: return edgelist_expression.split(",") files = os.listdir(directory) compiled_expression = re.compile(edgelist_expression) return [filename for filename in files if compiled_expression.match(filename)] def construct_graph(training_dir, edges, nodes, target_node_type, heterogeneous=True): if heterogeneous: print("Getting relation graphs from the following edge lists : {} ".format(edges)) edgelists, id_to_node = {}, {} for i, edge in enumerate(edges): edgelist, id_to_node, src, dst = parse_edgelist(os.path.join(training_dir, edge), id_to_node, header=True) if src == target_node_type: src = 'target' if dst == target_node_type: dst = 'target' edgelists[(src, 'relation{}'.format(i), dst)] = edgelist print("Read edges for relation{} from edgelist: {}".format(i, os.path.join(training_dir, edge))) # reverse edge list so that relation is undirected edgelists[(dst, 'reverse_relation{}'.format(i), src)] = [(b, a) for a, b in edgelist] # get features for target nodes features, new_nodes = get_features(id_to_node[target_node_type], os.path.join(training_dir, nodes)) print("Read in features for target nodes") # handle target nodes that have features but don't have 
any connections # if new_nodes: # edgelists[('target', 'relation'.format(i+1), 'none')] = [(node, 0) for node in new_nodes] # edgelists[('none', 'reverse_relation{}'.format(i + 1), 'target')] = [(0, node) for node in new_nodes] # add self relation edgelists[('target', 'self_relation', 'target')] = [(t, t) for t in id_to_node[target_node_type].values()] g = dgl.heterograph(edgelists) print( "Constructed heterograph with the following metagraph structure: Node types {}, Edge types{}".format( g.ntypes, g.canonical_etypes)) print("Number of nodes of type target : {}".format(g.number_of_nodes('target'))) g.nodes['target'].data['features'] = features id_to_node = id_to_node[target_node_type] else: sources, sinks, features, id_to_node = read_edges(os.path.join(training_dir, edges[0]), os.path.join(training_dir, nodes)) # add self relation all_nodes = sorted(id_to_node.values()) sources.extend(all_nodes) sinks.extend(all_nodes) g = dgl.graph((sources, sinks)) if features: g.ndata['features'] = np.array(features).astype('float32') print('read graph from node list and edge list') features = g.ndata['features'] return g, features, id_to_node
sagemaker_graph_fraud_detection/dgl_fraud_detection/model/0000755000175000001440000000000014215134214025331 5ustar sagemaker-useruserssagemaker_graph_fraud_detection/dgl_fraud_detection/model/pytorch.py0000644000175000001440000001537014214110624027377 0ustar sagemaker-userusersimport torch import torch.nn as nn import torch.nn.functional as F from dgl.nn.pytorch import GraphConv, GATConv, SAGEConv import dgl.function as fn class HeteroRGCNLayer(nn.Module): def __init__(self, in_size, out_size, etypes): super(HeteroRGCNLayer, self).__init__() # W_r for each relation self.weight = nn.ModuleDict({ name: nn.Linear(in_size, out_size) for name in etypes }) def forward(self, G, feat_dict): # The input is a dictionary of node features for each type funcs = {} for srctype, etype, dsttype in G.canonical_etypes: # Compute W_r * h if srctype in feat_dict: Wh = self.weight[etype](feat_dict[srctype]) # Save it in graph for message passing G.nodes[srctype].data['Wh_%s' % etype] = Wh # Specify per-relation message passing functions: (message_func, reduce_func). # Note that the results are saved to the same destination feature 'h', which # hints the type wise reducer for aggregation. funcs[etype] = (fn.copy_u('Wh_%s' % etype, 'm'), fn.mean('m', 'h')) # Trigger message passing of multiple types. # The first argument is the message passing functions for each relation.
# The second one is the type wise reducer, could be "sum", "max", # "min", "mean", "stack" G.multi_update_all(funcs, 'sum') # return the updated node feature dictionary return {ntype: G.dstnodes[ntype].data['h'] for ntype in G.ntypes if 'h' in G.dstnodes[ntype].data} class HeteroRGCN(nn.Module): def __init__(self, g, in_size, hidden_size, out_size, n_layers, embedding_size): super(HeteroRGCN, self).__init__() # Use trainable node embeddings as featureless inputs. embed_dict = {ntype: nn.Parameter(torch.Tensor(g.number_of_nodes(ntype), in_size)) for ntype in g.ntypes if ntype != 'user'} for key, embed in embed_dict.items(): nn.init.xavier_uniform_(embed) self.embed = nn.ParameterDict(embed_dict) # create layers self.layers = nn.Sequential() self.layers.add_module(HeteroRGCNLayer(embedding_size, hidden_size, g.etypes)) # hidden layers for i in range(n_layers - 1): self.layers.add_module = HeteroRGCNLayer(hidden_size, hidden_size, g.etypes) # output layer self.layers.add(nn.Dense(hidden_size, out_size)) def forward(self, g, features): # get embeddings for all node types. for user node type, use passed in user features h_dict = self.embed h_dict['user'] = features # pass through all layers for i, layer in enumerate(self.layers[:-1]): if i != 0: h_dict = {k: F.leaky_relu(h) for k, h in h_dict.items()} h_dict = layer(g[i], h_dict) # get user logits # return h_dict['user'] return self.layers[-1](h_dict['user']) class NodeEmbeddingGNN(nn.Module): def __init__(self, gnn, input_size, embedding_size): super(NodeEmbeddingGNN, self).__init__() self.embed = nn.Embedding(input_size, embedding_size) self.gnn = gnn def forward(self, g, nodes): features = self.embed(nodes) h = self.gnn(g, features) return h class GCN(nn.Module): def __init__(self, g, in_feats, n_hidden, n_classes, n_layers, activation, dropout): super(GCN, self).__init__() self.g = g self.layers = nn.Sequential() # input layer self.layers.add_module(GraphConv(in_feats, n_hidden, activation=activation)) # hidden layers for i in range(n_layers - 1): self.layers.add(GraphConv(n_hidden, n_hidden, activation=activation)) # output layer # self.layers.add(GraphConv(n_hidden, n_classes)) self.layers.add(nn.Linear(n_hidden, n_classes)) self.dropout = nn.Dropout(p=dropout) def forward(self, g, features): h = features for i, layer in enumerate(self.layers[:-1]): if i != 0: h = self.dropout(h) h = layer(g, h) return self.layers[-1](h) class GraphSAGE(nn.Module): def __init__(self, g, in_feats, n_hidden, n_classes, n_layers, activation, dropout, aggregator_type): super(GraphSAGE, self).__init__() self.g = g with self.name_scope(): self.layers = nn.Sequential() # input layer self.layers.add_module(SAGEConv(in_feats, n_hidden, aggregator_type, feat_drop=dropout, activation=activation)) # hidden layers for i in range(n_layers - 1): self.layers.add_module(SAGEConv(n_hidden, n_hidden, aggregator_type, feat_drop=dropout, activation=activation)) # output layer self.layers.add_module(nn.Linear(n_hidden, n_classes)) def forward(self, g, features): h = features for layer in self.layers[:-1]: h = layer(g, h) return self.layers[-1](h) class GAT(nn.Module): def __init__(self, g, in_dim, num_hidden, num_classes, num_layers, heads, activation, feat_drop, attn_drop, alpha, residual): super(GAT, self).__init__() self.g = g self.num_layers = num_layers self.gat_layers = nn.ModuleList() self.activation = activation # input projection (no residual) self.gat_layers.append(GATConv( in_dim, num_hidden, heads[0], feat_drop, attn_drop, alpha, False)) # hidden layers for l in 
range(1, num_layers): # due to multi-head, the in_dim = num_hidden * num_heads self.gat_layers.append(GATConv( num_hidden * heads[l-1], num_hidden, heads[l], feat_drop, attn_drop, alpha, residual)) # output projection self.gat_layers.append(GATConv( num_hidden * heads[-2], num_classes, heads[-1], feat_drop, attn_drop, alpha, residual)) def forward(self, g, inputs): h = inputs for l in range(self.num_layers): # keep the node dimension and merge the per-head outputs h = self.gat_layers[l](g, h).flatten(1) h = self.activation(h) # output projection logits = self.gat_layers[-1](g, h).mean(1) return logits
sagemaker_graph_fraud_detection/dgl_fraud_detection/model/mxnet.py0000644000175000001440000001666614214110624027047 0ustar sagemaker-userusersfrom mxnet import gluon, nd from dgl.nn.mxnet import GraphConv, GATConv, SAGEConv import dgl.function as fn class HeteroRGCNLayer(gluon.Block): def __init__(self, in_size, out_size, etypes): super(HeteroRGCNLayer, self).__init__() # W_r for each relation with self.name_scope(): self.weight = {name: gluon.nn.Dense(out_size, use_bias=False) for name in etypes} for child in
self.weight.values(): self.register_child(child) def forward(self, G, feat_dict): # The input is a dictionary of node features for each type funcs = {} for srctype, etype, dsttype in G.canonical_etypes: # Compute W_r * h if srctype in feat_dict: Wh = self.weight[etype](feat_dict[srctype]) # Save it in graph for message passing G.srcnodes[srctype].data['Wh_%s' % etype] = Wh # Specify per-relation message passing functions: (message_func, reduce_func). # Note that the results are saved to the same destination feature 'h', which # hints the type wise reducer for aggregation. funcs[etype] = (fn.copy_u('Wh_%s' % etype, 'm'), fn.mean('m', 'h')) # Trigger message passing of multiple types. # The first argument is the message passing functions for each relation. # The second one is the type wise reducer, could be "sum", "max", # "min", "mean", "stack" G.multi_update_all(funcs, 'sum') # return the updated node feature dictionary return {ntype: G.dstnodes[ntype].data['h'] for ntype in G.ntypes if 'h' in G.dstnodes[ntype].data} class HeteroRGCN(gluon.Block): def __init__(self, g, in_size, hidden_size, out_size, n_layers, embedding_size, ctx): super(HeteroRGCN, self).__init__() self.g = g self.ctx = ctx # Use trainable node embeddings as featureless inputs for all non target node types. with self.name_scope(): self.embed_dict = {ntype: gluon.nn.Embedding(g.number_of_nodes(ntype), embedding_size) for ntype in g.ntypes if ntype != 'target'} for child in self.embed_dict.values(): self.register_child(child) # create layers # input layer self.layers = gluon.nn.Sequential() self.layers.add(HeteroRGCNLayer(embedding_size, hidden_size, g.etypes)) # hidden layers for i in range(n_layers - 1): self.layers.add(HeteroRGCNLayer(hidden_size, hidden_size, g.etypes)) # output layer # self.layers.add(HeteroRGCNLayer(hidden_size, out_size, g.etypes)) self.layers.add(gluon.nn.Dense(out_size)) def forward(self, g, features): # get embeddings for all node types. 
for target node type, use passed in target features h_dict = {'target': features} for ntype in self.embed_dict: if g[0].number_of_nodes(ntype) > 0: h_dict[ntype] = self.embed_dict[ntype](nd.array(g[0].nodes(ntype), self.ctx)) # pass through all layers for i, layer in enumerate(self.layers[:-1]): if i != 0: h_dict = {k: nd.LeakyReLU(h) for k, h in h_dict.items()} h_dict = layer(g[i], h_dict) # get target logits # return h_dict['target'] return self.layers[-1](h_dict['target']) class NodeEmbeddingGNN(gluon.Block): def __init__(self, gnn, input_size, embedding_size): super(NodeEmbeddingGNN, self).__init__() with self.name_scope(): self.embed = gluon.nn.Embedding(input_size, embedding_size) self.gnn = gnn def forward(self, g, nodes): features = self.embed(nodes) h = self.gnn(g, features) return h class GCN(gluon.Block): def __init__(self, g, in_feats, n_hidden, n_classes, n_layers, activation, dropout): super(GCN, self).__init__() self.g = g self.layers = gluon.nn.Sequential() # input layer self.layers.add(GraphConv(in_feats, n_hidden, activation=activation)) # hidden layers for i in range(n_layers - 1): self.layers.add(GraphConv(n_hidden, n_hidden, activation=activation)) # output layer # self.layers.add(GraphConv(n_hidden, n_classes)) self.layers.add(gluon.nn.Dense(n_classes)) self.dropout = gluon.nn.Dropout(rate=dropout) def forward(self, g, features): h = features for i, layer in enumerate(self.layers[:-1]): if i != 0: h = self.dropout(h) h = layer(g[i], h) return self.layers[-1](h) class GraphSAGE(gluon.Block): def __init__(self, g, in_feats, n_hidden, n_classes, n_layers, activation, dropout, aggregator_type): super(GraphSAGE, self).__init__() self.g = g with self.name_scope(): self.layers = gluon.nn.Sequential() # input layer self.layers.add(SAGEConv(in_feats, n_hidden, aggregator_type, feat_drop=dropout, activation=activation)) # hidden layers for i in range(n_layers - 1): self.layers.add(SAGEConv(n_hidden, n_hidden, aggregator_type, feat_drop=dropout, activation=activation)) # output layer self.layers.add(gluon.nn.Dense(n_classes)) def forward(self, g, features): h = features for i, layer in enumerate(self.layers[:-1]): h_dst = h[:g[i].number_of_dst_nodes()] h = layer(g[i], (h, h_dst)) return self.layers[-1](h) class GAT(gluon.Block): def __init__(self, g, in_dim, num_hidden, num_classes, num_layers, heads, activation, feat_drop, attn_drop, alpha, residual): super(GAT, self).__init__() self.g = g self.num_layers = num_layers self.gat_layers = [] self.activation = activation # input projection (no residual) self.gat_layers.append(GATConv( (in_dim, in_dim), num_hidden, heads[0], feat_drop, attn_drop, alpha, False)) # hidden layers for l in range(1, num_layers): # due to multi-head, the in_dim = num_hidden * num_heads self.gat_layers.append(GATConv( (num_hidden * heads[l-1], num_hidden * heads[l-1]), num_hidden, heads[l], feat_drop, attn_drop, alpha, residual)) # output projection self.output_proj = gluon.nn.Dense(num_classes) for i, layer in enumerate(self.gat_layers): self.register_child(layer, "gat_layer_{}".format(i)) self.register_child(self.output_proj, "dense_layer") def forward(self, g, inputs): h = inputs for l in range(self.num_layers): h_dst = h[:g[l].number_of_dst_nodes()] h = self.gat_layers[l](g[l], (h, h_dst)).flatten() h = self.activation(h) # output projection logits = self.output_proj(h) return logits sagemaker_graph_fraud_detection/dgl_fraud_detection/model/__init__.py0000644000175000001440000000000014214110624027426 0ustar 
sagemaker-useruserssagemaker_graph_fraud_detection/dgl_fraud_detection/graph.py0000644000175000001440000000560714214110624025712 0ustar sagemaker-userusersimport os import re import dgl import numpy as np from data import * def get_edgelists(edgelist_expression, directory): if "," in edgelist_expression: return edgelist_expression.split(",") files = os.listdir(directory) compiled_expression = re.compile(edgelist_expression) return [filename for filename in files if compiled_expression.match(filename)] def construct_graph(training_dir, edges, nodes, target_node_type, heterogeneous=True): if heterogeneous: print("Getting relation graphs from the following edge lists : {} ".format(edges)) edgelists, id_to_node = {}, {} for i, edge in enumerate(edges): edgelist, id_to_node, src, dst = parse_edgelist(os.path.join(training_dir, edge), id_to_node, header=True) if src == target_node_type: src = 'target' if dst == target_node_type: dst = 'target' edgelists[(src, 'relation{}'.format(i), dst)] = edgelist print("Read edges for relation{} from edgelist: {}".format(i, os.path.join(training_dir, edge))) # reverse edge list so that relation is undirected edgelists[(dst, 'reverse_relation{}'.format(i), src)] = [(b, a) for a, b in edgelist] # get features for target nodes features, new_nodes = get_features(id_to_node[target_node_type], os.path.join(training_dir, nodes)) print("Read in features for target nodes") # handle target nodes that have features but don't have any connections # if new_nodes: # edgelists[('target', 'relation'.format(i+1), 'none')] = [(node, 0) for node in new_nodes] # edgelists[('none', 'reverse_relation{}'.format(i + 1), 'target')] = [(0, node) for node in new_nodes] # add self relation edgelists[('target', 'self_relation', 'target')] = [(t, t) for t in id_to_node[target_node_type].values()] g = dgl.heterograph(edgelists) print( "Constructed heterograph with the following metagraph structure: Node types {}, Edge types{}".format( g.ntypes, g.canonical_etypes)) print("Number of nodes of type target : {}".format(g.number_of_nodes('target'))) g.nodes['target'].data['features'] = features id_to_node = id_to_node[target_node_type] else: sources, sinks, features, id_to_node = read_edges(os.path.join(training_dir, edges[0]), os.path.join(training_dir, nodes)) # add self relation all_nodes = sorted(id_to_node.values()) sources.extend(all_nodes) sinks.extend(all_nodes) g = dgl.graph((sources, sinks)) if features: g.ndata['features'] = np.array(features).astype('float32') print('read graph from node list and edge list') features = g.ndata['features'] return g, features, id_to_node sagemaker_graph_fraud_detection/dgl_fraud_detection/estimator_fns.py0000644000175000001440000000716314215134727027500 0ustar sagemaker-userusersimport os import argparse import logging def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('--training-dir', type=str, default=os.environ['SM_CHANNEL_TRAIN']) parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) parser.add_argument('--output-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR']) parser.add_argument('--nodes', type=str, default='features.csv') parser.add_argument('--target-ntype', type=str, default='NPI') parser.add_argument('--edges', type=str, default='relation*') parser.add_argument('--heterogeneous', type=lambda x: (str(x).lower() in ['true', '1', 'yes']), default=True, help='use hetero graph') parser.add_argument('--no-features', type=lambda x: (str(x).lower() in ['true', '1', 'yes']), default=False, 
help='do not use node features') parser.add_argument('--mini-batch', type=lambda x: (str(x).lower() in ['true', '1', 'yes']), default=True, help='use mini-batch training and sample graph') parser.add_argument('--labels', type=str, default='tags.csv') parser.add_argument('--validation-data', type=str, default='validation.csv') parser.add_argument('--new-accounts', type=str, default='test.csv') parser.add_argument('--predictions', type=str, default='preds.csv', help='file to save predictions on new-accounts') parser.add_argument('--compute-metrics', type=lambda x: (str(x).lower() in ['true', '1', 'yes']), default=True, help='compute evaluation metrics after training') parser.add_argument('--threshold', type=float, default=0, help='threshold for making predictions, default: argmax') parser.add_argument('--model', type=str, default='rgcn', help='gnn to use. options: rgcn, gcn, graphsage, gat, gem') parser.add_argument('--num-gpus', type=int, default=1) parser.add_argument('--batch-size', type=int, default=500) parser.add_argument('--optimizer', type=str, default='adam') parser.add_argument('--lr', type=float, default=1e-2) parser.add_argument('--n-epochs', type=int, default=20) parser.add_argument('--n-neighbors', type=int, default=10, help='number of neighbors to sample') parser.add_argument('--n-hidden', type=int, default=16, help='number of hidden units') parser.add_argument('--n-layers', type=int, default=3, help='number of hidden layers') parser.add_argument('--weight-decay', type=float, default=5e-4, help='Weight for L2 loss') parser.add_argument('--dropout', type=float, default=0.2, help='dropout probability; for gat, applied to features only') parser.add_argument('--attn-drop', type=float, default=0.6, help='attention dropout for gat/gem') parser.add_argument('--num-heads', type=int, default=4, help='number of hidden attention heads for gat/gem') parser.add_argument('--num-out-heads', type=int, default=1, help='number of output attention heads for gat/gem') parser.add_argument('--residual', action="store_true", default=False, help='use residual connection for gat') parser.add_argument('--alpha', type=float, default=0.2, help='the negative slope of leaky relu') parser.add_argument('--aggregator-type', type=str, default="gcn", help="graphsage aggregator: mean/gcn/pool/lstm") parser.add_argument('--embedding-size', type=int, default=360, help="embedding size for node embedding") return parser.parse_args()
def get_logger(name): logger = logging.getLogger(name) log_format = '%(asctime)s %(levelname)s %(name)s: %(message)s' logging.basicConfig(format=log_format, level=logging.INFO) logger.setLevel(logging.INFO) return logger
sagemaker_graph_fraud_detection/dgl_fraud_detection/sampler.py0000644000175000001440000000527514214110624026255 0ustar sagemaker-userusersimport dgl class HeteroGraphNeighborSampler: """Neighbor sampler on heterogeneous graphs Parameters ---------- g : DGLHeteroGraph Full graph category : str Category name of the seed nodes. nhops : int Number of hops to sample/number of layers in the node flow. fanout : int Fanout of each hop starting from the seed nodes. If a fanout is None, sample full neighbors.
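Examples
--------
A minimal usage sketch (illustrative values; assumes ``g`` is a heterograph with a
'target' node type as built by ``construct_graph`` and ``seeds`` holds int64
'target' node IDs):

>>> sampler = HeteroGraphNeighborSampler(g, 'target', nhops=2, fanout=10)
>>> blocks, input_nids = sampler.sample_block(seeds)
>>> # blocks[0] is the input-layer block; input_nids are the 'target' nodes whose
>>> # features must be fed into the first layer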
""" def __init__(self, g, category, nhops, fanout=None): self.g = g self.category = category self.fanouts = [fanout] * nhops def sample_block(self, seeds): blocks = [] seeds = {self.category: seeds} cur = seeds for fanout in self.fanouts: if fanout is None: frontier = dgl.in_subgraph(self.g, cur) else: frontier = dgl.sampling.sample_neighbors(self.g, cur, fanout) block = dgl.to_block(frontier, cur) cur = {} for ntype in block.srctypes: cur[ntype] = block.srcnodes[ntype].data[dgl.NID] blocks.insert(0, block) return blocks, cur[self.category] class NeighborSampler: """Neighbor sampler on homogenous graphs Parameters ---------- g : DGLGraph Full graph nhops : int Number of hops to sample/number of layers in the node flow. fanout : int Fanout of each hop starting from the seed nodes. If a fanout is None, sample full neighbors. """ def __init__(self, g, nhops, fanout=None): self.g = g self.fanouts = [fanout] * nhops self.nhops = nhops def sample_block(self, seeds): blocks = [] for fanout in self.fanouts: # For each seed node, sample ``fanout`` neighbors. if fanout is None: frontier = dgl.in_subgraph(self.g, seeds) else: frontier = dgl.sampling.sample_neighbors(self.g, seeds, fanout, replace=False) # Then we compact the frontier into a bipartite graph for message passing. block = dgl.to_block(frontier, seeds) # Obtain the seed nodes for next layer. seeds = block.srcdata[dgl.NID] blocks.insert(0, block) return blocks, blocks[0].srcdata[dgl.NID] class FullGraphSampler: """Does nothing and just returns the full graph Parameters ---------- g : DGLGraph Full graph nhops : int Number of hops to sample/number of layers in the node flow. """ def __init__(self, g, nhops): self.g = g self.nhops = nhops def sample_block(self, seeds): return [self.g] * self.nhops, seeds sagemaker_graph_fraud_detection/dgl_fraud_detection/__init__.py0000644000175000001440000000000014214110624026326 0ustar sagemaker-useruserssagemaker_graph_fraud_detection/__pycache__/0000755000175000001440000000000014214113577022464 5ustar sagemaker-useruserssagemaker_graph_fraud_detection/__pycache__/__init__.cpython-37.pyc0000644000175000001440000000025714214113577026656 0ustar sagemaker-userusersB ”‘0bã@sdS)N©rrrúX/root/S3Downloads/jumpstart-prod-fdfn_s5r606/sagemaker_graph_fraud_detection/__init__.pyÚósagemaker_graph_fraud_detection/__pycache__/config.cpython-37.pyc0000644000175000001440000000271014214113577026360 0ustar sagemaker-userusersB ”‘0bã @s0ddlZddlZddlZddlZddlmZdd„Zej ¡j Z e  d¡  ¡  d¡Zej ej ¡¡ ¡Ze ¡ZiZeeƒƒZeedƒ ¡Zej e¡r¸eeƒZe e¡ZWdQRXe  de¡Ze  d e ¡Z e  d ¡Ze  d ¡Ze  d d ¡Ze  de¡Z e  dd¡Z!e  dd¡Z"e  dd¡Z#e  de¡Z$dS)éN)ÚPathcCs0d|kr t|dƒ}|j ¡}n tt ¡ƒ}|S)NÚ__file__)rÚparentÚresolveÚosÚgetcwd)Zglobal_variablesÚ current_fileÚcurrent_folder©r úV/root/S3Downloads/jumpstart-prod-fdfn_s5r606/sagemaker_graph_fraud_detection/config.pyÚget_current_folders    r ÚstsÚAccountz../stack_outputs.jsonZ AccountIDZ AWSRegionZ SolutionNameZSolutionUpstreamS3BucketZSolutionPrefixzsagemaker-soln-graph-fraudZSolutionS3BucketZS3InputDataPrefixzraw-dataZS3ProcessingJobOutputPrefixzprocessed-dataZS3TrainingJobOutputPrefixztraining-outputZIamRole)%ÚjsonrÚboto3Ú sagemakerÚpathlibrr ÚsessionÚSessionÚ region_nameÚregionÚclientÚget_caller_identityÚgetÚ account_idÚdefault_bucketÚget_execution_roleZ default_roleZcfn_stack_outputsÚglobalsr rZcfn_stack_outputs_filepathÚpathÚexistsÚopenÚfÚloadZ aws_accountZ solution_nameZsolution_upstream_bucketZsolution_prefixZsolution_bucketZs3_data_prefixZs3_processing_outputZs3_train_outputÚroler r r r Ús2              
sagemaker_graph_fraud_detection/requirements.in0000644000175000001440000000005314214110624023313 0ustar sagemaker-useruserssagemaker==1.72.0 awscli watchtower==1.0.0 sagemaker_graph_fraud_detection/config.py0000644000175000001440000000322014214110624022056 0ustar sagemaker-userusersimport json import os import boto3 import sagemaker from pathlib import Path def get_current_folder(global_variables): # if calling from a file if "__file__" in global_variables: current_file = Path(global_variables["__file__"]) current_folder = current_file.parent.resolve() # if calling from a notebook else: current_folder = Path(os.getcwd()) return current_folder region = boto3.session.Session().region_name account_id = boto3.client('sts').get_caller_identity().get('Account') default_bucket = sagemaker.session.Session(boto3.session.Session()).default_bucket() default_role = sagemaker.get_execution_role() cfn_stack_outputs = {} current_folder = get_current_folder(globals()) cfn_stack_outputs_filepath = Path(current_folder, '../stack_outputs.json').resolve() if os.path.exists(cfn_stack_outputs_filepath): with open(cfn_stack_outputs_filepath) as f: cfn_stack_outputs = json.load(f) aws_account = cfn_stack_outputs.get('AccountID', account_id) region_name = cfn_stack_outputs.get('AWSRegion', region) solution_name = cfn_stack_outputs.get('SolutionName') solution_upstream_bucket = cfn_stack_outputs.get('SolutionUpstreamS3Bucket') solution_prefix = cfn_stack_outputs.get('SolutionPrefix', 'sagemaker-soln-graph-fraud') solution_bucket = cfn_stack_outputs.get('SolutionS3Bucket', default_bucket) s3_data_prefix = cfn_stack_outputs.get('S3InputDataPrefix', 'raw-data') s3_processing_output = cfn_stack_outputs.get('S3ProcessingJobOutputPrefix', 'processed-data') s3_train_output = cfn_stack_outputs.get('S3TrainingJobOutputPrefix', 'training-output') role = cfn_stack_outputs.get('IamRole', default_role)sagemaker_graph_fraud_detection/__init__.py0000644000175000001440000000000014214110624022341 0ustar sagemaker-userusers