Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

SPDX-License-Identifier: Apache-2.0

# This notebook uses the pre-processed Reddit user-behavior data to train ELAND

## Table of contents

1. Loading data
2. Setting up the model trainer
3. Model training

In [None]:
import os
import pickle
import sys
import json
import math
import logging
import pickle as pk
from collections import Counter
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix, coo_matrix
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import MSELoss, CosineEmbeddingLoss
from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score, roc_auc_score, f1_score

### 1. Loading data 

#### User labels

In [None]:
# Read the per-user label table from the intermediate data directory.
user_label_path = "../../data/02_intermediate/user_behavior/user_labels.csv"
user_label = pd.read_csv(user_label_path)

In [None]:
# Preview the first 10 rows of the label table.
user_label.head(10)

#### User and subreddit topic index

In [None]:
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# index files that come from a trusted source.
u2index_path = "../../data/02_intermediate/user_behavior/u2index.pkl"
with open(u2index_path, "rb") as index_file:
    u2index = pickle.load(index_file)

In [None]:
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# index files that come from a trusted source.
p2index_path = "../../data/02_intermediate/user_behavior/p2index.pkl"
with open(p2index_path, "rb") as index_file:
    p2index = pickle.load(index_file)

#### Edge list data 

In [None]:
# Read the edge list table from the intermediate data directory.
edgelist_path = "../../data/02_intermediate/user_behavior/edge_list.csv"
edgelist_df = pd.read_csv(edgelist_path)

In [None]:
# Preview the first 10 rows of the edge list.
edgelist_df.head(10)

In [None]:
from scipy.sparse import csr_matrix, coo_matrix
def process_edgelist(edge_list, u2index, p2index):
    """Build a sparse user-by-subreddit interaction-count matrix from an edge list.

    Args:
        edge_list: DataFrame with one interaction per row; only the ``author``
            and ``subreddit`` columns are read.
        u2index: Mapping from author name to row index.
        p2index: Mapping from subreddit name to column index.

    Returns:
        scipy.sparse.csr_matrix of shape ``(len(u2index), len(p2index))`` whose
        ``(i, j)`` entry is the number of rows in ``edge_list`` that link user
        ``i`` to subreddit ``j``.

    Raises:
        KeyError: If an author or subreddit is missing from its index mapping,
            or if a required column is absent from ``edge_list``.
    """
    # Walk the two needed columns directly: DataFrame.iterrows materialises a
    # Series per row, which is very slow on large edge lists.
    edge_counts = Counter(
        (u2index[author], p2index[subreddit])
        for author, subreddit in zip(edge_list['author'], edge_list['subreddit'])
    )

    # Unpack the aggregated (user, subreddit) -> count pairs into COO triplets.
    row_idx = [user_idx for user_idx, _ in edge_counts]
    col_idx = [item_idx for _, item_idx in edge_counts]
    weights = list(edge_counts.values())

    return csr_matrix(
        (weights, (row_idx, col_idx)),
        shape=(len(u2index), len(p2index)),
    )

In [None]:
# Build the user-by-subreddit interaction matrix from the edge list.
graph = process_edgelist(edge_list=edgelist_df, u2index=u2index, p2index=p2index)

In [None]:
# Show the container type returned by process_edgelist.
type(graph)

#### Train/validation/test id split

In [None]:
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# split files that come from a trusted source.
tvt_path = "../../data/02_intermediate/user_behavior/data_tvt.pkl"
with open(tvt_path, "rb") as split_file:
    tvt_idx = pickle.load(split_file)

In [None]:
# Unpack the split; this raises ValueError unless tvt_idx holds exactly three items.
# Presumably ordered train / validation / test -- confirm against the preprocessing step.
idx_train, idx_val, idx_test = tvt_idx

In [None]:
# Show the size of each split.
idx_train.shape, idx_val.shape, idx_test.shape

#### Convert label format (to numpy array)

In [None]:
def process_label(labels: pd.DataFrame, u2index=None) -> np.ndarray:
    """Convert the user-label table into a 0/1 array addressed by user index.

    Args:
        labels: DataFrame with an ``author`` column and a ``label`` column.
            A label equal to 1 marks a positive user and 0 a negative user;
            any other value counts as unlabeled.
        u2index: Mapping from author name to array position. When omitted,
            the module-level ``u2index`` mapping is used.

    Returns:
        Integer array of length ``len(u2index)`` holding 1 at the position of
        every positive user present in ``u2index`` and 0 everywhere else.
    """
    if u2index is None:
        # Fall back to the notebook-level mapping so existing one-argument
        # calls keep working.
        u2index = globals()['u2index']

    authors = labels['author']
    is_pos = labels['label'] == 1
    is_neg = labels['label'] == 0
    pos_uids = set(authors[is_pos])
    labeled_uids = set(authors[is_pos | is_neg])
    print(f'loaded labels, total of {len(pos_uids)} positive users and {len(labeled_uids)} labeled users')

    # Users missing from the label table (or labeled 0) keep the default 0.
    label_array = np.zeros(len(u2index), dtype=int)
    for user, user_idx in u2index.items():
        if user in pos_uids:
            label_array[user_idx] = 1
    return label_array

In [None]:
# Turn the label table into a 0/1 array addressed by user index.
labels = process_label(labels=user_label)

In [None]:
# Report the class balance of each split; the positive count is computed once
# per split and reused for the negative count.
for split_name, split_idx in (('Train', idx_train), ('Val', idx_val), ('Test', idx_test)):
    n_pos = np.sum(labels[split_idx])
    print('{}: total of {:5} users with {:5} pos users and {:5} neg users'.format(
        split_name,
        len(split_idx),
        n_pos,
        len(split_idx) - n_pos,
    ))

In [None]:
# Open the user feature archive and report the shape of its 'data' array.
user_features_path = "../../data/02_intermediate/user_behavior/user2vec_npy.npz"
user_features = np.load(user_features_path)
print(user_features['data'].shape)

In [None]:
# Open the item feature archive and report the shape of its 'data' array.
item_features_path = "../../data/02_intermediate/user_behavior/prod2vec_npy.npz"
item_features = np.load(item_features_path)
print(item_features['data'].shape)

### 2. Setting up the model trainer 

In [None]:
# Add ../../src/ to the import path so the project package imported below can be found.
# The path is relative, so it resolves against the kernel's current working directory.
sys.path.append('../../src/')

In [None]:
from anomaly_detection_spatial_temporal_data.model.data_loader import DynamicGraphWNFDataSet, DynamicGraphWNodeFeatDatasetLoader
from anomaly_detection_spatial_temporal_data.model.dynamic_graph import Eland_e2e
from anomaly_detection_spatial_temporal_data.model.model_config import ElandConfig

#### Set up dataloader

In [None]:
# Hand the label table, index maps, edge list, split indices and both feature
# arrays to the project loader; its graph / features / labels attributes are
# read in the next cell.
data_loader = DynamicGraphWNodeFeatDatasetLoader(
 user_label, 
 u2index, 
 p2index, 
 edgelist_df, 
 tvt_idx, 
 user_features['data'], 
 item_features['data']
)

# Sequential data loader: batches of 300 samples from the dataset.
# No shuffle argument is given, so DataLoader keeps the dataset order.
dataset = DynamicGraphWNFDataSet(p2index, item_features['data'], edgelist_df)
lstm_dataloader = DataLoader(dataset, batch_size=300)
 

In [None]:
# Collect every model input under one dictionary; the training cell reads
# these keys when constructing Eland_e2e.
data_dict = {
 'graph': data_loader.graph, 
 'lstm_dataloader': lstm_dataloader,
 'user_features': data_loader.user_features,
 'item_features': data_loader.item_features,
 'labels': data_loader.labels,
 'tvt_nids': data_loader.tvt_idx,
 'u2index': data_loader.u2index,
 'p2index': data_loader.p2index
 }


#### Load model config

In [None]:
import yaml

In [None]:
# Location of the ELAND parameter file, relative to the notebook's working directory.
model_config_file = '../../conf/base/parameters/eland.yml'

In [None]:
# Parse the YAML model configuration; `mode_config` is consumed by later cells.
with open(model_config_file, "r") as stream:
    try:
        mode_config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Show the parse error, then stop: carrying on would leave
        # `mode_config` undefined (or stale from an earlier run) and the
        # failure would resurface later as an unrelated error.
        print(exc)
        raise
print(mode_config)

In [None]:
# Create a logs/ directory (relative to the notebook's working directory) for
# this training session; nothing happens if it already exists.
from pathlib import Path
log_dir = Path('logs/')
log_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Build the ELAND config object from the 'eland_model_options' section of the parsed YAML.
eland_config = ElandConfig(mode_config['eland_model_options'])

#### Adjust model directory for notebook 

In [None]:
# Show the save directory that came from the config file, before it is overridden below.
eland_config.save_directory

In [None]:
# Notebook-specific overrides: the relative save path resolves against the
# notebook's working directory, and training is shortened.
eland_config.save_directory = '../../data/07_model_output/user_behavior/'
eland_config.epochs = 10 # reduce to 10 epochs in notebooks for demonstration 

In [None]:
# Create the model output directory if needed. exist_ok=True avoids the
# check-then-create race, and still raises if the path exists but is a file.
os.makedirs(eland_config.save_directory, exist_ok=True)

### 3. Model training

In [None]:
# Build the end-to-end ELAND model from the prepared inputs and train it.
# NOTE(review): 'item_features' is passed twice (4th and 9th positional
# arguments). This may be intentional if the 9th parameter is an
# index-to-feature lookup -- confirm against the Eland_e2e signature.
# NOTE(review): no random seed is set in this notebook; results may differ
# between runs unless the model or config seeds internally -- confirm.
model_obj = Eland_e2e(
 data_dict['graph'], 
 data_dict['lstm_dataloader'], 
 data_dict['user_features'],
 data_dict['item_features'], 
 data_dict['labels'], 
 data_dict['tvt_nids'], 
 data_dict['u2index'],
 data_dict['p2index'], 
 data_dict['item_features'], 
 eland_config
)
# train() returns a pair: the training result and the saved model path.
training_result,save_model_path = model_obj.train()

In [None]:
# Display the result object returned by train().
training_result

# References

Jason Baumgartner, Savvas Zannettou, Brian Keegan, Megan Squire, and Jeremy Blackburn. 2020. The Pushshift Reddit Dataset.

Tong Zhao, Bo Ni, Wenhao Yu, Zhichun Guo, Neil Shah, and Meng Jiang. 2021. Action Sequence Augmentation for Early Graph-based Anomaly Detection.