{ "cells": [ { "cell_type": "markdown", "id": "2d42c11b", "metadata": {}, "source": [ "Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.\n", "\n", "SPDX-License-Identifier: Apache-2.0" ] }, { "cell_type": "markdown", "id": "5a929deb", "metadata": {}, "source": [ "# This notebook uses the pre-processed Reddit user-behavior data to train ELAND" ] }, { "cell_type": "markdown", "id": "daa8a342", "metadata": {}, "source": [ "## Table of contents\n", "\n", "1. Loading data\n", "2. Setting up the model trainer\n", "3. Model training" ] }, { "cell_type": "code", "execution_count": null, "id": "a7131aa6", "metadata": {}, "outputs": [], "source": [ "import os\n", "import pickle\n", "import sys\n", "import json\n", "import math\n", "import logging\n", "import pickle as pk\n", "from collections import Counter\n", "import numpy as np\n", "import pandas as pd\n", "import scipy.sparse as sp\n", "from scipy.sparse import csr_matrix, coo_matrix\n", "import torch\n", "from torch.utils.data import DataLoader\n", "import torch.nn.functional as F\n", "import torch.nn as nn\n", "from torch.nn import MSELoss, CosineEmbeddingLoss\n", "from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score, roc_auc_score, f1_score" ] }, { "cell_type": "markdown", "id": "ee3f6aed", "metadata": {}, "source": [ "### 1. Loading data " ] }, { "cell_type": "markdown", "id": "38c9bb21", "metadata": {}, "source": [ "### User labels" ] }, { "cell_type": "code", "execution_count": null, "id": "d5e74952", "metadata": {}, "outputs": [], "source": [ "user_label = pd.read_csv(\"../../data/02_intermediate/user_behavior/user_labels.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "1f660f6b", "metadata": {}, "outputs": [], "source": [ "user_label.head(10)" ] }, { "cell_type": "markdown", "id": "e3b10885", "metadata": {}, "source": [ "#### User and subreddit topic index" ] }, { "cell_type": "code", "execution_count": null, "id": "dbbde3ca", "metadata": {}, "outputs": [], "source": [ "with open(\"../../data/02_intermediate/user_behavior/u2index.pkl\",\"rb\") as f:\n", " u2index = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": null, "id": "bf235b85", "metadata": {}, "outputs": [], "source": [ "with open(\"../../data/02_intermediate/user_behavior/p2index.pkl\",\"rb\") as f:\n", " p2index = pickle.load(f)" ] }, { "cell_type": "markdown", "id": "7928fc01", "metadata": {}, "source": [ "#### Edge list data " ] }, { "cell_type": "code", "execution_count": null, "id": "f1499c61", "metadata": {}, "outputs": [], "source": [ "edgelist_df = pd.read_csv(\"../../data/02_intermediate/user_behavior/edge_list.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "bf8dd932", "metadata": {}, "outputs": [], "source": [ "edgelist_df.head(10)" ] }, { "cell_type": "code", "execution_count": null, "id": "0116e354", "metadata": {}, "outputs": [], "source": [ "from scipy.sparse import csr_matrix, coo_matrix\n", "def process_edgelist(edge_list, u2index, p2index):\n", " \"\"\" Load edge list and construct a graph \"\"\"\n", " edges = Counter()\n", "\n", " for i, row in edge_list.iterrows():\n", " #u = row[0]\n", " #p = row[1]\n", " #t = row[2]\n", " u = row['author']\n", " p = row['subreddit']\n", " t = row['retrieved_on']\n", "\n", " if i<1:\n", " print(u, p, t)\n", " edges[(u2index[u], p2index[p])] += 1\n", " # Construct the graph\n", " row = []\n", " col = []\n", " entry = []\n", " for edge, w in edges.items():\n", " #print(w)\n", " i, j = edge\n", " row.append(i)\n", " col.append(j)\n", " entry.append(w)\n", " graph = csr_matrix(\n", " (entry, (row, col)), \n", " shape=(len(u2index), len(p2index))\n", " ) \n", " return graph" ] }, { "cell_type": "code", "execution_count": null, "id": "e2985bc1", "metadata": {}, "outputs": [], "source": [ "graph = process_edgelist(edgelist_df, u2index, p2index)" ] }, { "cell_type": "code", "execution_count": null, "id": "acd1d921", "metadata": {}, "outputs": [], "source": [ "type(graph)" ] }, { "cell_type": "markdown", "id": "4ebd2875", "metadata": {}, "source": [ "#### Train/validation/test id split" ] }, { "cell_type": "code", "execution_count": null, "id": "33892fb8", "metadata": {}, "outputs": [], "source": [ "with open(\"../../data/02_intermediate/user_behavior/data_tvt.pkl\",\"rb\") as f:\n", " tvt_idx = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": null, "id": "0f5fcd8a", "metadata": {}, "outputs": [], "source": [ "idx_train, idx_val, idx_test = tvt_idx" ] }, { "cell_type": "code", "execution_count": null, "id": "d2caa407", "metadata": {}, "outputs": [], "source": [ "idx_train.shape, idx_val.shape, idx_test.shape" ] }, { "cell_type": "markdown", "id": "0b27ecd2", "metadata": {}, "source": [ "#### Convert label format (to numpy array)" ] }, { "cell_type": "code", "execution_count": null, "id": "86feaf14", "metadata": {}, "outputs": [], "source": [ "def process_label(labels: pd.DataFrame) -> np.array:\n", " \"\"\"process label information\"\"\"\n", " u_all = set()\n", " pos_uids = set()\n", " labeled_uids = set()\n", " #convert a dataframe to an numpy array, array index being mapped indexes from u2index\n", " for i,row in labels.iterrows():\n", " author = row['author']\n", " author_label = row['label']\n", " u_all.add(author)\n", " if author_label == 1:\n", " pos_uids.add(author)\n", " labeled_uids.add(author)\n", " elif author_label == 0:\n", " labeled_uids.add(author)\n", " print(f'loaded labels, total of {len(pos_uids)} positive users and {len(labeled_uids)} labeled users')\n", " labels = np.zeros(len(u2index))\n", " for u in u2index:\n", " if u in pos_uids:\n", " labels[u2index[u]] = 1\n", " labels = labels.astype(int)\n", " return labels" ] }, { "cell_type": "code", "execution_count": null, "id": "05478c18", "metadata": {}, "outputs": [], "source": [ "labels = process_label(user_label)" ] }, { "cell_type": "code", "execution_count": null, "id": "d943caef", "metadata": {}, "outputs": [], "source": [ "print('Train: total of {:5} users with {:5} pos users and {:5} neg users'.format(\n", " len(idx_train), \n", " np.sum(labels[idx_train]), \n", " len(idx_train)-np.sum(labels[idx_train]))\n", " )\n", "print('Val: total of {:5} users with {:5} pos users and {:5} neg users'.format(\n", " len(idx_val), \n", " np.sum(labels[idx_val]), \n", " len(idx_val)-np.sum(labels[idx_val]))\n", " )\n", "print('Test: total of {:5} users with {:5} pos users and {:5} neg users'.format(\n", " len(idx_test), \n", " np.sum(labels[idx_test]), \n", " len(idx_test)-np.sum(labels[idx_test]))\n", " )" ] }, { "cell_type": "code", "execution_count": null, "id": "ae918219", "metadata": {}, "outputs": [], "source": [ "user_features = np.load(\"../../data/02_intermediate/user_behavior/user2vec_npy.npz\")\n", "print(user_features['data'].shape)" ] }, { "cell_type": "code", "execution_count": null, "id": "fac2bb98", "metadata": {}, "outputs": [], "source": [ "item_features = np.load(\"../../data/02_intermediate/user_behavior/prod2vec_npy.npz\")\n", "print(item_features['data'].shape)" ] }, { "cell_type": "markdown", "id": "1b679afa", "metadata": {}, "source": [ "### 2. Setting up the model trainer " ] }, { "cell_type": "code", "execution_count": null, "id": "7ce38050", "metadata": {}, "outputs": [], "source": [ "sys.path.append('../../src/')" ] }, { "cell_type": "code", "execution_count": null, "id": "96e49d86", "metadata": {}, "outputs": [], "source": [ "from anomaly_detection_spatial_temporal_data.model.data_loader import DynamicGraphWNFDataSet, DynamicGraphWNodeFeatDatasetLoader\n", "from anomaly_detection_spatial_temporal_data.model.dynamic_graph import Eland_e2e\n", "from anomaly_detection_spatial_temporal_data.model.model_config import ElandConfig" ] }, { "cell_type": "markdown", "id": "7a21fc51", "metadata": {}, "source": [ "#### Set up dataloader" ] }, { "cell_type": "code", "execution_count": null, "id": "19d9d427", "metadata": {}, "outputs": [], "source": [ "data_loader = DynamicGraphWNodeFeatDatasetLoader(\n", " user_label, \n", " u2index, \n", " p2index, \n", " edgelist_df, \n", " tvt_idx, \n", " user_features['data'], \n", " item_features['data']\n", ")\n", "\n", "#sequential data loader\n", "dataset = DynamicGraphWNFDataSet(p2index, item_features['data'], edgelist_df)\n", "lstm_dataloader = DataLoader(dataset, batch_size=300)\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "c02fc151", "metadata": {}, "outputs": [], "source": [ "data_dict = {\n", " 'graph': data_loader.graph, \n", " 'lstm_dataloader': lstm_dataloader,\n", " 'user_features': data_loader.user_features,\n", " 'item_features': data_loader.item_features,\n", " 'labels': data_loader.labels,\n", " 'tvt_nids': data_loader.tvt_idx,\n", " 'u2index': data_loader.u2index,\n", " 'p2index': data_loader.p2index\n", " }\n" ] }, { "cell_type": "markdown", "id": "27ba11e4", "metadata": {}, "source": [ "#### Load model config" ] }, { "cell_type": "code", "execution_count": null, "id": "997190e7", "metadata": {}, "outputs": [], "source": [ "import yaml" ] }, { "cell_type": "code", "execution_count": null, "id": "e07de1c6", "metadata": {}, "outputs": [], "source": [ "model_config_file = '../../conf/base/parameters/eland.yml'" ] }, { "cell_type": "code", "execution_count": null, "id": "8ea83b80", "metadata": {}, "outputs": [], "source": [ "with open(model_config_file, \"r\") as stream:\n", " try:\n", " mode_config=yaml.safe_load(stream)\n", " print(mode_config)\n", " except yaml.YAMLError as exc:\n", " print(exc)" ] }, { "cell_type": "code", "execution_count": null, "id": "4a3d7f80", "metadata": {}, "outputs": [], "source": [ "# Open a log directory for notebook training session \n", "from pathlib import Path\n", "log_dir = Path('logs/')\n", "log_dir.mkdir(parents=True, exist_ok=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "05cd516c", "metadata": {}, "outputs": [], "source": [ "eland_config = ElandConfig(mode_config['eland_model_options'])" ] }, { "cell_type": "markdown", "id": "a0cddc13", "metadata": {}, "source": [ "#### Adjust model directory for notebook " ] }, { "cell_type": "code", "execution_count": null, "id": "238073ae", "metadata": {}, "outputs": [], "source": [ "eland_config.save_directory" ] }, { "cell_type": "code", "execution_count": null, "id": "f6ceda50", "metadata": {}, "outputs": [], "source": [ "eland_config.save_directory = '../../data/07_model_output/user_behavior/'\n", "eland_config.epochs = 10 # reduce to 10 epochs in notebooks for demonstration " ] }, { "cell_type": "code", "execution_count": null, "id": "aa47c923", "metadata": {}, "outputs": [], "source": [ "if not os.path.exists(eland_config.save_directory):\n", " os.makedirs(eland_config.save_directory)" ] }, { "cell_type": "markdown", "id": "282b393a", "metadata": {}, "source": [ "### 3. Model training" ] }, { "cell_type": "code", "execution_count": null, "id": "4f2d192b", "metadata": {}, "outputs": [], "source": [ "model_obj = Eland_e2e(\n", " data_dict['graph'], \n", " data_dict['lstm_dataloader'], \n", " data_dict['user_features'],\n", " data_dict['item_features'], \n", " data_dict['labels'], \n", " data_dict['tvt_nids'], \n", " data_dict['u2index'],\n", " data_dict['p2index'], \n", " data_dict['item_features'], \n", " eland_config\n", ")\n", "training_result,save_model_path = model_obj.train()" ] }, { "cell_type": "code", "execution_count": null, "id": "fea2f735", "metadata": {}, "outputs": [], "source": [ "training_result" ] }, { "cell_type": "markdown", "id": "5a60670d", "metadata": {}, "source": [ "# References\n", "\n", "Jason Baumgartner, Savvas Zannettou, Brian Keegan, Megan Squire, and Jeremy Blackburn. 2020. The Pushshift Reddit Dataset.\n", "\n", "Tong Zhao, Bo Ni, Wenhao Yu, Zhichun Guo, Neil Shah, and Meng Jiang, 2021. Action Sequence Augmentation for Early Graph-based Anomaly Detection." ] } ], "metadata": { "kernelspec": { "display_name": "kedro-eland-venv", "language": "python", "name": "kedro-eland-venv" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.12" } }, "nbformat": 4, "nbformat_minor": 5 }