Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

SPDX-License-Identifier: Apache-2.0

# This notebook walks through intermediate results for data processing on Reddit user-behavior data

## Table of contents
1. Download Reddit comments dataset from PushShift.io for May 2008
2. Set rules for anomalous vs benign users along with data processing
3. Generate author/user labels and save to a csv file
4. Generate user and subreddit index files
5. Save edgelist data as csv file
6. Train/validation/test split
7. Get node features using NLP models

In [None]:
import json
import pandas as pd
import os 
import sys

In [None]:
sys.path.append('../../src/')

### 1. Download Reddit dataset and save it in a dataframe 

In [None]:
reddit_raw_data_file_path = '../../data/01_raw/user_behavior/RC_2008-05.zst'

In [None]:
# Read the file that sits next to the .zst archive under the same name without
# the ".zst" suffix (presumably the decompressed dump — confirm it exists).
# Slice the suffix off explicitly: str.rstrip(".zst") strips any trailing run of
# the characters '.', 'z', 's', 't' and can eat into the base file name.
reddit_decompressed_file_path = reddit_raw_data_file_path
if reddit_decompressed_file_path.endswith(".zst"):
    reddit_decompressed_file_path = reddit_decompressed_file_path[:-len(".zst")]

# Each line of the file is parsed as one JSON record; the context manager
# closes the file handle once all records have been consumed.
with open(reddit_decompressed_file_path, encoding="utf8") as raw_file:
    records = map(json.loads, raw_file)
    df = pd.DataFrame.from_records(records)

In [None]:
df.head(10)

In [None]:
df.info()

### Observation about the data:
1. There are 536380 rows and 20 columns; each row is a unique comment with 20 attributes (columns) describing it
2. The most important attributes are author, subreddit, body and score. Body is the comment text, and score is the net vote count received on Reddit (+1 for each upvote and -1 for each downvote). Each record represents one author posting a comment (body) in a subreddit. 
3. Each unique author can have multiple comments across more than one subreddit with varying scores for each comment


### 2. Data processing

#### Data processing steps to get input for ELAND model. Steps include:
1. Drop records whose absolute score is less than 10
2. Drop users who have posted fewer than 10 times
3. Drop users that are [deleted]

#### We don't have ground truth labels for training the model. To generate the user labels that are needed for the next step, we use a rule that groups users into either benign or anomalous users based on the statistics of their post scores. 
 - Anomalous user: An author who has commented at least 10 times and whose every score is less than or equal to -10
 - Benign user: An author who has commented at least 10 times and whose every score is greater than or equal to 10

In [None]:
# Keep only the records whose absolute score is at least 10
# (i.e. discard every row where |score| < 10).
df_score = df.loc[~(df['score'].abs() < 10)]

In [None]:
df.shape, df_score.shape #a lot of comments with less than score of 10

In [None]:
#check lowest score and highest score
df_score.score.min(), df_score.score.max()

In [None]:
df_score['author'].value_counts()

In [None]:
df_score['subreddit'].value_counts()

In [None]:
# Count the remaining comments per author and discard authors with fewer than 10
counts = df_score['author'].value_counts()
res = df_score.loc[~df_score['author'].isin(counts.index[counts < 10])]

# Discard the '[deleted]' placeholder author
res = res.loc[~(res['author'] == '[deleted]')]

In [None]:
res['author'].value_counts()

In [None]:
#Number of unique users
len(res.author.unique())

## Create user labels

In [None]:
benign = pd.DataFrame()
anomaly = pd.DataFrame()

In [None]:
# Start the benign pool from an independent copy of the filtered records.
# DataFrame.append was removed in pandas 2.0, so copy `res` directly instead of
# appending it to an empty frame.
benign = res.copy()
print(benign.shape)

In [None]:
# Keep only the records that do not score below 10
benign = benign.loc[~(benign['score'] < 10)]

In [None]:
#check one example of benign author
benign.loc[benign['author'] == 'jonknee'].T

In [None]:
## Anomalous authors
# Start the anomaly pool from an independent copy of the filtered records.
# DataFrame.append was removed in pandas 2.0, so copy `res` directly instead of
# appending it to an empty frame.
anomaly = res.copy()

# Remove records with a score larger than -10
anomaly = anomaly.drop(anomaly[anomaly.score > -10].index)

In [None]:
#Example author
anomaly.loc[anomaly['author'] == 'I_AM_A_NEOCON']

In [None]:
#Same author can have high score comments and low score comments at the same time 
benign.loc[benign['author'] == 'I_AM_A_NEOCON']

In [None]:
# Distinct author names present in each pool (in order of first appearance)
anomaly_author_names = anomaly['author'].unique()
benign_author_names = benign['author'].unique()

In [None]:
def common_member(a, b):
    """Return the set of elements that appear in both ``a`` and ``b``.

    Args:
        a: Iterable of hashable items.
        b: Iterable of hashable items.

    Returns:
        set: The intersection of ``a`` and ``b``. When nothing is shared,
        "No common elements" is printed and an empty set is returned, so the
        result can always be passed to ``len()`` or ``Series.isin``.
    """
    shared = set(a) & set(b)
    if not shared:
        print("No common elements")
    return shared

In [None]:
# Find the authors that appear in both the benign and the anomalous pool.
# common_member can yield a falsy result (None or an empty set) when nothing
# overlaps; normalise that to an empty set so len() and isin() below stay valid.
overlap_authors = common_member(benign_author_names, anomaly_author_names) or set()
len(overlap_authors)

In [None]:
# Overlapping authors are removed from the benign pool only; their low-score
# records remain in `anomaly`.
# NOTE(review): this means an author labelled anomalous may also have high-score
# comments — confirm this matches the intended labelling rule.
benign = benign.loc[~benign['author'].isin(overlap_authors)]
benign_author_names = benign['author'].unique()
print("Number of benign users: ", len(benign_author_names))
print("Number of anomalous users: ", len(anomaly['author'].unique()))

### 3. Generate author/user labels and save to a csv file

In [None]:
# Label convention: 0 = benign user, 1 = anomalous user
benign_user_label = pd.DataFrame({'author': benign_author_names, 'label': 0})
anomalous_user_label = pd.DataFrame({'author': anomaly_author_names, 'label': 1})

In [None]:
benign_user_label.shape, anomalous_user_label.shape

In [None]:
benign_user_label.head(2)

In [None]:
anomalous_user_label.head(2)

In [None]:
user_label = pd.concat([benign_user_label, anomalous_user_label])

In [None]:
# Save user label
user_label_filepath = '../../data/02_intermediate/user_behavior/user_labels.csv'

In [None]:
from anomaly_detection_spatial_temporal_data.utils import ensure_directory

In [None]:
# ensure_directory is a project helper; presumably it creates the parent
# directory of the given file path if it is missing — TODO confirm.
ensure_directory(user_label_filepath)
# Write the author/label table without the DataFrame index column
user_label.to_csv(user_label_filepath, index=False)

### 4. Generate user and subreddit index files

#### Each subreddit topic is given an index and saved as a pickle file. We will be naming the file p2index.pkl
#### Each author is also given an index and saved as a pickle file. We will be naming the file u2index.pkl

In [None]:
# Distinct subreddit names seen in each pool, as plain Python lists
benign_prod_names = benign['subreddit'].unique().tolist()
anomaly_prod_names = anomaly['subreddit'].unique().tolist()

In [None]:
# Union of the subreddit names from both pools, de-duplicated and sorted
total_prod_names = sorted(set(benign_prod_names) | set(anomaly_prod_names))

In [None]:
# Map every subreddit name to a consecutive integer index (0, 1, 2, ...),
# following the order of total_prod_names. enumerate replaces the hand-maintained
# counter so no stray `count` variable is left in the notebook namespace.
p2index = {subreddit: index for index, subreddit in enumerate(total_prod_names)}

In [None]:
# Union of the author names from both pools, de-duplicated and sorted
total_author_names = sorted(
    set(benign_author_names.tolist()) | set(anomaly_author_names.tolist())
)

In [None]:
# Map every author name to a consecutive integer index (0, 1, 2, ...),
# following the order of total_author_names. enumerate replaces the hand-maintained
# counter so no stray `count` variable is left in the notebook namespace.
u2index = {author: index for index, author in enumerate(total_author_names)}

### Save the index mapping for author/user and subreddit topic 

In [None]:
import pickle

# Persist the author -> index mapping
with open("../../data/02_intermediate/user_behavior/u2index.pkl", "wb") as u2index_file:
    pickle.dump(u2index, u2index_file)

In [None]:
# Persist the subreddit -> index mapping
with open("../../data/02_intermediate/user_behavior/p2index.pkl", "wb") as p2index_file:
    pickle.dump(p2index, p2index_file)

### 5. Save edge list as csv file

In [None]:
benign.shape, anomaly.shape

In [None]:
# Stack the benign and anomalous records into one edge table with a fresh index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
edgelist_df = pd.concat([benign, anomaly], ignore_index=True)
# Order the edges by the `retrieved_on` column.
# NOTE(review): `retrieved_on` is presumably the crawl time rather than the
# comment creation time — confirm it is the intended edge timestamp.
edgelist_df = edgelist_df.sort_values(by='retrieved_on')
print(edgelist_df.shape)

In [None]:
edgelist_df[['author','subreddit','retrieved_on']].head(10)

In [None]:
# Write the (author, subreddit, retrieved_on) triples without the index column
edge_list_file_path = "../../data/02_intermediate/user_behavior/edge_list.csv"
edgelist_df.loc[:, ['author', 'subreddit', 'retrieved_on']].to_csv(
    edge_list_file_path,
    index=False,
)

### 6. Train/validation/test split 

In [None]:
import random

def generate_n_lists(num_of_lists, num_of_elements, value_from=0, value_to=100, seed=None):
    """Draw ``num_of_lists`` disjoint lists of unique random integers.

    A single sample without replacement is taken from the inclusive range
    ``[value_from, value_to]`` and cut into consecutive chunks, so no value
    appears twice within or across the returned lists.

    Args:
        num_of_lists: Number of lists to return.
        num_of_elements: Length of every returned list.
        value_from: Smallest value that may be drawn (inclusive).
        value_to: Largest value that may be drawn (inclusive).
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the result is reproducible and the global random state is
            untouched. When ``None``, the module-level generator is used.

    Returns:
        list[list[int]]: ``num_of_lists`` lists with ``num_of_elements`` items each.

    Raises:
        ValueError: Raised by ``sample`` when the range holds fewer than
            ``num_of_lists * num_of_elements`` values.
    """
    rng = random if seed is None else random.Random(seed)
    s = rng.sample(range(value_from, value_to + 1), num_of_lists * num_of_elements)
    return [s[i * num_of_elements:(i + 1) * num_of_elements] for i in range(num_of_lists)]


# Fixed seed so that Restart & Run All reproduces the same split.
# NOTE(review): 393 and 786 are hard-coded; presumably they are tied to the
# number of users in u2index — confirm they still match after upstream changes.
l = generate_n_lists(2, 393, 0, 786, seed=42)

In [None]:
len(l), len(l[0]), len(l[1])

In [None]:
import numpy as np

In [None]:
import numpy as np

# Three index arrays: the first 195 ids of list 0, the remaining ids of list 0,
# and all ids of list 1 (presumably train / validation / test, in that order).
data_tvt = (
    np.array(l[0][:195]),
    np.array(l[0][195:]),
    np.array(l[1]),
)
print(type(data_tvt))
print(*(len(part) for part in data_tvt))

In [None]:
# Persist the tuple of split index arrays
with open("../../data/02_intermediate/user_behavior/data_tvt.pkl", "wb") as split_file:
    pickle.dump(data_tvt, split_file)

### 7. Get node features using NLP models

- To get node feature for user/author, we preprocess comments from each author, get their Top 10 used words and feed these words into word2vec model to get embeddings as author node features.
- To get node feature for subreddit topic, we get the Top 10 used words for each topic and feed these words into word2vec model to get embeddings as subreddit topic node features. 


#### Steps for comments/posts body processing are:
1. Convert words to lower
2. Remove numbers
3. Remove punctuation and symbols
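4. Tokenize the text and remove English stopwords (note: lemmatizing and stemming the words is listed as a normalization step, but the code below does not apply it)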

In [None]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import re
import collections
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import gensim.downloader

### Download the pretrained models

In [None]:
vectors = gensim.downloader.load('word2vec-google-news-300')

In [None]:
# English stopword list from NLTK, held as a set for membership tests
stopwords = set(nltk.corpus.stopwords.words('english'))
# NOTE(review): `stemmer` is not referenced by the feature-extraction cells in
# this notebook — confirm whether stemming was intended.
stemmer= PorterStemmer()

### Get the user node features (user2vec) 

In [None]:
type(vectors['hi']),vectors['hi'].shape

In [None]:
# User node features: for every author, sum the word2vec embeddings of the
# (up to) 10 most frequent non-stopword tokens across all of their comments.
# Row u2index[author] of the matrix holds that 300-dimensional sum.
final_user2vec_npy = np.zeros((len(u2index), 300))

for u in u2index:
    user = edgelist_df.loc[edgelist_df['author'] == u]
    comment_row_list = []
    for body in user['body']:
        # Flatten whitespace, lower-case, then strip digits and punctuation
        text = body.replace('\n', " ").replace('\t', " ").lower()
        text = ''.join(ch for ch in text if not ch.isdigit())
        text = re.sub(r'[^\w\s]', ' ', text)
        tokens = word_tokenize(text)
        comment_row_list.append([tok for tok in tokens if tok not in stopwords])

    flat_list = [x for xs in comment_row_list for x in xs]
    counter = collections.Counter(flat_list)
    top10 = counter.most_common(10)

    # One row per top word; rows stay all-zero for unused slots and for words
    # that have no embedding.
    final_vectors = np.zeros((10, 300))
    for i, (word, _freq) in enumerate(top10):
        try:
            final_vectors[i, :] = vectors[word]
        except KeyError:
            # Only a missing vocabulary entry is tolerated; any other error
            # should surface instead of being silently swallowed.
            continue
    final_embeddings = np.sum(final_vectors, axis=0)

    final_user2vec_npy[u2index[u], :] = final_embeddings

In [None]:
final_user2vec_npy.shape

In [None]:
# Save the user2vec feature matrix (rows indexed via u2index) to an .npz
# archive, stored under the key "data"
userfeat_file = "../../data/02_intermediate/user_behavior/user2vec_npy.npz"
np.savez(userfeat_file,data=final_user2vec_npy)

#### Get the subreddit topic node features (prod2vec)

In [None]:
# Subreddit node features: for every subreddit, sum the word2vec embeddings of
# the (up to) 10 most frequent non-stopword tokens across all of its comments.
# Row p2index[subreddit] of the matrix holds that 300-dimensional sum.
final_prod2vec_npy = np.zeros((len(p2index), 300))

for p in p2index:
    subreddit = edgelist_df.loc[edgelist_df['subreddit'] == p]
    subreddit_row_list = []
    for body in subreddit['body']:
        # Flatten whitespace, lower-case, then strip digits and punctuation
        text = body.replace('\n', " ").replace('\t', " ").lower()
        text = ''.join(ch for ch in text if not ch.isdigit())
        text = re.sub(r'[^\w\s]', ' ', text)
        tokens = word_tokenize(text)
        subreddit_row_list.append([tok for tok in tokens if tok not in stopwords])

    flat_list = [x for xs in subreddit_row_list for x in xs]
    counter = collections.Counter(flat_list)
    top10 = counter.most_common(10)

    # One row per top word; rows stay all-zero for unused slots and for words
    # that have no embedding.
    final_vectors = np.zeros((10, 300))
    for i, (word, _freq) in enumerate(top10):
        try:
            final_vectors[i, :] = vectors[word]
        except KeyError:
            # Only a missing vocabulary entry is tolerated; any other error
            # should surface instead of being silently swallowed.
            continue
    final_embeddings = np.sum(final_vectors, axis=0)
    final_prod2vec_npy[p2index[p], :] = final_embeddings

In [None]:
type(final_prod2vec_npy),final_prod2vec_npy.shape

In [None]:
# Save the prod2vec feature matrix (rows indexed via p2index) to an .npz
# archive, stored under the key "data"
prodfeat_file = "../../data/02_intermediate/user_behavior/prod2vec_npy.npz"
np.savez(prodfeat_file,data=final_prod2vec_npy)

# References

Jason Baumgartner, Savvas Zannettou, Brian Keegan, Megan Squire, and Jeremy Blackburn. 2020. The Pushshift Reddit Dataset.