# Pre-processing the text for Object2Vec

Prepare the text data in the input format expected by the Object2Vec algorithm.

In [None]:
import boto3
import pandas as pd
import re
from sklearn import preprocessing
import numpy as np
import json
import os
from sklearn.feature_extraction.text import CountVectorizer
import random
random.seed(42)
from random import sample
from sklearn.utils import shuffle
from nltk import word_tokenize

#### Functions

In [None]:
def get_filtered_objects(bucket_name, prefix):
    """List the S3 objects stored under a key prefix.

    A single ``list_objects_v2`` call returns at most 1000 keys, so the
    listing is paginated and the ``Contents`` of every following page is
    merged into the first response.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Key prefix used to filter the listing.

    Returns:
        The first ``list_objects_v2`` response dict with ``Contents``
        extended by the entries of all later pages. As with a plain call,
        ``Contents`` is missing when no key matches the prefix.
    """
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    merged = None
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        if merged is None:
            # Keep the first page as the base so callers still see the usual
            # response keys (Name, Prefix, ...).
            merged = page
        elif "Contents" in page:
            merged.setdefault("Contents", []).extend(page["Contents"])

    if merged is None:
        return {}

    if merged.get("IsTruncated"):
        # The merged result is complete, so drop the continuation markers
        # that only described the first page.
        merged["IsTruncated"] = False
        merged.pop("NextContinuationToken", None)
        merged["KeyCount"] = len(merged.get("Contents", []))
    return merged

In [None]:
def download_object(bucket_name, key, local_path):
    """Download one S3 object to a local file.

    A missing object (error code "404") is reported on stdout and otherwise
    ignored; every other client error is re-raised.

    Args:
        bucket_name: Name of the S3 bucket holding the object.
        key: Key of the object to download.
        local_path: Destination path on the local filesystem.

    Raises:
        botocore.exceptions.ClientError: For any client error other than 404.
    """
    # botocore is not among the visible top-level imports; without this the
    # except clause below raises NameError as soon as a download fails.
    import botocore.exceptions

    s3 = boto3.resource('s3')
    try:
        s3.Bucket(bucket_name).download_file(key, local_path)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("The object does not exist")
        else:
            raise

In [None]:
def get_csv(files):
    """Return the keys of the listed objects that end in ".csv".

    Args:
        files: Iterable of dicts that each carry the object key under 'Key'
            (the shape of the 'Contents' entries of an S3 listing).

    Returns:
        List of keys ending with ".csv", in input order.
    """
    return [entry['Key'] for entry in files if entry['Key'].endswith(".csv")]

In [None]:
def sentence_to_tokens(sentence, vocab_to_tokens):
    """Convert a sentence into the list of token ids of its known words.

    The sentence is split with ``word_tokenize``; words missing from the
    vocabulary are dropped. A word is looked up as written first and, if
    that fails, in lowercase: a vocabulary built by ``CountVectorizer`` with
    its default settings holds lowercase terms only, so capitalised words
    would otherwise be lost.

    Args:
        sentence: Text to encode.
        vocab_to_tokens: Mapping from word to integer token id.

    Returns:
        List of token ids, in the order the words appear in the sentence.
    """
    tokens = []
    for word in word_tokenize(sentence):
        if word in vocab_to_tokens:
            tokens.append(vocab_to_tokens[word])
            continue
        lowered = word.lower()
        if lowered in vocab_to_tokens:
            tokens.append(vocab_to_tokens[lowered])
    return tokens

In [None]:
def create_dir(directory):
    """Create a directory (and missing parents) unless the path already exists.

    Args:
        directory: Path of the directory to create.
    """
    if os.path.exists(directory):
        return
    # exist_ok covers the case where the directory appears between the check
    # above and this call.
    os.makedirs(directory, exist_ok=True)

In [None]:
def remove_file(file_path):
    """Delete a local file, printing a message when it is not there.

    Args:
        file_path: Path of the file to delete. Paths that are not regular
            files (missing, or a directory) are reported and left alone.
    """
    if not os.path.isfile(file_path):
        print("Error, file not found.")
        return
    try:
        os.remove(file_path)
    except FileNotFoundError:
        # The file disappeared between the check and the removal.
        print("Error, file not found.")

In [None]:
def build_sentence_pairs(data):
    """Turn each dataframe row into a positive (text, labels) pair.

    Args:
        data: DataFrame with the columns 'encoded_content' and 'labels'.

    Returns:
        One dict per row, in row order, with 'in0' set to the row's
        'encoded_content', 'in1' set to its 'labels' and 'label' set to 1.
    """
    # Iterating the two columns directly avoids building a Series per row.
    return [
        {'in0': encoded, 'in1': labels, 'label': 1}
        for encoded, labels in zip(data['encoded_content'], data['labels'])
    ]

In [None]:
def build_negative_pairs(data, negative_labels_to_sample, sentence_pairs,
                         n_neg_pairs_per_label=10, random_state=None):
    """Append negative (text, labels) pairs to ``sentence_pairs``.

    For every label in ``negative_labels_to_sample`` the rows are split into
    those tagged with the label and those not tagged with it. Each negative
    pair combines the text of a random tagged row with the labels of a
    random untagged row and gets 'label' 0. Labels for which either group is
    empty are skipped, because there is nothing to sample from.

    Args:
        data: DataFrame with the columns 'encoded_content' and 'labels';
            each 'labels' cell is a container of labels or None.
        negative_labels_to_sample: Iterable of labels to build pairs for.
        sentence_pairs: List the new pairs are appended to (modified in place).
        n_neg_pairs_per_label: Number of pairs generated per label.
        random_state: Optional seed (int) or random state object handed to
            ``DataFrame.sample`` for reproducible sampling. None keeps
            pandas' default, unseeded behaviour.

    Returns:
        The same ``sentence_pairs`` list, including the appended pairs.
    """
    # An int seed is turned into one shared generator; passing the int itself
    # to every sample() call would pick the same row each time.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    for label in negative_labels_to_sample:
        # Rows that carry this label.
        tagged = data.loc[
            data['labels'].apply(lambda tags: tags is not None and label in tags)
        ]
        # Rows that have labels but not this one.
        untagged = data.loc[
            data['labels'].apply(lambda tags: tags is not None and label not in tags)
        ]
        if len(tagged) == 0 or len(untagged) == 0:
            continue
        for _ in range(n_neg_pairs_per_label):
            sentence_pairs.append({
                'in0': tagged.sample(1, random_state=rng)['encoded_content'].iloc[0],
                'in1': untagged.sample(1, random_state=rng)['labels'].iloc[0],
                'label': 0,
            })
    return sentence_pairs

##### Download the data locally

In [None]:
# S3 bucket used for both the input listing and the uploads; replace the placeholder.
bucket_name = "YOUR_BUCKET_HERE"
# Key prefix under which the input .csv files are searched.
prefix = "connect/"

In [None]:
# Local folder that receives the downloaded .csv files and, later, the .jsonl splits.
create_dir("./data")

In [None]:
# S3 omits 'Contents' when no key matches the prefix; .get() keeps an empty
# listing from raising KeyError.
files = get_filtered_objects(bucket_name, prefix).get('Contents', [])
files = get_csv(files)
local_files = []
print(files)
for file in files:
    # Keep the folder structure below `prefix` in the local file name, so two
    # objects with the same base name in different folders do not overwrite
    # each other (and are not read twice afterwards).
    relative_key = file[len(prefix):] if file.startswith(prefix) else file
    local_path = "./data/" + relative_key.replace("/", "_")
    download_object(bucket_name, file, local_path)
    local_files.append(local_path)

In [None]:
# Display the local paths collected by the download loop.
local_files

##### Concatenate the .csv files

In [None]:
import pandas.errors

# Read every downloaded file (';'-separated) and stack them into one frame.
content = []
for filename in local_files:
    try:
        df = pd.read_csv(filename, sep=";")
    except (pandas.errors.ParserError, pandas.errors.EmptyDataError):
        # Skip files that are malformed or completely empty.
        print("File", filename, "cannot be parsed. Check its format")
    else:
        print(df.columns)
        content.append(df)
if not content:
    # pd.concat([]) would fail with a far less helpful message.
    raise ValueError("No .csv file could be loaded; check bucket_name and prefix")
# ignore_index avoids repeated row labels coming from the individual files.
data = pd.concat(content, ignore_index=True)

In [None]:
# Keep only the rows whose ParticipantId is 'CUSTOMER'.
customer_text = data.loc[data.ParticipantId=='CUSTOMER']

In [None]:
# Display (rows, columns) of the customer-only frame.
customer_text.shape

##### Create random labels

Replace this step with your own labels.
Note: the texts are replicated here to increase the number of samples.

In [None]:
# Repeat the whole frame 300 times and renumber the index from 0.
customer_text = pd.concat([customer_text]*300, ignore_index=True)

In [None]:
# Assign every row a random integer label in [0, 5).
# NOTE(review): numpy's generator is not seeded in the visible code (only `random` is),
# so the labels differ between runs — confirm whether that is intended.
customer_text['labels']=np.random.randint(low=0, high=5, size=len(customer_text))

In [None]:
# Plot the distribution of the generated labels.
customer_text.labels.hist()

##### Get vocabulary from the corpus using sklearn for the heavy lifting

The vocabulary is built from the `Content` column of the customer texts.

In [None]:
# Fit the vocabulary on the Content column: tokens are runs of two or more ASCII letters,
# kept when they occur in at least 5 documents and in at most 95% of the documents.
counts = CountVectorizer(min_df=5, max_df=0.95, token_pattern=r'(?u)\b[A-Za-z]{2,}\b').fit(customer_text['Content'].values.tolist())

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
if hasattr(counts, "get_feature_names_out"):
    vocab = counts.get_feature_names_out().tolist()
else:
    vocab = counts.get_feature_names()
# Two lookup tables: word -> token id and token id -> word.
vocab_to_token_dict = {word: token for token, word in enumerate(vocab)}
token_to_vocab_dict = dict(enumerate(vocab))

In [None]:
# Display the vocabulary size.
len(vocab)

In [None]:
# Persist the word -> token id mapping as JSON; the file is uploaded to S3 further down.
create_dir("./vocab")
vocab_filename = './vocab/vocab.json'
with open(vocab_filename, "w") as write_file:
 json.dump(vocab_to_token_dict, write_file)

##### Encode data body

Transform the texts in the data to encodings from the vocabulary created.

In [None]:
import nltk
# Download the 'punkt' tokenizer data for word_tokenize.
# NOTE(review): recent NLTK releases may require the 'punkt_tab' resource instead —
# confirm against the installed version.
nltk.download('punkt')

In [None]:
# Encode each Content string as the list of vocabulary token ids of its words.
customer_text['encoded_content'] = customer_text['Content'].apply(lambda x: sentence_to_tokens(x, vocab_to_token_dict))

In [None]:
# Display the labels column before it is wrapped into lists.
customer_text['labels']

In [None]:
# Wrap each label in a one-element list.
# NOTE: running this cell a second time nests the lists one level deeper.
customer_text['labels']=customer_text['labels'].apply(lambda x: [x])

In [None]:
# Display the label lists next to the encoded texts.
customer_text[['labels','encoded_content']]

In [None]:
# remove entries with no text

In [None]:
# Keep only the rows whose encoded text contains at least one token.
customer_text = customer_text.loc[customer_text['encoded_content'].apply(lambda x: len(x)>0)]

In [None]:
# Display the remaining rows together with their original text.
customer_text[['labels','encoded_content', 'Content']]

##### Build sentence pairs Object2Vec

In [None]:
# Negative pairs for the algorithm: choose which labels to sample *against*.
# range(5) matches the labels 0..4 generated with high=5 above; keep the two in sync.
negative_labels_to_sample = range(5)

In [None]:
# Positive pairs: one (encoded text, labels) pair with label 1 per row.
sentence_pairs = build_sentence_pairs(customer_text)


##### Build negative sentence pairs for training Object2Vec

Negative sampling for the Object2Vec algorithm - add negative and positive pairs (document,label)

In [None]:
# Append the negative pairs (label 0) to the positive ones.
sentence_pairs = build_negative_pairs(customer_text,negative_labels_to_sample,sentence_pairs)


In [None]:
# Print the second pair as an example of the generated input.
print("Sample of input for Object2vec algorith: {}".format(sentence_pairs[1]))

In [None]:
!pip install jsonlines

##### train/test/val split, save to file


In [None]:
# Shuffle with a fixed seed, then split 70% train / 15% validation / 15% test.
random.seed(42)
random.shuffle(sentence_pairs)

n_train = int(0.7 * len(sentence_pairs))

# first 70% for training, the remainder is held out
sentence_pairs_train = sentence_pairs[:n_train]
sentence_pairs_test = sentence_pairs[n_train:]

# split the held-out part in half: validation set (sentence_pairs_val) and test set (sentence_pairs_test)
n_test = len(sentence_pairs_test)

sentence_pairs_val = sentence_pairs_test[:n_test//2]
sentence_pairs_test = sentence_pairs_test[n_test//2:]


In [None]:
import jsonlines
# Write each split as a JSON Lines file under ./data, one pair per line.
with jsonlines.open('./data/train.jsonl', mode='w') as writer:
 writer.write_all(sentence_pairs_train)
 
with jsonlines.open('./data/test.jsonl', mode='w') as writer:
 writer.write_all(sentence_pairs_test)

with jsonlines.open('./data/val.jsonl', mode='w') as writer:
 writer.write_all(sentence_pairs_val)

##### Upload to S3

In [None]:
import os
s3_client = boto3.client('s3')

out_prefix = "connect/O2VInput"
# Upload each split to <out_prefix>/<split>/<split>.jsonl with server-side encryption.
for n in ['train', 'test', 'val']:
    s3_client.upload_file(
        "./data/" + n + '.jsonl',
        bucket_name,
        # S3 keys always use "/"; os.path.join would insert "\" on Windows.
        "/".join((out_prefix, n, n + '.jsonl')),
        ExtraArgs={'ServerSideEncryption': 'AES256'},
    )  # upload input files

In [None]:
# Show the local vocabulary file, the output prefix and the S3 key of the vocabulary.
print(vocab_filename)
print(out_prefix)
# Built with "/" because S3 keys are not OS paths (os.path.join uses "\" on Windows).
print("/".join((out_prefix, "auxiliary/vocab.json")))

In [None]:
# Upload the JSON vocabulary with server-side encryption.
s3_client.upload_file(
    vocab_filename,
    bucket_name,
    # S3 keys always use "/"; os.path.join would insert "\" on Windows.
    "/".join((out_prefix, "auxiliary/vocab.json")),
    ExtraArgs={'ServerSideEncryption': 'AES256'},
)  # upload vocab file

In [None]:
import pickle

# Pickle both lookup tables. The with-blocks guarantee the files are flushed
# and closed before they are uploaded.
with open('./vocab/vocab_to_token_dict.p', 'wb') as pickle_file:
    pickle.dump(vocab_to_token_dict, pickle_file)
with open('./vocab/token_to_vocab_dict.p', 'wb') as pickle_file:
    pickle.dump(token_to_vocab_dict, pickle_file)

for f in ['vocab_to_token_dict.p', 'token_to_vocab_dict.p']:
    s3_client.upload_file(
        "./vocab/" + f,
        bucket_name,
        # S3 keys always use "/"; os.path.join would insert "\" on Windows.
        "/".join((out_prefix, 'meta', f)),
        ExtraArgs={'ServerSideEncryption': 'AES256'},
    )

In [None]:
# Delete the downloaded .csv files from local disk.
for f in local_files:
 remove_file(f)