# Sentiment Analysis with Apache MXNet and Gluon

This tutorial walks you through how to implement a sentiment analysis model to classify movie reviews as either 'Positive' or 'Negative' using Apache MXNet and the Gluon programming interface.

In [15]:
import re
import itertools
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split


import mxnet as mx
from mxnet import gluon, nd, autograd
from mxnet.gluon import nn, rnn

context = mx.gpu(0)

First, we are going to load the movie review dataset. We will be taking advantage of Stanford's Large Movie Review Dataset that is available here: http://ai.stanford.edu/~amaas/data/sentiment/. This dataset includes 25,000 movies reviews from the IMBD database with 12,500 labeled as 'Positive' reviews and the other 12,500 labeled as 'Negative' reviews.

In [16]:
def read_files(foldername):
    import os
    sentiments = []
    filenames = os.listdir(os.curdir+ "/"+foldername)
    for file in filenames:
        with open(foldername+"/"+file,"r", encoding="utf8") as pos_file:
            data=pos_file.read().replace('\n', '')
            sentiments.append(data)
    return sentiments
    
    
#Ensure that the path below leads to the location of the positive reviews 
foldername = "aclImdb/train/pos/"
postive_sentiment = read_files(foldername)

#Ensure that the path below leads to the location of the negative reviews
foldername = "aclImdb/train/neg/"
negative_sentiment = read_files(foldername)

#This labels the 'Positive' reviews as 1' and the 'Negative' reviews as 0
positive_labels = [1 for _ in postive_sentiment]
negative_labels = [0 for _ in negative_sentiment]

Next we want to clean up the text of the movie reviews so that we are only processing words. The actual words in the reviews are going to be the most predictive - not sentence breaks or commas, for example. 

In [17]:
#some string preprocessing
def clean_str(string):  
    
    #This removes any special characters from the review
    remove_special_chars = re.compile("[^A-Za-z0-9 ]+")
    
    #This removes any line breaks and replaces them with spaces
    string = string.lower().replace("<br />", " ")
    
    return re.sub(remove_special_chars, "", string.lower())

Next, we are going to process all of the words in the reviews, count the number of occurences of each word, and then index the words in descending order with respect to how many times this occur. This is a necessary input to help us encode the words in the reviews so that they can be understood by a machine.

In [18]:
#This creates a dictionary of the words and their counts in entire 
#movie review dataset {word:count}

word_counter = Counter()
def create_count(sentiments):
    for line in sentiments:
        for word in (clean_str(line)).split():
            if word not in word_counter.keys():               
                word_counter[word] = 1
            else:
                word_counter[word] += 1

#This assigns a unique a number for each word (sorted by descending order 
#based on the frequency of occurrence)and returns a word_dict

def create_word_index():
    idx = 1
    word_dict = {}
    for word in word_counter.most_common():
        word_dict[word[0]] = idx
        idx+=1
    return word_dict
    
#Here we combine all of the reviews into one dataset and create a word
#dictionary using this entire dataset

all_sentiments = postive_sentiment + negative_sentiment
all_labels = positive_labels + negative_labels
create_count(all_sentiments)
word_dict = create_word_index()

#This creates a reverse index from a number to the word 
idx2word = {v: k for k, v in word_dict.items()}

Next we create a set of helper functions that (1) encode words into a sequence of numbers, (2) decode a sequence of numbers back into words, and (3) truncate and pad the input data to ensure they are of equal length and thereby enable easier processing.  

In [19]:
#This helper function creates a encoded sentences by assigning the unique 
#id from word_dict to the words in the input text (i.e., movie reviews)
def encoded_sentences(input_file,word_dict):
    output_string = []
    for line in input_file:
        output_line = []
        for word in (clean_str(line)).split():
            if word in word_dict:
                output_line.append(word_dict[word])
        output_string.append(output_line)
    return output_string

#This helper function decodes encoded sentences
def decode_sentences(input_file,word_dict):
    output_string = []
    for line in input_file:
        output_line = ''
        for idx in line:
            output_line += idx2word[idx] + ' '
        output_string.append(output_line)
    return output_string

#This helper function pads the sequences to maxlen.
#If the sentence is greater than maxlen, it truncates the sentence.
#If the sentence is less than 500, it pads with value 0.
def pad_sequences(sentences,maxlen=500,value=0):
    """
    Pads all sentences to the same length. The length is defined by maxlen.
    Returns padded sentences.
    """
    padded_sentences = []
    for sen in sentences:
        new_sentence = []
        if(len(sen) > maxlen):
            new_sentence = sen[:maxlen]
            padded_sentences.append(new_sentence)
        else:
            num_padding = maxlen - len(sen)
            new_sentence = np.append(sen,[value] * num_padding)
            padded_sentences.append(new_sentence)
    return padded_sentences

Next we are going to encode all of the movie reviews using the word dictionary created. In addition, we are going to cap the size of the tracked vocabulary size - meaning any word that is outside of the tracked range will be encoded with the last position. This is performance versus accuracy consideration - a larger tracked vocabulary will lead to more accurary but will have performance considerations because it requires a longer training process.

In [20]:
#Encodes the positive and negative reviews into sequences of number
positive_encoded = encoded_sentences(postive_sentiment,word_dict)
negative_encoded = encoded_sentences(negative_sentiment,word_dict)

all_encoded = positive_encoded + negative_encoded

In [21]:
vocab_size = 5000 #Here we set the total num of words to be tracked

#Any word outside of the tracked range will be encoded with last position.
t_data = [np.array([i if i<(vocab_size-1) else (vocab_size-1) for i in s]) for s in all_encoded]


We will be using a word embedding matrix to represent the words that we observe in the movie reviews. Represeting the meaning of the words with these vectors is a large exercise unto itself. Instead, we will be leveraging Stanford's Global Vector for Word Representation (GloVe) embedding. We specifically used glove.42B.300d.zip available at this link:
https://nlp.stanford.edu/projects/glove/.

In [22]:
# Loads Stanford's Global Vector for Word Representation (GloVe) embedding

num_embed = 300 #This is the richness of the word attributes captured

def load_glove_index(loc):
    f = open(loc, encoding="utf8")
    embeddings_index = {}
    
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        embeddings_index[word] = coefs
    f.close()
    return embeddings_index

def create_emb():
    embedding_matrix = np.zeros((vocab_size, num_embed))
    for word, i in word_dict.items():
        if i >= vocab_size:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    embedding_matrix = nd.array(embedding_matrix)
    return embedding_matrix

embeddings_index = load_glove_index('glove.42B.300d.txt')
embedding_matrix = create_emb()

Next we prepare the movie reviews to be fed into the deep learning model by (1) Reserving 30% of the dataset as a test dataset, (2) padding and truncating the data to the length of 500 words, and (3) converting the movie reviews into MXNet's NDArray format.

In [23]:
#This separates 30% of the entire dataset into test dataset.
X_train, X_test, y_train, y_test_set = train_test_split(t_data, all_labels, test_size=0.3, random_state=42)

In [24]:
#Here are some of the statistics of sentences before padding
min_len = min(map(len, t_data))
max_len = max(map(len,t_data))
avg_len = sum(map(len,t_data)) / len(t_data)
print("the minimum length is:",min_len)
print("the maximum length is:",max_len)
print("the average length is:",avg_len)

the minimum length is: 10
the maximum length is: 2459
the average length is: 230.51952


In [25]:
seq_len = 500 #This set the max word length of each movie review

#Below we pad the reviews and convert them to MXNet's NDArray format
trn = nd.array(pad_sequences(X_train, maxlen=seq_len, value=0))
test = nd.array(pad_sequences(X_test, maxlen=seq_len, value=0))
y_trn = nd.array(y_train)
y_test = nd.array(y_test_set)

In [None]:
num_classes = 2
num_hidden = 64
learning_rate = .001
epochs = 10
batch_size = 12

In [None]:
train_arraydataset = mx.gluon.data.ArrayDataset(trn,y_trn)
train_loader = mx.gluon.data.DataLoader(train_arraydataset,batch_size=batch_size,shuffle=False,last_batch='keep')

In [None]:
test_arraydataset = mx.gluon.data.ArrayDataset(test,y_test)
test_loader = mx.gluon.data.DataLoader(test_arraydataset,batch_size=batch_size,shuffle=False,last_batch='keep')

Now we're ready to define the neural network for this model using Gluon. We will be using an LSTM model with 64 hidden units, and we will be taking advantage of the embedding layer created above.

In [26]:
model = mx.gluon.nn.Sequential()

with model.name_scope():    
    model.embed = mx.gluon.nn.Embedding(vocab_size, num_embed)
    model.add(mx.gluon.rnn.LSTM(num_hidden, layout = 'NTC'))
    model.add(mx.gluon.nn.Dense(num_classes))

Before we execute the training loop, we need to define a function that will calculate the accurary metrics for the model.

In [27]:
def evaluate_accuracy(model,loader):
    correct = 0
    total = 0
    
    for _,(data,target) in enumerate(loader):
        
        data = data.as_in_context(context)
        target = target.as_in_context(context)
        
        output = model(data)
        predictions = nd.argmax(output, axis=1)
        correct+=np.sum(predictions.asnumpy()==target.asnumpy())
        total+=data.shape[0]
    
    return float(correct/total)

Finally, we are ready to execute the training loop. Prior to kicking off the training loop, we need to initialize the model parameters and the optimer function in addition to setting up the pre-training embedding layer.

In [28]:
model.collect_params().initialize(mx.init.Xavier(), ctx=context)

model.embed.weight.set_data(embedding_matrix.as_in_context(context))

trainer = gluon.Trainer(model.collect_params(), 'sgd',
                        {'learning_rate': learning_rate})

softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()    

for epoch in range(epochs):
    
    print('ping')
    for _,(data,target) in enumerate(train_loader):
        
        data = data.as_in_context(context)
        target = target.as_in_context(context)
        
        with autograd.record():
            output = model(data)
            L = softmax_cross_entropy(output, target)
            L.backward()
        trainer.step(data.shape[0])
            
    test_accuracy = evaluate_accuracy(model,train_loader)
    train_accuracy = evaluate_accuracy(model,test_loader)
    print("Epoch %s. Train_acc %s, Test_acc %s" %
          (epoch, train_accuracy, test_accuracy))

ping
Epoch 0. Train_acc 0.622133333333, Test_acc 0.635802469136
ping
Epoch 1. Train_acc 0.664266666667, Test_acc 0.68032693187
ping
Epoch 2. Train_acc 0.726, Test_acc 0.738225880201
ping
Epoch 3. Train_acc 0.770133333333, Test_acc 0.779721079104
ping
Epoch 4. Train_acc 0.7864, Test_acc 0.793267032465
ping
Epoch 5. Train_acc 0.798266666667, Test_acc 0.800925925926
ping
Epoch 6. Train_acc 0.802533333333, Test_acc 0.805898491084
ping
Epoch 7. Train_acc 0.805466666667, Test_acc 0.812299954275
ping
Epoch 8. Train_acc 0.809466666667, Test_acc 0.816415180613
ping
Epoch 9. Train_acc 0.813333333333, Test_acc 0.820816186557
