#!/usr/bin/env python

# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
#
# A sample training component that trains a Keras text classification model.

from __future__ import print_function

import json
import os
import pickle
import re
import sys
import traceback

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.utils import to_categorical

# These are the paths to where SageMaker mounts interesting things in your container.
prefix = '/opt/ml/'
input_path = prefix + 'input/data'
output_path = os.path.join(prefix, 'output')
model_path = os.path.join(prefix, 'model')
param_path = os.path.join(prefix, 'input/config/hyperparameters.json')

# This algorithm has a single channel of input data called 'training'. Since we run in
# File mode, the input files are copied to the directory specified here.
channel_name = 'training'
training_path = os.path.join(input_path, channel_name)

# Category codes in the dataset: b = business, t = science/technology,
# e = entertainment, m = health.
label_index = {'b': 0, 't': 1, 'e': 2, 'm': 3}

MAX_WORDS = 10000    # only consider the top 10,000 words in the dataset
EMBEDDING_DIM = 100  # dimensionality of the GloVe word vectors used below
MAX_LEN = 100        # pad/truncate every headline to 100 tokens


def get_label_id(category):
    """Map a single-letter category code to its integer label."""
    return label_index[category]


def get_class_label(prediction):
    """Map a predicted integer label back to its category code."""
    for key, value in label_index.items():
        if value == prediction[0]:
            return key


def normalize_text(s):
    s = s.lower()
    # Remove punctuation that is not word-internal (e.g., hyphens, apostrophes).
    s = re.sub(r'\s\W', ' ', s)
    s = re.sub(r'\W\s', ' ', s)
    # Make sure we didn't introduce any double spaces.
    s = re.sub(r'\s+', ' ', s)
    return s
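
# For intuition, normalize_text keeps word-internal punctuation but strips
# punctuation adjacent to whitespace. Using the sample headline from further
# below as an illustration:
#
#   normalize_text("What Improved Tech Means for Electric, Self-Driving and Flying Cars")
#   -> 'what improved tech means for electric self-driving and flying cars'
#
# The hyphen in "self-driving" survives, while the comma before a space is dropped.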
# The function to execute the training.
def train():
    print('Starting the training.')
    try:
        # Read in any hyperparameters that the user passed with the training job
        # (not used further in this sample).
        with open(param_path, 'r') as tc:
            trainingParams = json.load(tc)

        column_names = ["TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP"]
        news_dataset = pd.read_csv(os.path.join(training_path, 'newsCorpora.csv'),
                                   names=column_names, header=None, delimiter='\t')
        news_dataset = news_dataset.dropna()
        print(news_dataset.head())

        # Remove punctuation and double spaces.
        texts = [normalize_text(s) for s in news_dataset['TITLE']]
        # Convert each category code to a numeric label.
        labels = [get_label_id(l) for l in news_dataset['CATEGORY']]

        tokenizer = Tokenizer(num_words=MAX_WORDS)
        tokenizer.fit_on_texts(texts)  # texts contains the normalized titles
        sequences = tokenizer.texts_to_sequences(texts)
        word_index = tokenizer.word_index
        data = pad_sequences(sequences, maxlen=MAX_LEN)
        labels = to_categorical(np.asarray(labels))
        print('Found %s unique tokens.' % len(word_index))
        print('Shape of data tensor:', data.shape)
        print('Shape of label tensor:', labels.shape)

        # Build the embedding matrix from the pre-trained 100-dimensional GloVe vectors.
        embeddings_index = {}
        f = open(os.path.join(training_path, 'glove.6B.100d.txt'))
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()

        embedding_matrix = np.zeros((MAX_WORDS, EMBEDDING_DIM))
        for word, i in word_index.items():
            if i < MAX_WORDS:
                embedding_vector = embeddings_index.get(word)
                # Words without a pre-trained vector stay all-zeros.
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector

        x_train, x_test, y_train, y_test = train_test_split(data, labels,
                                                            test_size=0.2, random_state=99)
        print("x_train shape: ", x_train.shape)

        # Now let's build our sequential model using Keras.
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_LEN))

        # IMPORTANT: Only one of the architectures below should be uncommented.

        # ------Architecture: MLP------------------------------
        model.add(tf.keras.layers.Flatten())
        model.add(tf.keras.layers.Dense(2, activation='relu'))  # Try 2-32 units, here and in other Dense layers
        # model.add(tf.keras.layers.Dense(2, activation='relu'))  # Try adding more Dense layers by uncommenting this line
        # -----------------------------------------------------

        # ------Architecture: RNN w/ dropout, suitable for GPUs-
        # model.add(tf.keras.layers.Dropout(0.5))
        # model.add(tf.keras.layers.LSTM(32, return_sequences=True))  # returns a sequence of vectors of dimension 32
        # model.add(tf.keras.layers.LSTM(32, return_sequences=True))  # returns a sequence of vectors of dimension 32
        # model.add(tf.keras.layers.LSTM(32))  # returns a single vector of dimension 32
        # -----------------------------------------------------

        model.add(tf.keras.layers.Dense(len(label_index), activation='softmax'))
        model.summary()

        # Load the pre-trained GloVe weights into the embedding layer and freeze it.
        model.layers[0].set_weights([embedding_matrix])
        model.layers[0].trainable = False
        # model.save_weights('pre_trained_glove_model.h5')

        model.compile(optimizer='rmsprop',
                      loss='categorical_crossentropy',
                      metrics=['acc'])
        history = model.fit(x_train, y_train,
                            epochs=2,
                            verbose=2,
                            batch_size=32,
                            validation_data=(x_test, y_test))

        # Save the model and the tokenizer.
        print('Training complete. Now saving model to: ', model_path)
        model.save(os.path.join(model_path, 'news_breaker.h5'))
        saved_model = tf.keras.models.load_model(os.path.join(model_path, 'news_breaker.h5'))
        with open(os.path.join(model_path, 'tokenizer.pickle'), 'wb') as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # Test with a sample headline.
        post = "What Improved Tech Means for Electric, Self-Driving and Flying Cars"
        seq = tokenizer.texts_to_sequences([post])
        d = pad_sequences(seq, maxlen=MAX_LEN)
        prediction = saved_model.predict_classes(np.array(d))
        print("Test headline: ", post)
        print("Predicted category: ", get_class_label(prediction))

    except Exception as e:
        # Write out an error file. This will be returned as the failureReason in the
        # DescribeTrainingJob result.
        trc = traceback.format_exc()
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            s.write('Exception during training: ' + str(e) + '\n' + trc)
        # Printing this causes the exception to be in the training job logs, as well.
        print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr)
        # A non-zero exit code causes the training job to be marked as Failed.
        sys.exit(255)


if __name__ == '__main__':
    train()

    # A zero exit code causes the job to be marked as Succeeded.
    sys.exit(0)
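
# For reference, a minimal sketch of how an inference component might consume
# the two artifacts saved above (model + tokenizer). This is illustrative only:
# the function below is hypothetical, is not invoked by this training job, and
# in SageMaker the model would be served from a separate hosting container.
#
# def classify_headline(headline, model_dir=model_path):
#     # Restore the tokenizer fitted during training, so that word indices match.
#     with open(os.path.join(model_dir, 'tokenizer.pickle'), 'rb') as handle:
#         tok = pickle.load(handle)
#     mdl = tf.keras.models.load_model(os.path.join(model_dir, 'news_breaker.h5'))
#     # Apply the same normalization and padding used on the training titles.
#     seq = tok.texts_to_sequences([normalize_text(headline)])
#     padded = pad_sequences(seq, maxlen=MAX_LEN)
#     probs = mdl.predict(padded)
#     # argmax over the softmax output gives the predicted label index.
#     return get_class_label(np.argmax(probs, axis=-1))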