{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pre-processing the text for Object2Vec\n",
    "\n",
    "This notebook processes the text into the input format expected by the Object2Vec algorithm."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-20T10:03:01.389224Z",
     "start_time": "2021-01-20T10:03:00.872885Z"
    },
    "execution": {
     "iopub.execute_input": "2020-09-22T11:42:30.848002Z",
     "iopub.status.busy": "2020-09-22T11:42:30.847698Z",
     "iopub.status.idle": "2020-09-22T11:42:30.852099Z",
     "shell.execute_reply": "2020-09-22T11:42:30.851258Z",
     "shell.execute_reply.started": "2020-09-22T11:42:30.847970Z"
    }
   },
   "outputs": [],
   "source": [
    "import boto3\n",
    "import pandas as pd\n",
    "import re\n",
    "from sklearn import preprocessing\n",
    "import numpy as np\n",
    "import json\n",
    "import os\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "import random\n",
    "random.seed(42)\n",
    "from random import sample\n",
    "from sklearn.utils import shuffle\n",
    "from nltk import word_tokenize"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_filtered_objects(bucket_name, prefix):\n",
    "    \"\"\"List the S3 objects stored under a prefix.\n",
    "\n",
    "    Sends one ``list_objects_v2`` request for the bucket and prefix and\n",
    "    returns the response of that call unchanged.\n",
    "    \"\"\"\n",
    "    client = boto3.client(\"s3\")\n",
    "    response = client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)\n",
    "    return response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_object(bucket_name, key, local_path):\n",
    "    \"\"\"Download one S3 object to a local file.\n",
    "\n",
    "    Args:\n",
    "        bucket_name: name of the bucket holding the object.\n",
    "        key: key of the object to download.\n",
    "        local_path: local file path the object is written to.\n",
    "\n",
    "    Raises:\n",
    "        botocore.exceptions.ClientError: for any client error other than a\n",
    "            404, which is only reported with a printed message.\n",
    "    \"\"\"\n",
    "    # Imported here because the notebook never imports botocore; without it\n",
    "    # the except clause below fails with NameError and hides the real error.\n",
    "    from botocore.exceptions import ClientError\n",
    "\n",
    "    s3 = boto3.resource('s3')\n",
    "    try:\n",
    "        s3.Bucket(bucket_name).download_file(key, local_path)\n",
    "    except ClientError as e:\n",
    "        if e.response['Error']['Code'] == \"404\":\n",
    "            print(\"The object does not exist\")\n",
    "        else:\n",
    "            raise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_csv(files):\n",
    "    \"\"\"Return the keys of the listed objects whose key ends with .csv\"\"\"\n",
    "    return [entry['Key'] for entry in files if entry['Key'].endswith(\".csv\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sentence_to_tokens(sentence, vocab_to_tokens):\n",
    "    \"\"\"Convert a sentence into the list of vocabulary ids of its words.\n",
    "\n",
    "    The sentence is split with ``word_tokenize``. Each word is looked up\n",
    "    as written first and lower-cased second, so capitalised words still\n",
    "    match a lower-cased vocabulary (CountVectorizer lower-cases its terms by\n",
    "    default). Words found under neither form are dropped.\n",
    "\n",
    "    Args:\n",
    "        sentence: text to encode.\n",
    "        vocab_to_tokens: mapping from word to token id.\n",
    "\n",
    "    Returns:\n",
    "        List of token ids, in the order the words appear.\n",
    "    \"\"\"\n",
    "    tokens = []\n",
    "    for word in word_tokenize(sentence):\n",
    "        if word in vocab_to_tokens:\n",
    "            tokens.append(vocab_to_tokens[word])\n",
    "        elif word.lower() in vocab_to_tokens:\n",
    "            tokens.append(vocab_to_tokens[word.lower()])\n",
    "    return tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_dir(directory):\n",
    "    \"\"\"Create the directory (and missing parents) unless the path already exists\"\"\"\n",
    "    if os.path.exists(directory):\n",
    "        return\n",
    "    os.makedirs(directory)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_file(file_path):\n",
    "    \"\"\"Delete the local file at the given path; print an error if it is not a file\"\"\"\n",
    "    if not os.path.isfile(file_path):\n",
    "        print(\"Error, file not found.\")\n",
    "        return\n",
    "    os.remove(file_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_sentence_pairs(data):\n",
    "    \"\"\"Turn every row of the dataframe into a positive Object2Vec pair.\n",
    "\n",
    "    Each row gives one dict holding the row's ``encoded_content`` as ``in0``,\n",
    "    its ``labels`` as ``in1`` and ``label`` set to 1.\n",
    "    \"\"\"\n",
    "    rows = (data.iloc[position] for position in range(len(data)))\n",
    "    return [\n",
    "        {'in0': row['encoded_content'], 'in1': row['labels'], 'label': 1}\n",
    "        for row in rows\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_negative_pairs(data, negative_labels_to_sample,sentence_pairs, n_neg_pairs_per_label=10):\n",
    "    \"\"\"Append randomly sampled negative pairs to ``sentence_pairs``.\n",
    "\n",
    "    For every label in ``negative_labels_to_sample`` the rows are split into\n",
    "    those whose ``labels`` contain it and those whose ``labels`` do not. Each\n",
    "    negative pair takes ``encoded_content`` from a random row of the first\n",
    "    group and ``labels`` from a random row of the second group, with\n",
    "    ``label`` set to 0. A label is skipped when either group is empty.\n",
    "\n",
    "    Args:\n",
    "        data: dataframe with ``labels`` and ``encoded_content`` columns.\n",
    "        negative_labels_to_sample: labels to build negative pairs for.\n",
    "        sentence_pairs: list the new pairs are appended to.\n",
    "        n_neg_pairs_per_label: number of pairs added per label.\n",
    "\n",
    "    Returns:\n",
    "        The same ``sentence_pairs`` list, extended in place.\n",
    "    \"\"\"\n",
    "    for label in negative_labels_to_sample:\n",
    "        # rows that have this label\n",
    "        selection = data.loc[data.labels.apply(lambda x: x is not None and label in x)]\n",
    "        # rows that do not have this label\n",
    "        wrong_selection = data.loc[data.labels.apply(lambda x: x is not None and label not in x)]\n",
    "        # Both groups are sampled below and sampling an empty frame raises\n",
    "        # ValueError, so a label needs rows on both sides.\n",
    "        if len(selection) == 0 or len(wrong_selection) == 0:\n",
    "            continue\n",
    "        for _ in range(n_neg_pairs_per_label):\n",
    "            sentence_pairs.append({\n",
    "                'in0': selection.sample(1)['encoded_content'].iloc[0],\n",
    "                'in1': wrong_selection.sample(1)['labels'].iloc[0],\n",
    "                'label': 0,\n",
    "            })\n",
    "    return sentence_pairs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Download the data locally"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# S3 bucket holding the input .csv files; replace the placeholder value.\n",
    "bucket_name = \"YOUR_BUCKET_HERE\"\n",
    "# Key prefix under which the .csv files are listed.\n",
    "prefix = \"connect/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Local folder that receives the downloaded .csv files.\n",
    "create_dir(\"./data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 'Contents' is missing from the response when nothing matches the prefix,\n",
    "# so fall back to an empty listing instead of failing with KeyError.\n",
    "listing = get_filtered_objects(bucket_name, prefix)\n",
    "files = get_csv(listing.get('Contents', []))\n",
    "print(files)\n",
    "\n",
    "local_files = []\n",
    "for file in files:\n",
    "    # Only the object name is kept; keys sharing a base name map to the same local file.\n",
    "    local_path = \"./data/\" + file.split(\"/\")[-1]\n",
    "    download_object(bucket_name, file, local_path)\n",
    "    local_files.append(local_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Local paths of the downloaded files.\n",
    "local_files"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Concatenate the .csv files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "content = []\n",
    "for filename in local_files:\n",
    "    try:\n",
    "        df = pd.read_csv(filename, sep=\";\")\n",
    "    except pd.errors.ParserError:\n",
    "        print(\"File\", filename, \"cannot be parsed. Check its format\")\n",
    "        continue\n",
    "    print(df.columns)\n",
    "    content.append(df)\n",
    "\n",
    "# Fail with a clear message when nothing was loaded.\n",
    "if not content:\n",
    "    raise ValueError(\"No .csv file could be loaded from: {}\".format(local_files))\n",
    "# ignore_index avoids repeated row labels coming from the individual files.\n",
    "data = pd.concat(content, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows whose ParticipantId is 'CUSTOMER'.\n",
    "customer_text = data.loc[data.ParticipantId=='CUSTOMER']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# (rows, columns) of the selected data.\n",
    "customer_text.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Create random labels\n",
    "\n",
    "Change this to use your own labels.\n",
    "\n",
    "Note: the texts are also replicated here to increase the number of samples."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack 300 copies of the selected rows under a fresh 0..n-1 index.\n",
    "# Re-running this cell multiplies the rows again, because it reads and\n",
    "# overwrites the same variable.\n",
    "customer_text = pd.concat([customer_text]*300, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed NumPy so the random labels are the same on every run; the\n",
    "# random.seed(42) call in the import cell does not affect np.random.\n",
    "np.random.seed(42)\n",
    "customer_text['labels'] = np.random.randint(low=0, high=5, size=len(customer_text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Histogram of the generated label values.\n",
    "customer_text.labels.hist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Get vocabulary from the corpus using sklearn for the heavy lifting\n",
    "\n",
    "The vocabulary is built only from the words found in the `Content` column of the customer texts selected above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-20T10:04:02.252430Z",
     "start_time": "2021-01-20T10:04:02.214275Z"
    },
    "execution": {
     "iopub.execute_input": "2020-09-22T11:43:25.075339Z",
     "iopub.status.busy": "2020-09-22T11:43:25.075126Z",
     "iopub.status.idle": "2020-09-22T11:43:26.023392Z",
     "shell.execute_reply": "2020-09-22T11:43:26.022645Z",
     "shell.execute_reply.started": "2020-09-22T11:43:25.075308Z"
    }
   },
   "outputs": [],
   "source": [
    "# Tokens are runs of two or more ASCII letters. Terms found in fewer than 5\n",
    "# documents or in more than 95% of the documents are left out.\n",
    "corpus = customer_text['Content'].values.tolist()\n",
    "counts = CountVectorizer(\n",
    "    min_df=5,\n",
    "    max_df=0.95,\n",
    "    token_pattern=r'(?u)\\b[A-Za-z]{2,}\\b',\n",
    ").fit(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-20T10:04:02.778942Z",
     "start_time": "2021-01-20T10:04:02.774643Z"
    },
    "execution": {
     "iopub.execute_input": "2020-09-22T11:43:26.024745Z",
     "iopub.status.busy": "2020-09-22T11:43:26.024523Z",
     "iopub.status.idle": "2020-09-22T11:43:26.030167Z",
     "shell.execute_reply": "2020-09-22T11:43:26.029583Z",
     "shell.execute_reply.started": "2020-09-22T11:43:26.024715Z"
    }
   },
   "outputs": [],
   "source": [
    "# get_feature_names() was removed in scikit-learn 1.2 and older releases\n",
    "# have no get_feature_names_out(), so use whichever the installed version offers.\n",
    "if hasattr(counts, \"get_feature_names_out\"):\n",
    "    vocab = counts.get_feature_names_out().tolist()\n",
    "else:\n",
    "    vocab = counts.get_feature_names()\n",
    "# Token ids are the positions of the words in the vocabulary list.\n",
    "vocab_to_token_dict = {word: index for index, word in enumerate(vocab)}\n",
    "token_to_vocab_dict = dict(enumerate(vocab))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-20T10:04:03.187494Z",
     "start_time": "2021-01-20T10:04:03.183508Z"
    },
    "execution": {
     "iopub.execute_input": "2020-09-22T11:43:26.031900Z",
     "iopub.status.busy": "2020-09-22T11:43:26.031687Z",
     "iopub.status.idle": "2020-09-22T11:43:26.036107Z",
     "shell.execute_reply": "2020-09-22T11:43:26.035548Z",
     "shell.execute_reply.started": "2020-09-22T11:43:26.031873Z"
    }
   },
   "outputs": [],
   "source": [
    "# Number of words in the vocabulary.\n",
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-20T10:04:03.619569Z",
     "start_time": "2021-01-20T10:04:03.614481Z"
    },
    "execution": {
     "iopub.execute_input": "2020-09-22T11:43:26.037456Z",
     "iopub.status.busy": "2020-09-22T11:43:26.037252Z",
     "iopub.status.idle": "2020-09-22T11:43:26.045015Z",
     "shell.execute_reply": "2020-09-22T11:43:26.044448Z",
     "shell.execute_reply.started": "2020-09-22T11:43:26.037431Z"
    }
   },
   "outputs": [],
   "source": [
    "# Store the word -> token id mapping as JSON in ./vocab.\n",
    "vocab_filename = './vocab/vocab.json'\n",
    "create_dir(\"./vocab\")\n",
    "with open(vocab_filename, \"w\") as write_file:\n",
    "    write_file.write(json.dumps(vocab_to_token_dict))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Encode data body\n",
    "\n",
    "Transform the texts in the data to encodings from the vocabulary created."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "# word_tokenize needs the 'punkt' tokenizer data. Recent NLTK releases look\n",
    "# for 'punkt_tab' instead, so request both; a name the installed version\n",
    "# does not know is reported by the downloader without raising.\n",
    "for resource in ('punkt', 'punkt_tab'):\n",
    "    nltk.download(resource)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-20T10:04:05.783149Z",
     "start_time": "2021-01-20T10:04:05.392404Z"
    }
   },
   "outputs": [],
   "source": [
    "# Replace every text by the list of token ids of its vocabulary words.\n",
    "customer_text['encoded_content'] = customer_text['Content'].apply(\n",
    "    sentence_to_tokens, args=(vocab_to_token_dict,)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Labels before they are wrapped into lists.\n",
    "customer_text['labels']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap each label in a list. Values that are already lists are left as\n",
    "# they are, so re-running the cell does not nest them a second time.\n",
    "customer_text['labels'] = customer_text['labels'].apply(\n",
    "    lambda x: x if isinstance(x, list) else [x]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the list-valued labels next to the encoded texts.\n",
    "customer_text[['labels','encoded_content']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# remove entries with no text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows with at least one encoded token.\n",
    "has_tokens = customer_text['encoded_content'].map(len) > 0\n",
    "customer_text = customer_text.loc[has_tokens]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remaining rows: labels, encoded text and original text.\n",
    "customer_text[['labels','encoded_content', 'Content']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Build sentence pairs Object2Vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Negative pairs for the algorithm: choose the labels to sample *against*.\n",
    "# range(5) yields the label values 0 to 4.\n",
    "negative_labels_to_sample = range(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One positive pair (label 1) per remaining row.\n",
    "sentence_pairs = build_sentence_pairs(customer_text)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Build negative sentence pairs for training Object2Vec\n",
    "\n",
    "Negative sampling for the Object2Vec algorithm: negative (document, label) pairs are added to the positive pairs built above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the negative pairs (label 0) to the same list, using the default\n",
    "# of 10 pairs per label.\n",
    "sentence_pairs = build_negative_pairs(customer_text,negative_labels_to_sample,sentence_pairs)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the second pair as an example of the generated input.\n",
    "print(\"Sample of input for Object2vec algorith: {}\".format(sentence_pairs[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "# Install with the interpreter of the running kernel; a bare `!pip` may\n",
    "# belong to a different Python installation.\n",
    "!{sys.executable} -m pip install jsonlines"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### train/test/val split, save to file\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle in place with a fixed seed, then cut the list into 70% train and\n",
    "# two halves of the remainder (validation first, test second).\n",
    "random.seed(42)\n",
    "random.shuffle(sentence_pairs)\n",
    "\n",
    "n_train = int(0.7 * len(sentence_pairs))\n",
    "held_out = sentence_pairs[n_train:]\n",
    "n_test = len(held_out)\n",
    "n_val = n_test // 2\n",
    "\n",
    "sentence_pairs_train = sentence_pairs[:n_train]\n",
    "sentence_pairs_val = held_out[:n_val]\n",
    "sentence_pairs_test = held_out[n_val:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-20T10:05:49.125511Z",
     "start_time": "2021-01-20T10:05:49.099860Z"
    },
    "execution": {
     "iopub.execute_input": "2020-09-22T13:44:10.091478Z",
     "iopub.status.busy": "2020-09-22T13:44:10.091121Z",
     "iopub.status.idle": "2020-09-22T13:44:12.187294Z",
     "shell.execute_reply": "2020-09-22T13:44:12.186515Z",
     "shell.execute_reply.started": "2020-09-22T13:44:10.091444Z"
    }
   },
   "outputs": [],
   "source": [
    "import jsonlines\n",
    "\n",
    "# One JSON Lines file per split, written to ./data.\n",
    "split_files = [\n",
    "    ('./data/train.jsonl', sentence_pairs_train),\n",
    "    ('./data/test.jsonl', sentence_pairs_test),\n",
    "    ('./data/val.jsonl', sentence_pairs_val),\n",
    "]\n",
    "for split_path, split_pairs in split_files:\n",
    "    with jsonlines.open(split_path, mode='w') as writer:\n",
    "        writer.write_all(split_pairs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Upload to S3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-20T10:06:00.752265Z",
     "start_time": "2021-01-20T10:06:00.408584Z"
    },
    "execution": {
     "iopub.execute_input": "2020-09-22T13:44:12.188764Z",
     "iopub.status.busy": "2020-09-22T13:44:12.188549Z",
     "iopub.status.idle": "2020-09-22T13:44:13.071645Z",
     "shell.execute_reply": "2020-09-22T13:44:13.070816Z",
     "shell.execute_reply.started": "2020-09-22T13:44:12.188735Z"
    }
   },
   "outputs": [],
   "source": [
    "s3_client = boto3.client('s3')\n",
    "\n",
    "out_prefix = \"connect/O2VInput\"\n",
    "for n in ['train', 'test', 'val']:\n",
    "    # S3 keys always use '/', so join explicitly instead of os.path.join,\n",
    "    # which inserts the local path separator.\n",
    "    s3_client.upload_file(\"./data/\" + n + '.jsonl', bucket_name,\n",
    "                          \"/\".join([out_prefix, n, n + '.jsonl']),\n",
    "                          ExtraArgs={'ServerSideEncryption': 'AES256'})  # upload input files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the local vocabulary file and the S3 key it is uploaded to.\n",
    "# The key is joined with '/' because S3 keys do not follow the local path separator.\n",
    "print(vocab_filename)\n",
    "print(out_prefix)\n",
    "print(\"/\".join([out_prefix, \"auxiliary/vocab.json\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# S3 keys use '/' regardless of the local path separator, so the key is\n",
    "# joined explicitly instead of with os.path.join.\n",
    "s3_client.upload_file(vocab_filename,\n",
    "                      bucket_name, \"/\".join([out_prefix, \"auxiliary/vocab.json\"]),\n",
    "                      ExtraArgs = {'ServerSideEncryption':'AES256'}) #upload vocab file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Write each lookup table inside `with` so the file is flushed and closed\n",
    "# before it is uploaded.\n",
    "pickled_dicts = [('vocab_to_token_dict.p', vocab_to_token_dict),\n",
    "                 ('token_to_vocab_dict.p', token_to_vocab_dict)]\n",
    "for f, mapping in pickled_dicts:\n",
    "    with open('./vocab/' + f, 'wb') as pickle_file:\n",
    "        pickle.dump(mapping, pickle_file)\n",
    "    # S3 keys use '/' regardless of the local path separator.\n",
    "    s3_client.upload_file(\"./vocab/\" + f, bucket_name,\n",
    "                          \"/\".join([out_prefix, 'meta', f]),\n",
    "                          ExtraArgs={'ServerSideEncryption': 'AES256'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clean up: delete the downloaded .csv files from the local folder.\n",
    "for f in local_files:\n",
    "    remove_file(f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}