def get_filtered_objects(bucket_name, prefix):
    """List every S3 object stored under ``prefix`` in ``bucket_name``.

    A single ``list_objects_v2`` call returns at most 1000 keys, so the
    listing is paginated and all pages are merged into one response-shaped
    dict. Callers that read ``result['Contents']`` keep working unchanged.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Key prefix used to filter the listing.

    Returns:
        dict: The first ``list_objects_v2`` page, with ``Contents`` replaced
        by the objects of *all* pages (an empty list when nothing matches
        the prefix), ``KeyCount`` set to the total number of objects and
        ``IsTruncated`` set to ``False``.
    """
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    merged = None
    contents = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        if merged is None:
            # Keep the metadata (Name, Prefix, ResponseMetadata, ...) of the
            # first page so the result still looks like a regular response.
            merged = page
        # 'Contents' is missing from a page that holds no objects.
        contents.extend(page.get("Contents", []))

    if merged is None:
        merged = {}
    merged["Contents"] = contents
    merged["KeyCount"] = len(contents)
    merged["IsTruncated"] = False
    merged.pop("NextContinuationToken", None)
    return merged


def download_object(bucket_name, key, local_path):
    """Download one S3 object to a local file.

    A missing object (error code ``"404"``) is reported with a printed
    message and otherwise ignored; every other S3 client error is re-raised.

    Args:
        bucket_name: Name of the bucket holding the object.
        key: Key of the object to download.
        local_path: Destination path on the local file system.

    Raises:
        botocore.exceptions.ClientError: For any S3 error other than a 404.
    """
    # The notebook's import cell never imports botocore, so the original
    # `except botocore.exceptions.ClientError` clause raised NameError as soon
    # as any download failed. Import the exception class where it is needed.
    from botocore.exceptions import ClientError

    s3 = boto3.resource('s3')
    try:
        s3.Bucket(bucket_name).download_file(key, local_path)
    except ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("The object does not exist")
        else:
            raise
def get_csv(files):
    """Return the keys of the listed objects that end in ``.csv``.

    Args:
        files: Iterable of dicts that each carry a ``'Key'`` entry (for
            example the ``Contents`` list of an S3 listing).

    Returns:
        list: The matching keys, in their original order.
    """
    return [file['Key'] for file in files if file['Key'].endswith(".csv")]


def sentence_to_tokens(sentence, vocab_to_tokens, tokenizer=None):
    """Convert a sentence into the list of vocabulary token ids it contains.

    Words that are not in the vocabulary are skipped. Each word is looked up
    as-is first and then lowercased: the vocabulary in this notebook comes
    from ``CountVectorizer``, which lowercases by default, while
    ``word_tokenize`` keeps the original casing. Without the fallback every
    capitalised word (e.g. the first word of a sentence) is silently dropped.

    Args:
        sentence: Text to encode. Anything that is not a ``str`` (``None``,
            a NaN read from an empty CSV field, ...) encodes to ``[]``.
        vocab_to_tokens: Mapping from word to integer token id.
        tokenizer: Optional callable ``str -> iterable of words``. Defaults
            to ``nltk.word_tokenize``.

    Returns:
        list: Token ids of the in-vocabulary words, in sentence order.
    """
    if not isinstance(sentence, str):
        return []
    if tokenizer is None:
        tokenizer = word_tokenize

    tokens = []
    for word in tokenizer(sentence):
        if word in vocab_to_tokens:
            tokens.append(vocab_to_tokens[word])
        elif word.lower() in vocab_to_tokens:
            tokens.append(vocab_to_tokens[word.lower()])
    return tokens


def create_dir(directory):
    """Create ``directory`` (and any missing parents) if it does not exist.

    ``exist_ok=True`` replaces the former check-then-create sequence, which
    could fail if the directory appeared between the check and the creation.
    """
    os.makedirs(directory, exist_ok=True)


def remove_file(file_path):
    """Delete the local file at ``file_path``.

    Prints an error message instead of raising when the path is not an
    existing regular file.
    """
    if os.path.isfile(file_path):
        os.remove(file_path)
    else:
        print("Error, file not found.")


def build_sentence_pairs(data):
    """Turn every row of ``data`` into a positive Object2Vec pair.

    Args:
        data: DataFrame with an ``encoded_content`` column and a ``labels``
            column.

    Returns:
        list: One dict per row, ``{'in0': encoded_content, 'in1': labels,
        'label': 1}``, in row order.
    """
    # Iterating the two columns directly avoids building a Series per row.
    return [
        {'in0': encoded, 'in1': labels, 'label': 1}
        for encoded, labels in zip(data['encoded_content'], data['labels'])
    ]


def build_negative_pairs(data, negative_labels_to_sample, sentence_pairs,
                         n_neg_pairs_per_label=10, random_state=None):
    """Append negative (label 0) pairs to ``sentence_pairs``.

    For every label ``r`` in ``negative_labels_to_sample`` the function draws
    ``n_neg_pairs_per_label`` pairs made of the ``encoded_content`` of a
    random row tagged with ``r`` and the ``labels`` of a random row that is
    *not* tagged with ``r``.

    A label is skipped when either side is empty. The original only checked
    the rows without the label and raised ``ValueError`` from ``sample`` for
    a label that no row carries.

    Args:
        data: DataFrame with ``encoded_content`` and ``labels`` columns;
            ``labels`` holds a collection of labels (or ``None``) per row.
        negative_labels_to_sample: Iterable of labels to sample against.
        sentence_pairs: List that the new pairs are appended to (in place).
        n_neg_pairs_per_label: Number of negative pairs drawn per label.
        random_state: Optional seed (int) or ``numpy.random.RandomState``
            making the sampling reproducible. ``None`` keeps the previous
            behaviour of using NumPy's global random state.

    Returns:
        list: The same ``sentence_pairs`` list, extended with the negatives.
    """
    if isinstance(random_state, (int, np.integer)):
        # One shared generator: passing the bare int to every `sample` call
        # would draw the very same row each time.
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    for r in negative_labels_to_sample:
        # rows that have that label as tag
        selection = data.loc[data.labels.apply(lambda x: x is not None and r in x)]
        # rows that do not have that label as tag
        wrong_selection = data.loc[data.labels.apply(lambda x: x is not None and r not in x)]
        if len(selection) == 0 or len(wrong_selection) == 0:
            continue
        for _ in range(n_neg_pairs_per_label):
            sentence_pairs.append({
                'in0': selection.sample(1, random_state=rng)['encoded_content'].iloc[0],
                'in1': wrong_selection.sample(1, random_state=rng)['labels'].iloc[0],
                'label': 0,
            })
    return sentence_pairs
# Top-level pipeline cells of the notebook, one `# %%` marker per cell.
# Relies on the notebook's import cell (boto3, pandas as pd, numpy as np,
# json, os, random, CountVectorizer) and on the helper functions defined in
# the "Functions" section.

# %%
import pickle
import posixpath  # S3 keys always use "/", whatever the local OS separator is

import nltk

SEED = 42          # seed shared by every random step below
N_LABELS = 5       # number of placeholder label classes (0 .. N_LABELS-1)
N_REPLICAS = 300   # how many times the texts are replicated

# %% [markdown]
# ##### Download the data locally

# %%
bucket_name = "YOUR_BUCKET_HERE"
prefix = "connect/"

# %%
# save the files locally.
create_dir("./data")

# %%
listing = get_filtered_objects(bucket_name, prefix)
# 'Contents' is absent from the response when nothing matches the prefix.
files = get_csv(listing.get('Contents', []))
print(files)

local_files = []
for file in files:
    # NOTE: only the base name is kept, so two keys with the same file name
    # in different folders overwrite each other locally.
    local_path = "./data/" + file.split("/")[-1]
    download_object(bucket_name, file, local_path)
    # download_object only prints a message for a missing object; do not
    # queue a file that was never written.
    if os.path.isfile(local_path):
        local_files.append(local_path)

# %%
local_files

# %% [markdown]
# ##### Concatenate the .csv files

# %%
content = []
for filename in local_files:
    try:
        df = pd.read_csv(filename, sep=";")
        print(df.columns)
        content.append(df)
    except pd.errors.ParserError:
        print("File", filename, "cannot be parsed. Check its format")
if not content:
    raise ValueError("No .csv file could be loaded from s3://{}/{}".format(bucket_name, prefix))
data = pd.concat(content)

# %%
# Keep the customer utterances only; rows without text cannot be vectorized.
customer_text = data.loc[data.ParticipantId == 'CUSTOMER'].dropna(subset=['Content'])

# %%
customer_text.shape

# %% [markdown]
# ##### Create random labels
#
# Change this to use your own labels.
# Also: the texts are replicated here to increase statistics.

# %%
customer_text = pd.concat([customer_text] * N_REPLICAS, ignore_index=True)

# %%
# Seed NumPy as well: it drives both the random labels and the pandas
# `.sample` calls made while building negative pairs.
np.random.seed(SEED)
customer_text['labels'] = np.random.randint(low=0, high=N_LABELS, size=len(customer_text))

# %%
customer_text.labels.hist();

# %% [markdown]
# ##### Get vocabulary from the corpus using sklearn for the heavy lifting
#
# The vocabulary is built from the customer utterances only (the `Content`
# column of `customer_text`).

# %%
counts = CountVectorizer(min_df=5, max_df=0.95, token_pattern=r'(?u)\b[A-Za-z]{2,}\b').fit(customer_text['Content'].values.tolist())

# %%
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on releases that predate get_feature_names_out().
if hasattr(counts, "get_feature_names_out"):
    vocab = counts.get_feature_names_out().tolist()
else:
    vocab = counts.get_feature_names()
vocab_to_token_dict = dict(zip(vocab, range(len(vocab))))
token_to_vocab_dict = dict(zip(range(len(vocab)), vocab))

# %%
len(vocab)

# %%
create_dir("./vocab")
vocab_filename = './vocab/vocab.json'
with open(vocab_filename, "w") as write_file:
    json.dump(vocab_to_token_dict, write_file)

# %% [markdown]
# ##### Encode data body
#
# Transform the texts in the data to encodings from the vocabulary created.

# %%
nltk.download('punkt')
# NOTE(review): recent NLTK releases appear to load the tokenizer from
# 'punkt_tab' instead of 'punkt' - confirm against the installed version.
nltk.download('punkt_tab')

# %%
customer_text['encoded_content'] = customer_text['Content'].apply(lambda x: sentence_to_tokens(x, vocab_to_token_dict))

# %%
customer_text['labels'].head()

# %%
# Object2Vec expects a sequence on both sides of a pair: wrap each label.
customer_text['labels'] = customer_text['labels'].apply(lambda x: [x])

# %%
customer_text[['labels', 'encoded_content']].head()

# %%
# remove entries with no text
customer_text = customer_text.loc[customer_text['encoded_content'].apply(lambda x: len(x) > 0)]

# %%
customer_text[['labels', 'encoded_content', 'Content']].head()

# %% [markdown]
# ##### Build sentence pairs Object2Vec

# %%
# negative pairs for the algorithm: need to decide which labels we want to sample *against*.
negative_labels_to_sample = range(N_LABELS)

# %%
sentence_pairs = build_sentence_pairs(customer_text)

# %% [markdown]
# ##### Build negative sentence pairs for training Object2Vec
#
# Negative sampling for the Object2Vec algorithm - add negative pairs
# (document, label) to the positive ones.

# %%
sentence_pairs = build_negative_pairs(customer_text, negative_labels_to_sample, sentence_pairs)

# %%
print("Sample of input for Object2Vec algorithm: {}".format(sentence_pairs[1]))

# %% [markdown]
# ##### train/test/val split, save to file

# %%
# shuffle and split test/train/val
random.seed(SEED)
random.shuffle(sentence_pairs)

n_train = int(0.7 * len(sentence_pairs))

# split train and test
sentence_pairs_train = sentence_pairs[:n_train]
sentence_pairs_test = sentence_pairs[n_train:]

# further split the held-out part into a validation half and a test half
n_test = len(sentence_pairs_test)

sentence_pairs_val = sentence_pairs_test[:n_test // 2]
sentence_pairs_test = sentence_pairs_test[n_test // 2:]

# %%
def write_jsonl(path, records):
    """Write ``records`` to ``path`` as JSON Lines (one JSON document per line)."""
    with open(path, "w") as handle:
        for record in records:
            handle.write(json.dumps(record) + "\n")

# %%
# The standard library is enough to emit JSON Lines, so the notebook no
# longer needs to pip-install the `jsonlines` package at run time.
write_jsonl('./data/train.jsonl', sentence_pairs_train)
write_jsonl('./data/test.jsonl', sentence_pairs_test)
write_jsonl('./data/val.jsonl', sentence_pairs_val)

# %% [markdown]
# ##### Upload to S3

# %%
s3_client = boto3.client('s3')

out_prefix = "connect/O2VInput"
for split_name in ['train', 'test', 'val']:
    # upload input files
    s3_client.upload_file("./data/" + split_name + '.jsonl', bucket_name,
                          posixpath.join(out_prefix, split_name, split_name + '.jsonl'),
                          ExtraArgs={'ServerSideEncryption': 'AES256'})

# %%
print(vocab_filename)
print(out_prefix)
print(posixpath.join(out_prefix, "auxiliary/vocab.json"))

# %%
# upload vocab file
s3_client.upload_file(vocab_filename,
                      bucket_name, posixpath.join(out_prefix, "auxiliary/vocab.json"),
                      ExtraArgs={'ServerSideEncryption': 'AES256'})

# %%
# `with` closes (and flushes) each pickle before it is uploaded.
with open('./vocab/vocab_to_token_dict.p', 'wb') as handle:
    pickle.dump(vocab_to_token_dict, handle)
with open('./vocab/token_to_vocab_dict.p', 'wb') as handle:
    pickle.dump(token_to_vocab_dict, handle)
for f in ['vocab_to_token_dict.p', 'token_to_vocab_dict.p']:
    s3_client.upload_file("./vocab/" + f, bucket_name,
                          posixpath.join(out_prefix, 'meta', f),
                          ExtraArgs={'ServerSideEncryption': 'AES256'})

# %%
# clean up the downloaded .csv files
for f in local_files:
    remove_file(f)