{ "cells": [ { "cell_type": "markdown", "id": "a2a529e3", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "# Setup" ] }, { "cell_type": "code", "execution_count": null, "id": "de416671", "metadata": { "scrolled": true, "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "#import helper modules\n", "import sys\n", "import subprocess \n", "\n", "#import needed aws modules\n", "import sagemaker\n", "from sagemaker import get_execution_role\n", "import boto3\n", "import s3fs\n", "fs = s3fs.S3FileSystem()\n", "\n", "#install needed packages for machine learning\n", "packages=['ruamel.yaml','sentence-transformers', 'torch','torchvision', 'spacy', 'setuptools', 'wheel', 'gensim', '-Uq']\n", "subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + packages)\n", "import torch\n", "\n", "#import bert packages\n", "import sentence_transformers\n", "from sentence_transformers import SentenceTransformer, util\n", "\n", "#import needed nlp and text processing libraries\n", "!python -m spacy download en_core_web_sm\n", "import spacy\n", "import en_core_web_sm\n", "import nltk\n", "import re\n", "import gensim\n", "import gensim.corpora as corpora\n", "from gensim.utils import simple_preprocess\n", "from gensim.models import CoherenceModel\n", "nltk.download('stopwords')\n", "from nltk.corpus import stopwords\n", "\n", "#import image processing utilities\n", "from IPython.display import HTML, display, Image as IImage\n", "from PIL import Image, ImageDraw, ImageFont, ExifTags, ImageColor\n", "import io\n", "from io import BytesIO\n", "import matplotlib.pyplot as plt\n", "\n", "#import other utilities\n", "import json\n", "import pandas as pd\n", "import numpy as np\n", "import datetime\n", "import time\n", "import glob\n", "import os\n", "import csv\n", "from pprint import pprint\n", "import tarfile" ] }, { "cell_type": "code", "execution_count": null, "id": "05365487", "metadata": { "scrolled": true, "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "aws_account_id = boto3.client('sts').get_caller_identity()['Account'] #store your aws account id in a variable that you will use to access topic results\n", "\n", "\n", "sess = sagemaker.Session()\n", "role = get_execution_role()\n", "print(role)\n", "\n", "\n", "\n", "bucket='bucket-name' #replace with the name of the bucket where your taxonomy and content are stored\n", "path_taxonomy='taxonomy-path-name' #replace with the name of the folder where the Content Taxonomy is stored\n", "file_taxonomy='taxonomy-file-name' #replace with the name of the file for the Content Taxonomy saved as csv \n", "path_texts='texts-path-name' #replace with the name of the folder where extracted text files are stored\n", "path_images='images-path-name'#replace with the name of the folder where extracted images are stored\n", "path_topics='topics-path-name' #replace with the name of the folder where you want to store outputs from Amazon Comprehend topic modeling\n", "\n", "region = boto3.Session().region_name \n", "s3_client = boto3.client(\"s3\")\n", "comprehend_client = boto3.client('comprehend')\n", "rekognition_client = boto3.client(\"rekognition\")\n", "\n", "\n", "\n", "#Test if variable defaults for file paths have been changed. 
No action needed\n", "if bucket == 'bucket-name':\n", " print(\"Replace the variable 'bucket' with the name of your bucket\")\n", "elif path_taxonomy=='taxonomy-path-name':\n", " print(\"Replace the variable 'path_taxonomy' with the name of the folder where the content taxonomy is stored\") \n", "elif file_taxonomy == 'taxonomy-file-name':\n", " print(\"Replace the variable 'file_taxonomy' with the name of the file for the content taxonomy saved as a csv file\")\n", " \n", " \n" ] }, { "cell_type": "markdown", "id": "ff554e20", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# 1. Create Content Taxonomy Feature Vectors" ] }, { "cell_type": "code", "execution_count": null, "id": "e42fbeff", "metadata": { "scrolled": true, "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "#read content taxonomy from Amazon S3\n", "taxonomy_path = 's3://{}/{}/{}'.format(bucket, path_taxonomy, file_taxonomy)\n", "read_taxonomy=pd.read_csv(taxonomy_path)" ] }, { "cell_type": "code", "execution_count": null, "id": "35f9d92a", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "\"\"\"\n", "The function below is designed to prepare the IAB Tech Lab's Content Taxonomy. \n", "If using another taxonomy, modify the code below as needed\n", "\n", "\"\"\"\n", "\n", "def prepare_taxonomy(taxonomy_df):\n", " \n", " \"\"\"\n", " Concatenate IAB Tech Lab content taxonomy tiers and prepare keywords for BERT embedding. \n", " Use this function as-is if using the IAB Content Taxonomy\n", " \n", " Parameters (input):\n", " ----------\n", " taxonomy_df : Content taxonomy dataframe\n", "\n", " Returns (output):\n", " -------\n", " df_clean : Content taxonomy with tiers in the taxonomy concatenated\n", " keyword_list: List of concatenated content taxonomy keywords\n", " ids: List of ids for the content taxonomy keywords\n", " \"\"\"\n", " \n", " df = taxonomy_df[['Unique ID ','Parent','Name','Tier 1','Tier 2','Tier 3']] \n", " df_str = df.astype({\"Unique ID \": 'str', \"Parent\": 'str', \"Tier 1\": 'str', \"Tier 2\": 'str', \"Tier 3\": 'str'})\n", " df_clean = df_str.replace('nan','')\n", " \n", " #create a column that concatenates all tiers for each taxonomy keyword\n", " df_clean['combined']=df_clean[df_clean.columns[2:6]].apply(lambda x: ' '.join(x.dropna().astype(str)),axis=1)\n", " \n", " #turn taxonomy keywords into a list of strings and prep for encoding with the BERT sentence transformer\n", " keyword_list=df_clean['combined'].to_list()\n", " \n", " #get list of taxonomy ids\n", " ids = df_clean['Unique ID '].to_list() \n", " \n", " return df_clean, keyword_list, ids\n", "\n", "taxonomy_df, taxonomy_terms, taxonomy_ids = prepare_taxonomy(read_taxonomy)\n", "taxonomy_df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "b9538338", "metadata": { "scrolled": true, "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "#check length of taxonomy terms and view them for quick audit\n", "print(len(taxonomy_terms))\n", "print(taxonomy_terms[0:5])" ] }, { "cell_type": "code", "execution_count": null, "id": "ecf0fbd4", "metadata": { "scrolled": true, "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "#initialize BERT sentence transformer model\n", "model = SentenceTransformer('paraphrase-MiniLM-L6-v2')\n", "\n", "#create embeddings for taxonomy terms using sentence transformer\n", "taxonomy_embeddings = model.encode(taxonomy_terms, normalize_embeddings=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "fb374f57", "metadata": { "scrolled": true, "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "# View a sample embedding\n", "for term, embedding in zip(taxonomy_terms, taxonomy_embeddings):\n", " print(\"Term:\", term)\n", " print(\"Embedding:\", embedding)" ] }, { "cell_type": "markdown", "id": "73a8cfca", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# 2. Create Topic Model Feature Vectors (Keywords)" ] }, { "cell_type": "markdown", "id": "ce589f19", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "Use gensim's coherence_model to find the optimal number of topics for your collection of article texts. Before using coherence_model, the first step is to preprocess the texts for NLP" ] }, { "cell_type": "code", "execution_count": null, "id": "125ab56a", "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "To train a topic model that fits your data, first find the optimal number of topics for your collection of articles. \n", "You will use gensim's coherence_model to find the optimal number of topics for your collection of article texts. \n", "Before using coherence_model, the first step is to preprocess the texts for NLP\n", "\"\"\"\n", "\n", "\n", "#Test if the variable default for the path to text files has been changed. No action needed\n", "if path_texts == 'texts-path-name':\n", " print(\"Replace the variable 'path_texts' with the name of the folder where extracted text files are stored \")\n", "\n", "\n", "\n", "def iterate_bucket_items(Bucket, Path=''):\n", " \n", " \"\"\"\n", " Create a generator that iterates over all objects in an Amazon S3 bucket\n", " Using this function, you will iterate over all the text files in the Amazon S3 bucket\n", " \n", " Parameters (input):\n", " ----------\n", " Bucket: The name of the Amazon S3 bucket, stored in the 'bucket' variable created earlier\n", " Path: The name of the folder where extracted text files are stored, held in the 'path_texts' variable created earlier\n", "\n", " Returns (output):\n", " -------\n", " Yields the key of each text file in the Amazon S3 folder \n", " \n", " \"\"\"\n", " \n", "\n", " paginator = s3_client.get_paginator('list_objects_v2')\n", " page_iterator = paginator.paginate(Bucket=Bucket, Prefix=Path)\n", "\n", " for page in page_iterator:\n", " if page['KeyCount'] > 0:\n", " for item in page['Contents']:\n", " yield item['Key']\n", "\n", "#store all article texts in your corpus into a list\n", "corpus=[]\n", "for i in iterate_bucket_items(Bucket=bucket, Path=path_texts):\n", " file=s3_client.get_object(Bucket=bucket, Key=i)\n", " text = file['Body'].read()\n", " text_format = text.decode(\"utf-8\")\n", " corpus.append(text_format)\n", " \n", "#view a sample text file\n", "print(corpus[0])" ] }, { "cell_type": "code", "execution_count": null, "id": "593063ee", "metadata": { "scrolled": true, "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "#tokenize each article in the corpus\n", "def tokenize_corpus(corpus):\n", " #remove new line characters in articles\n", " remove_newline= [re.sub('\\s+', ' ', each) for each in corpus]\n", " #remove article quotations\n", " remove_quotations = [re.sub(\"\\\"\", \"\", each) for each in remove_newline]\n", " for each in remove_quotations:\n", " yield(gensim.utils.simple_preprocess(str(each), deacc=True))\n", " \n", "#preprocess tokenized corpus for topic modelling\n", "def clean_stops(corpus):\n", " #prepare stopwords\n", " stop_words = stopwords.words('english')\n", " #stop_words.extend([]) 
# optionally add a list of over-indexed words in your corpus that don't give context (e.g.: name of content publisher)\n", " \n", " #remove stop words\n", " tokens=list(tokenize_corpus(corpus))\n", " corpus_cleaned= [[word for word in simple_preprocess(str(each)) if word not in stop_words] for each in tokens]\n", " return corpus_cleaned\n", " \n", "def n_grams(corpus):\n", " corpus_words= clean_stops(corpus)\n", " # Build the bigram and trigram models\n", " bigram = gensim.models.Phrases(corpus_words, min_count=5, threshold=100) # a higher threshold yields fewer phrases\n", " trigram = gensim.models.Phrases(bigram[corpus_words], threshold=100)\n", " \n", " # Group sentences as bigrams/trigrams\n", " bigram_mod = gensim.models.phrases.Phraser(bigram)\n", " trigram_mod = gensim.models.phrases.Phraser(trigram)\n", "\n", " #create n_grams\n", " corpus_bigrams = [bigram_mod[each] for each in corpus_words]\n", " corpus_trigrams = [trigram_mod[bigram_mod[each]] for each in corpus_words]\n", " return corpus_trigrams\n", "\n", "\n", "nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])\n", "def lemmatize(corpus, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):\n", " corpus_ngrams=n_grams(corpus)\n", " corpus_processed = []\n", " for each in corpus_ngrams:\n", " story = nlp(\" \".join(each)) \n", " corpus_processed.append([token.lemma_ for token in story if token.pos_ in allowed_postags])\n", " return corpus_processed\n", " \n", "corpus_words = lemmatize(corpus, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])\n", "\n", "#view a sample preprocessed text file\n", "print(corpus_words[:1])" ] }, { "cell_type": "code", "execution_count": null, "id": "aa9887da", "metadata": { "scrolled": true, "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "#create a topic modeling dictionary and corpus for a baseline topic model\n", "id2word = corpora.Dictionary(corpus_words)\n", "\n", "# compute term document frequency\n", "corpus_tdf = [id2word.doc2bow(text) for text in corpus_words]\n", "\n", "#create a baseline LDA topic model with an arbitrary number of topics\n", "lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus_tdf,\n", " id2word=id2word,\n", " num_topics=12, \n", " random_state=100,\n", " update_every=1,\n", " chunksize=100,\n", " passes=10,\n", " alpha='auto',\n", " per_word_topics=True)\n", "\n", "#calculate a baseline coherence score for the baseline topic model\n", "coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus_words, dictionary=id2word, coherence='c_v')\n", "coherence_lda = coherence_model_lda.get_coherence()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "01c22aea", "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "#find the optimal number of topics for your text corpus starting with the baseline topic model\n", "def compute_coherence_scores(dictionary, corpus, texts, limit, start=2, step=3):\n", " \"\"\"\n", " Compute coherence scores for various numbers of topics for your topic model. \n", " Adjust the parameters below based on your data\n", "\n", " Parameters (input):\n", " ----------\n", " dictionary : Gensim dictionary created earlier from input texts\n", " corpus : Gensim corpus created earlier from input texts\n", " texts : List of input texts\n", " limit : The maximum number of topics to test. 
Amazon Comprehend can detect up to 100 topics in a collection\n", "\n", " Returns (output):\n", " -------\n", " models : List of LDA topic models\n", " coherence_scores : Coherence values corresponding to the LDA model with the respective number of topics\n", " \"\"\"\n", " coherence_scores = []\n", " models = []\n", " for num_topics in range(start, limit, step):\n", " model = gensim.models.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary)\n", " models.append(model)\n", " coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')\n", " coherence_scores.append(coherencemodel.get_coherence())\n", "\n", " return models, coherence_scores\n", "\n", "models, coherence_scores = compute_coherence_scores(dictionary=id2word, corpus=corpus_tdf, texts=corpus_words, start=2, limit=100, step=3)" ] }, { "cell_type": "code", "execution_count": null, "id": "35a4a647", "metadata": { "scrolled": true, "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "\"\"\"\n", "Visualize the coherence scores for topic models trained with different values for the number of topics\n", "The number of topics with the highest coherence score is the optimal number of topics for your text corpus\n", "Note that Amazon Comprehend can detect up to 100 topics in a collection\n", "\"\"\"\n", "\n", "%matplotlib inline\n", "\n", "limit=100; start=2; step=3;\n", "x = range(start, limit, step)\n", "plt.plot(x, coherence_scores)\n", "plt.xlabel(\"Num Topics\")\n", "plt.ylabel(\"Coherence score\")\n", "plt.legend([\"coherence_scores\"], loc='best')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "e207b9f3", "metadata": {}, "outputs": [], "source": [ "num_topics= 0 #change to the number of topics with the highest coherence score" ] }, { "cell_type": "markdown", "id": "a34dd37a", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "Run Amazon Comprehend Topic Modeling and Process the Results" ] }, { "cell_type": "code", "execution_count": null, "id": "030c3c26", "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "Run an Amazon Comprehend topic modeling job using num_topics as the number of topics\n", "The input format requires all documents saved as individual text files in your Amazon S3 bucket\n", "\n", "\"\"\"\n", "\n", "#Test if the variable defaults for the path to text files and topics have been changed. 
No action needed\n", "if path_texts == 'texts-path-name':\n", " print(\"Replace the variable 'path_texts' with the name of the folder where extracted text files are stored\")\n", "elif path_topics == 'topics-path-name':\n", " print(\"Replace the variable 'path_topics' with the name of the folder where you want to store topic modeling output files\")\n", "\n", "\n", "\n", "job_id = comprehend_client.start_topics_detection_job(\n", " InputDataConfig={\n", " 'S3Uri': 's3://{}/{}'.format(bucket, path_texts), #Amazon S3 uri of the bucket and folder with all source text files\n", " 'InputFormat': 'ONE_DOC_PER_FILE',\n", " },\n", " OutputDataConfig={\n", " 'S3Uri': 's3://{}/{}'.format(bucket, path_topics) #Amazon S3 uri of the bucket and folder where you store output from Amazon Comprehend topic modeling\n", " },\n", " DataAccessRoleArn='arn:aws:iam::account-ID:role/data-access-role', #change this to the arn of a data access role...\n", " JobName='Name-of-topic-modeling-job', #change to the desired name for the topic modeling job\n", " NumberOfTopics=num_topics)['JobId']\n", "\n", "sleep_time_sec = 60 # 1 min\n", "times_polled = 0\n", "max_polls = 60 # max 1 hr\n", "while times_polled < max_polls:\n", " status = comprehend_client.describe_topics_detection_job(JobId=job_id)['TopicsDetectionJobProperties']['JobStatus']\n", " if status in ['COMPLETED', 'FAILED', 'STOPPED']:\n", " break\n", " time.sleep(sleep_time_sec)\n", " times_polled += 1\n", "else:\n", " # Exceeded the maximum number of polls\n", " pass" ] }, { "cell_type": "code", "execution_count": null, "id": "41551a60", "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "Amazon Comprehend topic modeling outputs two csv files compressed as a tarfile:\n", " 1. topic-terms.csv -- a list of the topics detected in the collection \n", " 2. 
doc-topics.csv -- lists the documents associated with a topic and the proportion of the document that is concerned with the topic.\n", "\n", "You will need to download the topic modeling output locally and decompress the tarfile to access them\n", "\"\"\"\n", " \n", "#download the topic modeling output locally \n", "topics_data='{}/{}-{}-{}/{}'.format(path_topics,aws_account_id,'TOPICS',job_id,'output/output.tar.gz')\n", "s3_client.download_file(bucket, topics_data, 'output.tar.gz')\n", "\n", "#decompress the output file\n", "file_name = 'output.tar.gz'\n", "tar = tarfile.open(file_name, \"r:gz\")\n", "tar.extractall()\n", "tar.close()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "3fbb1dc4", "metadata": { "scrolled": true, "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "#read the topic terms file into a pandas dataframe\n", "topics=pd.read_csv('topic-terms.csv')\n", "\n", "#view a sample topic and its keywords\n", "topics[topics['topic']==0]" ] }, { "cell_type": "code", "execution_count": null, "id": "deb056dc", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "#read the document to topic assignments file into a pandas dataframe\n", "doc_topics=pd.read_csv('doc-topics.csv')\n", "\n", "#view sample document to topic assignments\n", "doc_topics.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "e63d6616", "metadata": { "scrolled": true, "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "#transfer individual topic keywords to a list\n", "topic_keywords=topics['term'].to_list()\n", "topic_ids=topics['topic'].to_list()\n", "keyword_weights=topics['weight'].to_list()\n", "\n", "print(len(topic_keywords))" ] }, { "cell_type": "code", "execution_count": null, "id": "51b7c4eb", "metadata": { "scrolled": true, "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "#create and show embeddings for topic keywords\n", "keyword_embeddings = model.encode(topic_keywords, normalize_embeddings=True)\n", "for keyword, embedding in zip(topic_keywords, keyword_embeddings):\n", " print(\"Keyword:\", keyword)\n", " print(\"Embedding:\", embedding)" ] }, { "cell_type": "markdown", "id": "e5aec1df", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# 3. Prep for Image Label Feature Vectors" ] }, { "cell_type": "code", "execution_count": null, "id": "c79797c6", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "\n", "\"\"\"\n", "Create a function to extract object labels from a given image using Amazon Rekognition\n", "\n", "\"\"\"\n", "\n", "def get_image_labels(image_loc):\n", " labels = []\n", " with fs.open(image_loc, \"rb\") as im:\n", " response = rekognition_client.detect_labels(Image={\"Bytes\": im.read()})\n", " \n", " for label in response[\"Labels\"]:\n", " if label[\"Confidence\"] >= 60: #change to desired confidence score threshold, value between [0,100]\n", " object_label = label[\"Name\"]\n", " labels.append(object_label)\n", " return labels" ] }, { "cell_type": "markdown", "id": "be9107ef", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# 4. 
Compute Cosine Similarity Scores " ] }, { "cell_type": "code", "execution_count": null, "id": "61004925", "metadata": { "scrolled": true, "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "def compute_similarity(entity_embeddings, entity_terms, taxonomy_embeddings, taxonomy_terms):\n", " \"\"\"\n", " Compute cosine scores between entity embeddings and taxonomy embeddings\n", " \n", " Parameters (input):\n", " ----------\n", " entity_embeddings : Embeddings for either topic keywords from Amazon Comprehend or image labels from Amazon Rekognition\n", " entity_terms : Terms for topic keywords or image labels\n", " taxonomy_embeddings : Embeddings for the content taxonomy\n", " taxonomy_terms : Terms for the taxonomy keywords\n", "\n", " Returns (output):\n", " -------\n", " mapping_df : Dataframe that matches each entity keyword to its closest taxonomy keyword along with the cosine similarity score\n", " \"\"\"\n", " \n", " #calculate cosine score, pairing each entity embedding with each taxonomy keyword embedding\n", " cosine_scores = util.pytorch_cos_sim(entity_embeddings, taxonomy_embeddings)\n", " pairs = []\n", " for i in range(len(cosine_scores)):\n", " for j in range(0, cosine_scores.shape[1]):\n", " pairs.append({'index': [i, j], 'score': cosine_scores[i][j]})\n", " \n", " #Sort cosine similarity scores in decreasing order\n", " pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)\n", " rows = []\n", " for pair in pairs:\n", " i, j = pair['index']\n", " rows.append([entity_terms[i], taxonomy_terms[j], pair['score']])\n", " \n", " #move sorted values to a dataframe\n", " mapping_df= pd.DataFrame(rows, columns=[\"term\", \"taxonomy_keyword\",\"cosine_similarity\"])\n", " mapping_df['cosine_similarity'] = mapping_df['cosine_similarity'].astype('float')\n", " mapping_df= mapping_df.sort_values(by=['term','cosine_similarity'], ascending=False)\n", " drop_dups= mapping_df.drop_duplicates(subset=['term'], keep='first')\n", " mapping_df = drop_dups.sort_values(by=['cosine_similarity'], ascending=False).reset_index(drop=True)\n", " return mapping_df\n", " \n", "#compute cosine_similarity score between topic keywords and content taxonomy keywords using BERT embeddings \n", "text_taxonomy_mapping=compute_similarity(keyword_embeddings, topic_keywords, taxonomy_embeddings, taxonomy_terms) \n", "\n", "#View some mappings from topic keywords to keywords on the content taxonomy and check the similarity score for each\n", "text_taxonomy_mapping.head(10)" ] }, { "cell_type": "code", "execution_count": null, "id": "cbb22592", "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "def pair_text_taxonomy(mapping_df, topic_df):\n", " \n", " \"\"\"\n", " Create a mapping of topic keywords to content taxonomy keywords with topic keywords grouped in their assigned topics\n", " \n", " Parameters (input):\n", " ----------\n", " mapping_df : Output dataframe from the compute_similarity function\n", " topic_df : Terms for topic keywords\n", "\n", " Returns (output):\n", " -------\n", " ordered_df : Dataframe mapping of topic keywords to content taxonomy keywords with topic keywords grouped in their assigned topics\n", " \"\"\"\n", " \n", " df=mapping_df.drop_duplicates(subset=[\"term\",\"taxonomy_keyword\"], keep='first')\n", " merged=df.merge(topic_df, how='inner', on=['term'])\n", " sorted_df= merged.sort_values(by='cosine_similarity', ascending=False)\n", " topic_taxonomy_df= sorted_df.sort_values(by='topic').reset_index(drop=True)\n", " 
ordered_df=topic_taxonomy_df[[\"topic\",\"term\",\"taxonomy_keyword\",\"weight\",\"cosine_similarity\"]]\n", " return ordered_df\n", "\n", "\n", "#view text to taxonomy mapping with topics grouped together\n", "topic_taxonomy_mapping=pair_text_taxonomy(text_taxonomy_mapping,topics)\n", "topic_taxonomy_mapping.head()" ] }, { "cell_type": "markdown", "id": "79cc945f", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# 5. Analyze Webpage Content" ] }, { "cell_type": "code", "execution_count": null, "id": "6549f1b3", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "#Test if path_texts variable was updated: No action needed\n", "if path_texts == 'texts-path-name':\n", " print(\"Replace the variable 'path_texts' with the name of the folder where extracted text files are stored\")\n", " \n", "\n", "#Select a sample article and review its content \n", "file_name = 'my-sample-file.txt' #replace with the name of a text file in your Amazon S3 bucket\n", "file_path = '{}/{}'.format(path_texts, file_name)\n", "\n", "file_object= s3_client.get_object(Bucket=bucket, Key=file_path)\n", "file_content = file_object['Body'].read()\n", "\n", "print(file_content.decode(\"utf-8\"))\n", "\n", "#Test if file_name variable was updated: No action needed\n", "if file_name == 'my-sample-file.txt':\n", " print(\"Replace the variable 'file_name' with the name of a text file in your Amazon S3 bucket\")" ] }, { "cell_type": "markdown", "id": "d47d30d9", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Text to Taxonomy Mapping" ] }, { "cell_type": "code", "execution_count": null, "id": "dc54c7da", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "#check the topic assignment(s) for your selected article and identify the topic number with the highest proportion\n", "doc_topics[doc_topics['docname']== file_name]" ] }, { "cell_type": "code", "execution_count": null, "id": "fee78fa8", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "#check topic keywords and content taxonomy mapping for the topic number identified in the previous step\n", "\n", "text_taxonomy_mapping=topic_taxonomy_mapping[topic_taxonomy_mapping['topic']== 0] #replace the 0 with your topic number\n", "text_taxonomy_mapping = text_taxonomy_mapping.sort_values(by='cosine_similarity', ascending=False)\n", "print(text_taxonomy_mapping)" ] }, { "cell_type": "markdown", "id": "bea7ab71", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Image to Taxonomy Mapping" ] }, { "cell_type": "code", "execution_count": null, "id": "6088b0ea", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "#Test if path_images variable was updated: No action needed\n", "if path_images == 'images-path-name':\n", " print(\"Replace the variable 'path_images' with the name of the folder where extracted images are stored\")\n", "\n", " \n", "#Select an image from the same webpage as the article you selected \n", "image_name = 'my-sample-image.jpg' #replace with the name of an image file in your Amazon S3 bucket\n", "image_path = 's3://{}/{}/{}'.format(bucket, path_images, image_name)\n", "\n", "\n", "#Test if image_name variable was updated: No action needed\n", "if image_name == 'my-sample-image.jpg':\n", " print(\"Replace the variable 'image_name' with the name of an image file in your Amazon S3 bucket\")\n", "\n", "\n", "#View the image\n", "with fs.open(image_path) as im:\n", " display(Image.open(im))\n", "\n" ] }, 
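{ "cell_type": "markdown", "id": "9a7c01d2", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "Optional: the next cell is a sketch (not part of the core mapping workflow) that draws the object bounding boxes Amazon Rekognition returns for the selected image, as a quick visual sanity check before mapping labels to the taxonomy. It reuses the PIL ImageDraw import from the setup cell and assumes the `image_path`, `fs`, and `rekognition_client` variables defined above; labels without localized instances are skipped." ] }, { "cell_type": "code", "execution_count": null, "id": "9a7c01d3", "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "#Optional sketch (assumption: not part of the original workflow): draw the bounding boxes\n", "#that Amazon Rekognition returns for the selected image. Assumes 'image_path', 'fs', and\n", "#'rekognition_client' are defined above; adjust the confidence threshold as needed\n", "with fs.open(image_path, \"rb\") as im:\n", "    image_bytes = im.read()\n", "\n", "#call detect_labels directly so this cell is self-contained\n", "response = rekognition_client.detect_labels(Image={\"Bytes\": image_bytes})\n", "\n", "boxed_image = Image.open(BytesIO(image_bytes)).convert(\"RGB\")\n", "draw = ImageDraw.Draw(boxed_image)\n", "width, height = boxed_image.size\n", "\n", "for label in response[\"Labels\"]:\n", "    if label[\"Confidence\"] < 60: #same confidence threshold used in get_image_labels\n", "        continue\n", "    for instance in label.get(\"Instances\", []): #only labels with localized instances carry boxes\n", "        box = instance[\"BoundingBox\"]\n", "        left = width * box[\"Left\"]\n", "        top = height * box[\"Top\"]\n", "        right = left + width * box[\"Width\"]\n", "        bottom = top + height * box[\"Height\"]\n", "        draw.rectangle([left, top, right, bottom], outline=\"red\", width=3)\n", "        draw.text((left, max(top - 12, 0)), label[\"Name\"], fill=\"red\")\n", "\n", "display(boxed_image)" ] }, 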
{ "cell_type": "code", "execution_count": null, "id": "3b5c3cc7", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "#Call the Amazon Rekognition label detection API to get labels\n", "\n", "image_labels=get_image_labels(image_path)\n", "print(image_labels)" ] }, { "cell_type": "code", "execution_count": null, "id": "7f725c74", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "#create BERT embeddings for each of the image labels\n", "label_embeddings = model.encode(image_labels, normalize_embeddings=True)\n", "\n", "#view the mapping of image labels to content taxonomy for the image selected \n", "image_taxonomy_mapping=compute_similarity(label_embeddings, image_labels, taxonomy_embeddings, taxonomy_terms)\n", "print(image_taxonomy_mapping)" ] }, { "cell_type": "markdown", "id": "b2b3bdee", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Select Keywords for Realtime Bidding" ] }, { "cell_type": "code", "execution_count": null, "id": "cc1d02fe", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "#merge text and image keywords mapped to content taxonomy\n", "rtb_keywords=pd.concat([text_taxonomy_mapping[[\"term\",\"taxonomy_keyword\",\"cosine_similarity\"]],image_taxonomy_mapping]).sort_values(by='cosine_similarity',ascending=False).reset_index(drop=True)\n", "\n", "#select keywords with a cosine_similarity score greater than your desired threshold (the value should be from 0 to 1)\n", "rtb_keywords[rtb_keywords[\"cosine_similarity\"]> 0.5] # change to desired threshold for cosine score, value between [0,1]" ] } ], "metadata": { "celltoolbar": "Slideshow", "kernelspec": { "display_name": "conda_pytorch_p38", "language": "python", "name": "conda_pytorch_p38" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 5 }