# Setup

In [None]:
#import helper modules
import sys
import subprocess 

#import needed aws modules
import sagemaker
from sagemaker import get_execution_role
import boto3
import s3fs
fs = s3fs.S3FileSystem()

#install needed packages for machine learning
packages=['ruamel.yaml','sentence-transformers', 'torch','torchvision', 'spacy', 'setuptools', 'wheel', 'gensim', '-Uq']
subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + packages)
import torch

#import bert packages
import sentence_transformers
from sentence_transformers import SentenceTransformer, util

#import needed nlp and text processing libraries
!python -m spacy download en_core_web_sm
import spacy
import en_core_web_sm
import nltk
import re
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
nltk.download('stopwords')
from nltk.corpus import stopwords

#import image processing utilities
from IPython.display import HTML, display, Image as IImage
from PIL import Image, ImageDraw, ImageFont, ExifTags, ImageColor
import io
from io import BytesIO
import matplotlib.pyplot as plt

#import other utilities
import json
import pandas as pd
import numpy as np
import datetime
import time
import glob
import os
import csv
from pprint import pprint
import tarfile

In [None]:
# AWS account id of the caller; used later to build the path of the topic modeling results
aws_account_id = boto3.client('sts').get_caller_identity()['Account']

# SageMaker session and the execution role returned for this notebook
sess = sagemaker.Session()
role = get_execution_role()
print(role)

# Amazon S3 locations -- replace each placeholder value with your own
bucket = 'bucket-name'  # bucket where your taxonomy and content are stored
path_taxonomy = 'taxonomy-path-name'  # folder where the Content Taxonomy is stored
file_taxonomy = 'taxonomy-file-name'  # file for the Content Taxonomy saved as csv
path_texts = 'texts-path-name'  # folder where extracted text files are stored
path_images = 'images-path-name'  # folder where extracted images are stored
path_topics = 'topics-path-name'  # folder where Amazon Comprehend topic modeling outputs are stored

# Region and service clients used by the rest of the notebook
region = boto3.Session().region_name
s3_client = boto3.client("s3")
comprehend_client = boto3.client('comprehend')
rekognition_client = boto3.client("rekognition")

# Test if variable defaults for file paths have been changed. No action needed.
# Only the first placeholder that is still unchanged is reported.
unset_message = next(
    (
        message
        for current, default, message in (
            (bucket, 'bucket-name',
             "Replace the variable 'bucket' with the name of your bucket"),
            (path_taxonomy, 'taxonomy-path-name',
             "Replace the variable 'path_taxonomy' with the name the folder where the content taxonomy is stored"),
            (file_taxonomy, 'taxonomy-file-name',
             "Replace the variable 'file_taxonomy' with the name of the file for the content taxonomy saved as a csv file"),
        )
        if current == default
    ),
    None,
)
if unset_message is not None:
    print(unset_message)
 
 


# 1. Create Content Taxonomy Feature Vectors

In [None]:
# Load the content taxonomy csv from its Amazon S3 location into a dataframe
taxonomy_path = f's3://{bucket}/{path_taxonomy}/{file_taxonomy}'
read_taxonomy = pd.read_csv(taxonomy_path)

In [None]:
"""
The function below is designed to prepare the IAB Tech Lab's Content Taxonomy. 
If using another taxonomy, modify the code below as needed

"""

def prepare_taxonomy(taxonomy_df):
    """
    Concatenate IAB Tech Lab content taxonomy tiers and prepare keywords for BERT embedding.
    Use this function as-is if using the IAB Content Taxonomy

    Parameters (input):
    ----------
    taxonomy_df : Content taxonomy dataframe. It must contain the columns
                  'Unique ID ' (note the trailing space), 'Parent', 'Name',
                  'Tier 1', 'Tier 2' and 'Tier 3'

    Returns (output):
    -------
    df_clean : Content taxonomy with tiers in the taxonomy concatenated in a 'combined' column
    keyword_list: List of concatenated content taxonomy keywords
    ids: List of ids (as strings) for the content taxonomy keywords
    """
    tier_columns = ['Name', 'Tier 1', 'Tier 2', 'Tier 3']

    df = taxonomy_df[['Unique ID ', 'Parent'] + tier_columns]
    df_str = df.astype({"Unique ID ": 'str', "Parent": 'str', "Tier 1": 'str', "Tier 2": 'str', "Tier 3": 'str'})
    # casting to str turns missing cells into the literal text 'nan'; blank those out
    df_clean = df_str.replace('nan', '')

    def _join_tiers(row):
        # skip missing and blank cells so unused tiers do not leave stray spaces in the keyword
        parts = (str(value) for value in row.dropna())
        return ' '.join(part for part in parts if part.strip())

    #create a column that concatenates all tiers for each taxonomy keyword
    df_clean['combined'] = df_clean[tier_columns].apply(_join_tiers, axis=1)

    #turn taxonomy keywords into a list of strings as prep for encoding with BERT sentence transformer
    keyword_list = df_clean['combined'].to_list()

    #get list of taxonomy ids
    ids = df_clean['Unique ID '].to_list()

    return df_clean, keyword_list, ids

# Build the cleaned taxonomy dataframe, its concatenated keyword strings and the matching ids
taxonomy_df, taxonomy_terms, taxonomy_ids = prepare_taxonomy(read_taxonomy)
# Show the first rows of the cleaned taxonomy
taxonomy_df.head()

In [None]:
#check the number of taxonomy terms and view the first five for a quick audit
print(len(taxonomy_terms))
print(taxonomy_terms[0:5])

In [None]:
#initialize BERT sentence transformer model
# the same model object is reused later to embed topic keywords and image labels
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

#create embeddings for taxonomy terms using sentence transformer
# one embedding per entry of taxonomy_terms, requested with normalize_embeddings=True
taxonomy_embeddings = model.encode(taxonomy_terms, normalize_embeddings=True)

In [None]:
# View a sample of the taxonomy embeddings.
# Only the first three terms are printed; printing every term and its full
# vector would flood the cell output.
for term, embedding in zip(taxonomy_terms[:3], taxonomy_embeddings[:3]):
    print("Term:", term)
    print("Embedding:", embedding)

# 2. Create Topic Model Feature Vectors (Keywords)

Use gensim's `CoherenceModel` to find the optimal number of topics for your collection of article texts. Before computing coherence scores, the first step is to preprocess the texts for NLP.

In [None]:
"""
To train a topic model that fits your data, first find the optimal number of topics for your collection of articles. 
You will use gensim's coherence_model to find the optimal number of topics for your collection of article texts. 
Before using coherence_model, the first step is to preprocess the texts for NLP
"""


#Test if the variable default for the path to text files has been changed. No action needed
# This only prints a reminder; the cell keeps running even when the placeholder is still set
if path_texts == 'texts-path-name':
 print("Replace the variable 'path_texts' with the name of the folder where extracted text files are stored ")



def iterate_bucket_items(Bucket, Path='', client=None):
    """
    Create a generator that iterates over all objects in an Amazon S3 bucket
    Using this function, you will iterate over all the text files in the Amazon S3 bucket

    Parameters (input):
    ----------
    Bucket: The name of the Amazon S3 bucket, stored in the 'bucket' variable created earlier
    Path: The key prefix (folder) to list, e.g. the 'path_texts' variable created earlier.
          The default '' lists the whole bucket
    client: Optional Amazon S3 client to list with. Defaults to the notebook-level 's3_client'

    Returns (output):
    -------
    A generator yielding the object key (not a full s3:// uri) of every object under the prefix

    """
    # fall back to the notebook-level client only when the caller did not pass one
    if client is None:
        client = s3_client

    # list_objects_v2 returns results in pages, so walk every page
    paginator = client.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=Bucket, Prefix=Path)

    for page in page_iterator:
        # pages without matches carry no 'Contents' entry
        for item in page.get('Contents', []):
            yield item['Key']

# Download every text object under path_texts and keep its decoded content in a list
corpus = []
for object_key in iterate_bucket_items(Bucket=bucket, Path=path_texts):
    s3_object = s3_client.get_object(Bucket=bucket, Key=object_key)
    corpus.append(s3_object['Body'].read().decode("utf-8"))

# Show the first article as a sample
print(corpus[0])

In [None]:
#tokenize each article in the corpus
def tokenize_corpus(corpus):
    """
    Lazily tokenize each article of the corpus.

    Parameters (input):
    ----------
    corpus : Iterable of article texts (strings)

    Returns (output):
    -------
    A generator yielding, for each article, the list of tokens produced by
    gensim.utils.simple_preprocess with deacc=True
    """
    for article in corpus:
        #collapse new lines and other whitespace runs into single spaces
        # (raw string: '\s' is not a valid escape in a normal string literal)
        flattened = re.sub(r'\s+', ' ', article)
        #remove article quotations
        unquoted = flattened.replace('"', '')
        yield gensim.utils.simple_preprocess(str(unquoted), deacc=True)
 
#preprocess tokenized corpus for topic modelling
def clean_stops(corpus):
    """
    Tokenize the corpus and drop English stop words.

    Parameters (input):
    ----------
    corpus : Iterable of article texts (strings)

    Returns (output):
    -------
    corpus_cleaned : List with one list of tokens per article, stop words removed
    """
    #prepare stopwords (a set gives constant-time membership tests)
    stop_words = set(stopwords.words('english'))
    #stop_words.update([]) # optionally add a list over-indexed words in your corpus that don't give context (e.g.: name of content publisher)

    #remove stop words; the articles are already tokenized by tokenize_corpus,
    #so the tokens are filtered directly instead of being re-tokenized
    corpus_cleaned = [
        [word for word in tokens if word not in stop_words]
        for tokens in tokenize_corpus(corpus)
    ]
    return corpus_cleaned
 
def n_grams(corpus):
    """
    Clean the corpus and merge frequently co-occurring tokens into bigrams/trigrams.

    Parameters (input):
    ----------
    corpus : Iterable of article texts (strings)

    Returns (output):
    -------
    corpus_trigrams : List with one list of tokens per article, after applying
                      the bigram model and then the trigram model
    """
    corpus_words = clean_stops(corpus)

    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(corpus_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[corpus_words], threshold=100)

    # Wrap the trained phrase models in Phraser objects for applying them to the articles
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    #create n_grams: the trigram model is applied on top of the bigram output
    corpus_trigrams = [trigram_mod[bigram_mod[each]] for each in corpus_words]
    return corpus_trigrams


# Load the small English spaCy pipeline with the parser and NER components disabled
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


def lemmatize(corpus, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """
    Build n-grams for the corpus and keep the lemma of every token whose
    part-of-speech tag is allowed.

    Parameters (input):
    ----------
    corpus : Iterable of article texts (strings)
    allowed_postags : Part-of-speech tags (token.pos_ values) to keep

    Returns (output):
    -------
    corpus_processed : List with one list of lemmas per article
    """
    # immutable default above; a set makes the per-token membership test cheap
    allowed = set(allowed_postags)
    corpus_processed = []
    for tokens in n_grams(corpus):
        # re-join the tokens so spaCy can tag the article as one text
        story = nlp(" ".join(tokens))
        corpus_processed.append([token.lemma_ for token in story if token.pos_ in allowed])
    return corpus_processed


corpus_words = lemmatize(corpus, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

#view a sample preprocessed text file
print(corpus_words[:1])

In [None]:
#create a topic modeling dictionary and corpus for a baseline topic model
# id2word is built from the lemmatized articles in corpus_words
id2word = corpora.Dictionary(corpus_words)

# compute term document frequency
# each article becomes the bag-of-words returned by id2word.doc2bow
corpus_tdf = [id2word.doc2bow(text) for text in corpus_words]

#create baseline topic model with arbitrary number of topics --lda
# num_topics=12 is the arbitrary baseline; random_state is fixed at 100
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus_tdf,
 id2word=id2word,
 num_topics=12, 
 random_state=100,
 update_every=1,
 chunksize=100,
 passes=10,
 alpha='auto',
 per_word_topics=True)

#calculate a baseline coherence score for the baseline topic model
# the score is stored in coherence_lda; this cell does not print it
coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus_words, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()


In [None]:
#find the optimal number of topics for your text corpus starting with the baseline topic model
def compute_coherence_scores(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute coherence scores for various number of topics for your topic model.
    Adjust the parameters below based on your data

    Parameters (input):
    ----------
    dictionary : Gensim dictionary created earlier from input texts
    corpus : Gensim corpus created earlier from input texts
    texts : List of input texts
    limit : The maximum number of topics to test (exclusive). Amazon Comprehend can detect up to 100 topics in a collection
    start : The smallest number of topics to test
    step : The increment between two tested numbers of topics

    Returns (output):
    -------
    models : List of LDA topic models
    coherence_scores : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_scores = []
    models = []
    for num_topics in range(start, limit, step):
        # use the dictionary and texts passed in by the caller, not notebook globals
        model = gensim.models.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        models.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_scores.append(coherencemodel.get_coherence())

    return models, coherence_scores


models, coherence_scores = compute_coherence_scores(dictionary=id2word, corpus=corpus_tdf, texts=corpus_words, start=2, limit=100, step=3)

In [None]:
"""
Visualize the coherence scores for topic models trained with different values for number of topics
The number of topics with the highest coherence score is the optimal number of topics for your text corpus
Note that Amazon Comprehend can detect up to 100 topics in a collection
"""

%matplotlib inline

# these values must match the start/limit/step passed to compute_coherence_scores
limit=100; start=2; step=3;
x = range(start, limit, step)
plt.plot(x, coherence_scores)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# the labels must be a sequence of strings; a bare string in parentheses is
# still a string, so the legend would label the line with its first letter only
plt.legend(["coherence_scores"], loc='best')
plt.show()

In [None]:
# Placeholder value: it is passed as NumberOfTopics when the Amazon Comprehend job is started
num_topics= 0 #change to the number of topics with the highest coherence score

Process results of Amazon Comprehend Topic Modeling

In [None]:
"""
Run Amazon Comprehend topic modeling job using num_topics as the number of topics
Input format requires all documents saved as individual text files in your Amazon S3 bucket

"""

#Test if the variable defaults for the path to text files and topics have been changed. No action needed
if path_texts == 'texts-path-name':
    print("Replace the variable 'path_texts' with the name of the folder where extracted text files are stored")
elif path_topics == 'topics-path-name':
    print("Replace the variable 'path_topics' with the name of the folder where you want to store topic modeling output files")


# Start the topic modeling job and keep its id for polling and for locating the output
job_id = comprehend_client.start_topics_detection_job(
    InputDataConfig={
        'S3Uri': 's3://{}/{}'.format(bucket, path_texts), #Amazon S3 uri of the bucket and folder with all source text files
        'InputFormat': 'ONE_DOC_PER_FILE',
    },
    OutputDataConfig={
        'S3Uri': 's3://{}/{}'.format(bucket, path_topics) #Amazon S3 uri of the bucket and folder where you store output from Amazon Comprehend topic modeling
    },
    DataAccessRoleArn='arn:aws:iam::account-ID:role/data-access-role', #change this to the arn of a data access role...
    JobName='Name-of-topic-modeling-job', #change to the desired name for the topic modeling job
    NumberOfTopics=num_topics)['JobId']

# Poll the job status once a minute until it reaches a terminal state
sleep_time_sec = 60 # 1 min
times_polled = 0
max_polls = 60 # max 1 hr
while times_polled < max_polls:
    status = comprehend_client.describe_topics_detection_job(JobId=job_id)['TopicsDetectionJobProperties']['JobStatus']
    if status in ['COMPLETED', 'FAILED', 'STOPPED']:
        break
    # count the poll so that max_polls is actually enforced
    times_polled += 1
    time.sleep(sleep_time_sec)
else:
    # Exceeded max number of polls: stop here instead of continuing without results
    raise TimeoutError(
        "Topic modeling job {} did not finish after {} polls (last status: {})".format(job_id, max_polls, status)
    )

# Report the terminal state; the next steps need the job to be COMPLETED
print("Topic modeling job status:", status)

In [None]:
"""
Amazon Comprehend topic modeling outputs two csv files compressed as a tarfile:
 1. topic-terms.csv -- a list of the topics detected in the collection 
 2. doc-topics.csv -- lists the documents associated with a topic and the proportion of the document that is concerned with the topic.

You will need to download the topic modeling output locally and decompress the tarfile to access them
"""
 
#download the topic modeling output locally
# 'topics_data' (underscore): a hyphen is not allowed in a variable name
file_name = 'output.tar.gz'
topics_data = '{}/{}-{}-{}/{}'.format(path_topics, aws_account_id, 'TOPICS', job_id, 'output/output.tar.gz')
s3_client.download_file(bucket, topics_data, file_name)

#decompress the output file into the current working directory
# the with-block closes the archive even if extraction fails
# NOTE(review): extractall writes whatever member paths the archive contains;
# only use it on archives produced by your own topic modeling job
with tarfile.open(file_name, "r:gz") as tar:
    tar.extractall()


In [None]:
#read the topic terms file into a pandas dataframe
# the file is read from the current working directory
topics=pd.read_csv('topic-terms.csv')

#view a sample topic and its keywords
# shows the rows whose 'topic' column equals 0
topics[topics['topic']==0]

In [None]:
#read the document-to-topic assignments file into a pandas dataframe
# the file is read from the current working directory
doc_topics=pd.read_csv('doc-topics.csv')

#view a sample of the document-to-topic assignments
doc_topics.head()

In [None]:
#transfer individual topic keywords, their topic ids and their weights to parallel lists
topic_keywords=topics['term'].to_list()
topic_ids=topics['topic'].to_list()
keyword_weights=topics['weight'].to_list()

# number of topic keywords (one per row of the topic terms file)
print(len(topic_keywords))

In [None]:
#create and show embeddings for topic keywords
# uses the same sentence transformer model and normalize_embeddings=True as the taxonomy embeddings
keyword_embeddings = model.encode(topic_keywords, normalize_embeddings=True)
# prints every keyword together with its embedding
for keyword, embedding in zip(topic_keywords, keyword_embeddings):
 print("Keyword:", keyword)
 print("Embedding:", embedding)

# 3. Prep for Image Label Feature Vectors

In [None]:

"""
Create a function to extract object labels from a given image using Amazon Rekognition

"""

def get_image_labels(image_loc, min_confidence=60):
    """
    Extract object labels from a given image using Amazon Rekognition

    Parameters (input):
    ----------
    image_loc : Location of the image, opened through the notebook-level 'fs' file system
    min_confidence : Minimum confidence score a label needs to be kept, value between [0,100]

    Returns (output):
    -------
    labels : List of label names whose confidence is at least min_confidence
    """
    # send the raw image bytes to the label detection API
    with fs.open(image_loc, "rb") as im:
        response = rekognition_client.detect_labels(Image={"Bytes": im.read()})

    labels = [
        label["Name"]
        for label in response["Labels"]
        if label["Confidence"] >= min_confidence
    ]
    return labels

# 4. Compute Cosine Similarity Scores 

In [None]:
def compute_similarity(entity_embeddings, entity_terms, taxonomy_embeddings, taxonomy_terms):
    """
    Compute cosine scores between entity embeddings and taxonomy embeddings

    Parameters (input):
    ----------
    entity_embeddings : Embeddings for either topic keywords from Amazon Comprehend or image labels from Amazon Rekognition
    entity_terms : Terms for topic keywords or image labels
    taxonomy_embeddings : Embeddings for the content taxonomy
    taxonomy_terms : Terms for the taxonomy keywords

    Returns (output):
    -------
    mapping_df : Dataframe with one row per distinct entity term, holding its best matching
                 taxonomy keyword and their cosine similarity score, sorted by decreasing score
    """
    columns = ["term", "taxonomy_keyword", "cosine_similarity"]

    # nothing to match (for example, no image label was returned): give back an empty mapping
    if len(entity_terms) == 0:
        return pd.DataFrame(columns=columns).astype({'cosine_similarity': 'float'})

    #calculate cosine score, pairing each entity embedding with each taxonomy keyword embedding
    cosine_scores = util.pytorch_cos_sim(entity_embeddings, taxonomy_embeddings)

    # collect every (entity, taxonomy keyword) pair; the row range covers ALL
    # entities, including the last one
    rows = []
    for i in range(cosine_scores.shape[0]):
        for j in range(cosine_scores.shape[1]):
            rows.append([entity_terms[i], taxonomy_terms[j], float(cosine_scores[i][j])])

    #move values to a dataframe
    mapping_df = pd.DataFrame(rows, columns=columns)
    mapping_df['cosine_similarity'] = mapping_df['cosine_similarity'].astype('float')

    #keep only the best scoring taxonomy keyword for each term
    mapping_df = mapping_df.sort_values(by=['term', 'cosine_similarity'], ascending=False)
    drop_dups = mapping_df.drop_duplicates(subset=['term'], keep='first')

    #Sort cosine similarity scores in decreasing order
    mapping_df = drop_dups.sort_values(by=['cosine_similarity'], ascending=False).reset_index(drop=True)
    return mapping_df
 
#compute cosine_similarity score between topic keywords and content taxonomy keywords using BERT embeddings 
text_taxonomy_mapping=compute_similarity(keyword_embeddings, topic_keywords, taxonomy_embeddings, taxonomy_terms) 

#View some mappings from topic keywords to keywords on the content taxonomy and check the similarity score for each
text_taxonomy_mapping.head(10)

In [None]:
def pair_text_taxonomy(mapping_df, topic_df):
    """
    Create a mapping of topic keywords to content taxonomy keywords with topic keywords grouped in their assigned topics

    Parameters (input):
    ----------
    mapping_df : Output dataframe from the compute_similarity function
                 (columns 'term', 'taxonomy_keyword', 'cosine_similarity')
    topic_df : Topic terms dataframe (columns 'topic', 'term', 'weight')

    Returns (output):
    -------
    ordered_df : Dataframe mapping of topic keywords to content taxonomy keywords, grouped by
                 topic and, within each topic, sorted by decreasing cosine similarity
    """
    df = mapping_df.drop_duplicates(subset=["term", "taxonomy_keyword"], keep='first')

    # attach the topic and weight of every keyword; a keyword present in several topics yields several rows
    merged = df.merge(topic_df, how='inner', on=['term'])

    # sort on both keys at once: sorting by similarity and then by topic in two
    # separate passes does not guarantee that the similarity order survives
    topic_taxonomy_df = merged.sort_values(
        by=['topic', 'cosine_similarity'], ascending=[True, False]
    ).reset_index(drop=True)

    ordered_df = topic_taxonomy_df[["topic", "term", "taxonomy_keyword", "weight", "cosine_similarity"]]
    return ordered_df


#view text to taxonomy mapping with topics grouped together
# joins the keyword-to-taxonomy mapping with the topic terms dataframe
topic_taxonomy_mapping=pair_text_taxonomy(text_taxonomy_mapping,topics)
topic_taxonomy_mapping.head()

# 5. Analyze Webpage Content

In [None]:
#Test if path_texts variable was updated: No action needed
if path_texts == 'texts-path-name':
    print("Replace the variable 'path_texts' with the name of the folder where extracted text files are stored")


#Select a sample article and review its content
file_name = 'my-sample-file.txt' #replace with the name of a text file in your Amazon S3 bucket
file_path = '{}/{}'.format(path_texts, file_name)

#Test if file_name variable was updated: No action needed
# checked before the Amazon S3 read so the reminder is shown even when the read fails on the placeholder name
if file_name == 'my-sample-file.txt':
    print("Replace the variable 'file_name' with the name of a text file in your Amazon S3 bucket")

# read the article from Amazon S3 and print its text
file_object = s3_client.get_object(Bucket=bucket, Key=file_path)
file_content = file_object['Body'].read()

print(file_content.decode("utf-8"))

Text to Taxonomy Mapping

In [None]:
#check the topic assignment(s) for your selected article and identify the topic number with the highest proportion
# filters the document-to-topic assignments on rows whose 'docname' equals file_name
doc_topics[doc_topics['docname']== file_name]

In [None]:
#check topic keywords and content taxonomy mapping for the topic number identified in the previous step
# NOTE: this rebinds text_taxonomy_mapping, which previously held the mapping for all topic keywords
text_taxonomy_mapping = topic_taxonomy_mapping[topic_taxonomy_mapping['topic'] == 0] #replace the 0 with your topic number
# sort_values returns a new dataframe, so the sorted result must be assigned back
text_taxonomy_mapping = text_taxonomy_mapping.sort_values(by='cosine_similarity', ascending=False)
print(text_taxonomy_mapping)

Image to Taxonomy Mapping

In [None]:
#Test if path_images variable was updated: No action needed
if path_images == 'images-path-name':
    print("Replace the variable 'path_images' with the name of the folder where extracted images are stored")


#Select an image from the same webpage as the article you selected
image_name = 'my-sample-image.jpg' #replace with the name of an image file in your Amazon S3 bucket
image_path = 's3://{}/{}/{}'.format(bucket, path_images, image_name)


#Test if image_name variable was updated: No action needed
# the reminder now refers to an image file (it previously said "text file")
if image_name == 'my-sample-image.jpg':
    print("Replace the variable 'image_name' with the name of an image file in your Amazon S3 bucket")


#View the image
with fs.open(image_path) as im:
    display(Image.open(im))



In [None]:
#Call Amazon Rekognition label detection API to get labels

# get_image_labels returns the names of the labels that pass its confidence threshold
image_labels=get_image_labels(image_path)
print(image_labels)

In [None]:
#create BERT embeddings for each of the image labels
# same model and normalize_embeddings=True setting as for the taxonomy embeddings
label_embeddings = model.encode(image_labels, normalize_embeddings=True)

#view the mapping of image labels to content taxonomy for the image selected 
image_taxonomy_mapping=compute_similarity(label_embeddings, image_labels, taxonomy_embeddings, taxonomy_terms)
print(image_taxonomy_mapping)

Select Keywords for Realtime Bidding

In [None]:
#merge text and image keywords mapped to content taxonomy, highest cosine similarity first
rtb_keywords = pd.concat(
    [text_taxonomy_mapping[["term", "taxonomy_keyword", "cosine_similarity"]], image_taxonomy_mapping]
).sort_values(by='cosine_similarity', ascending=False).reset_index(drop=True)

#select keywords with a cosine_similarity score greater than your desired threshold
# cosine similarity never exceeds 1, so the threshold must be a value from 0 to 1 (a threshold of 50 selects nothing)
rtb_keywords[rtb_keywords["cosine_similarity"] > 0.5] # change to desired threshold for cosine score, value between [0,1]