Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: MIT-0

* [Import libraries](#libraries)
* [Connect to DocumentDB](#dbconn)
* [Preview images](#preview)
* [Upload images](#upload)
* [Analyze and ingest images](#ingest)
* [Explore images using DocumentDB queries](#query)

# Import libraries <a class='anchor' id='libraries'></a>

In [1]:
import boto3
import glob
import ipyplot
import json
import os
import pandas as pd
import plotly.express as px
from pymongo import MongoClient, ASCENDING
import s3fs

# Limit the width of a displayed DataFrame column to 80 characters
pd.set_option('display.max_colwidth', 80)

In [2]:
# Notebook parameters

# Name of the CloudFormation stack
stack_name = "docdb-rekognition"

# Upload destination: your S3 bucket, and the S3 path where you want the images uploaded
s3_bucket = 'docdb-blog'
s3_prefix = 'rekognition/pics/'

In [3]:
# AWS service clients
s3 = boto3.client("s3")
rekognition = boto3.client('rekognition')

# Path to where the images were downloaded onto your SageMaker instance
local_prefix = 'pics'

# Connect to DocumentDB <a class='anchor' id='dbconn'></a>

In [4]:
# Get DocumentDB credentials stored in Secrets Manager
def get_secret(stack_name):
    """
    Fetch the DocumentDB credentials kept in AWS Secrets Manager.

    The secret is looked up under the name ``<stack_name>-DocDBSecret`` using
    a Secrets Manager client created for the region of a new boto3 session,
    and its ``SecretString`` is parsed as JSON.

    Parameters:
        stack_name: prefix of the secret name (the CloudFormation stack name).

    Returns:
        The decoded JSON content of the secret.
    """
    boto_session = boto3.session.Session()
    secrets_client = boto_session.client(
        service_name='secretsmanager',
        region_name=boto_session.region_name,
    )

    response = secrets_client.get_secret_value(SecretId=f'{stack_name}-DocDBSecret')
    return json.loads(response['SecretString'])

In [5]:
# Set up a connection to the Amazon DocumentDB database
from urllib.parse import quote_plus

secret = get_secret(stack_name)

db_username = secret['username']
db_password = secret['password']
db_port = secret['port']
db_host = secret['host']

# SSL connection string.
# The user name and password are percent-escaped before being placed in the URI:
# reserved characters such as '@', ':', '/' or '%' in a password would otherwise
# make the URI invalid or cause it to be parsed incorrectly.
uri_str = (
    f"mongodb://{quote_plus(db_username)}:{quote_plus(db_password)}@{db_host}:{db_port}/"
    "?ssl=true&ssl_ca_certs=rds-combined-ca-bundle.pem"
    "&replicaSet=rs0&readPreference=secondaryPreferred&retryWrites=false"
)
client = MongoClient(uri_str)

In [6]:
# Verify the connection by asking the cluster for its details and showing the hosts it reports
ismaster_reply = client["admin"].command("ismaster")
ismaster_reply["hosts"]

['documentdbinstanceone-ba0lmvhl0dml.chuzv8sgxhbr.us-west-2.docdb.amazonaws.com:27017',
 'documentdbinstancetwo-iulkk0vmfiln.chuzv8sgxhbr.us-west-2.docdb.amazonaws.com:27017',
 'documentdbinstancethree-epyakg0eahpb.chuzv8sgxhbr.us-west-2.docdb.amazonaws.com:27017']

In [7]:
# Names of the database and of the collection
db_name = "db"
coll_name = "coll"

# Database object, and the collection object inside it
db = client[db_name]
coll = db[coll_name]

In [8]:
# Optional: drop existing data in the collection if the collection exists.
# NOTE: this is destructive -- skip this cell if you want to keep data from an earlier run.
coll.drop()

# Preview images <a class='anchor' id='preview'></a>

In [9]:
# Collect the local .jpg paths in sorted order
pic_local_paths = sorted(glob.glob(f"{local_prefix}/*.jpg"))

# Preview images (at most 15 of them)
ipyplot.plot_images(images=pic_local_paths, max_images=15, img_width=180)

# Upload images to S3 <a class='anchor' id='upload'></a>

In [10]:
# Upload images to S3
# The bucket handle does not change between images, so create it once
# instead of opening a new session and S3 resource on every iteration.
upload_bucket = boto3.Session().resource('s3').Bucket(s3_bucket)

for pic_local_path in pic_local_paths:
    # The object key is the S3 prefix followed by the image's file name
    pic_filename = os.path.basename(pic_local_path)
    upload_bucket.Object(os.path.join(s3_prefix, pic_filename)).upload_file(pic_local_path)

In [11]:
# Get S3 keys for images
fs = s3fs.S3FileSystem()

# "<bucket>/<prefix>" without a trailing slash, i.e. the name a folder placeholder entry would have
listing_path = f'{s3_bucket}/{s3_prefix}'.rstrip('/')

# Drop only the entry that names the prefix itself (a folder placeholder, when one exists).
# Slicing off the first entry unconditionally loses a real picture when no placeholder is listed.
# The split is limited to one cut so a key that happens to contain "<bucket>/" stays intact.
pic_keylist = [
    entry.split(f'{s3_bucket}/', 1)[1]
    for entry in fs.ls(f's3://{listing_path}/')
    if entry.rstrip('/') != listing_path
]

In [12]:
# Display the S3 keys of the images that are analyzed in the next section
pic_keylist

['rekognition/pics/coleen-rivas-OZ2rS2zCjNo-unsplash.jpg',
 'rekognition/pics/erik-eastman-4HG5hlhmZg8-unsplash.jpg',
 'rekognition/pics/fikri-rasyid-amI09sbNZdE-unsplash.jpg',
 'rekognition/pics/gusandy-maulana-Rs3Z-j8QTEM-unsplash.jpg',
 'rekognition/pics/kourosh-qaffari-RrhhzitYizg-unsplash.jpg',
 'rekognition/pics/pickawood-8SfXsep8EIA-unsplash.jpg',
 'rekognition/pics/ranurte-Hnmb9wQucG4-unsplash.jpg',
 'rekognition/pics/riley-sullivan-kTb76cLODyE-unsplash.jpg',
 'rekognition/pics/robert-f-9t5sV4KarVA-unsplash.jpg',
 'rekognition/pics/roberto-carlos-roman-K77xDt7E1fE-unsplash.jpg',
 'rekognition/pics/simon-berger-39SHYToxfiQ-unsplash.jpg',
 'rekognition/pics/svetlana-kuznetsova-VgItkeIq6Ek-unsplash.jpg',
 'rekognition/pics/tom-ungerer-10Kd3Pm4BDg-unsplash.jpg',
 'rekognition/pics/volkan-vardar-tYBlm33PMxU-unsplash.jpg',
 'rekognition/pics/yeshi-kangrang-wTD1-_u8x1g-unsplash.jpg']

# Analyze images with Rekognition and ingest data into DocumentDB <a class='anchor' id='ingest'></a>

In [13]:
# Example Rekognition output: label detection for the first image in the key list
example_image = {'S3Object': {'Bucket': s3_bucket, 'Name': pic_keylist[0]}}
rekognition.detect_labels(Image=example_image, MinConfidence=50, MaxLabels=100)

{'Labels': [{'Name': 'Outdoors',
   'Confidence': 98.58585357666016,
   'Instances': [],
   'Parents': []},
  {'Name': 'Garden',
   'Confidence': 96.23029327392578,
   'Instances': [],
   'Parents': [{'Name': 'Outdoors'}]},
  {'Name': 'Arbour',
   'Confidence': 93.65332794189453,
   'Instances': [],
   'Parents': [{'Name': 'Garden'}, {'Name': 'Outdoors'}]},
  {'Name': 'Person',
   'Confidence': 93.00440979003906,
   'Instances': [{'BoundingBox': {'Width': 0.016103893518447876,
      'Height': 0.03213529288768768,
      'Left': 0.6525371670722961,
      'Top': 0.9264869689941406},
     'Confidence': 93.00440979003906},
    {'BoundingBox': {'Width': 0.010800352320075035,
      'Height': 0.020640190690755844,
      'Left': 0.781416118144989,
      'Top': 0.8592491149902344},
     'Confidence': 78.98234558105469},
    {'BoundingBox': {'Width': 0.017044249922037125,
      'Height': 0.02785704843699932,
      'Left': 0.7455113530158997,
      'Top': 0.8547402620315552},
     'Confidence': 66

In [14]:
# Analyze and ingest images data
for pic_key in pic_keylist:

    # Analyze an image with Rekognition
    pic_result = rekognition.detect_labels(
        Image={
            'S3Object': {
                'Bucket': s3_bucket,
                'Name': pic_key,
            },
        },
        MinConfidence=50,
        MaxLabels=100,
    )

    # Build the document: the image file name (last component of the S3 key)
    # and the labels Rekognition returned for it
    doc = {
        "img": pic_key.split('/')[-1],
        "Labels": pic_result['Labels'],
    }

    # Ingest results into DocumentDB
    coll.insert_one(doc)

# Explore images using DocumentDB queries <a class='anchor' id='query'></a>

In [15]:
def preview_imgs(result):
    """
    Preview the images returned by a query.

    Parameters:
        result: iterable of documents (for example a query cursor), each
            holding the image file name under the 'img' key.

    Returns:
        None. The images are displayed with ipyplot (at most 10 of them);
        when there is nothing to show, a short message is printed instead.
    """
    # Process result as pandas dataframe
    frame = pd.DataFrame(result)

    # A query without matches produces a frame that has no 'img' column;
    # report that instead of failing with a KeyError on the lookup below
    if 'img' not in frame.columns:
        print("No matching images.")
        return

    # Get corresponding local image paths
    img_paths = [f'{local_prefix}/{img_name}' for img_name in frame['img'].tolist()]

    # Display images
    ipyplot.plot_images(
        images=img_paths,
        max_images=10,
        img_width=180,
    )

## Frequency counts

In [16]:
# Count images: the empty filter matches every document in the collection,
# and the ingest loop inserts one document per image
coll.count_documents({})

15

In [17]:
# Histogram count of labels with confidence >=90.0
label_count_pipeline = [
    {"$unwind": "$Labels"},                                     # one document per label entry
    {"$match": {"Labels.Confidence": {"$gte": 90.0}}},          # keep labels with confidence >= 90
    {"$group": {"_id": "$Labels.Name", "count": {"$sum": 1}}},  # count occurrences per label name
    {"$sort": {"count": -1}},                                   # highest count first
]
result = coll.aggregate(label_count_pipeline)

In [18]:
# Plot the label counts as a bar chart: label name on the x axis, count on the y axis
fig = px.bar(data_frame=result, x='_id', y='count')

# Save the chart as an HTML file, then display it
fig.write_html("label_histogram.html")
fig.show()

## Select images with minimum confidence threshold (without index)

In [19]:
# Query images with a 'Book' label of 90% or more confidence.
# $elemMatch requires both conditions to hold for the same entry of the Labels array.
book_filter = {"Labels": {"$elemMatch": {"Name": "Book", "Confidence": {"$gte": 90.0}}}}
img_only = {"_id": 0, "img": 1}  # return only the image file name

result = coll.find(book_filter, img_only)
preview_imgs(result)

In [20]:
# Query images with a 'Book' label with 90% or more confidence, and a 'Person' label with 90% or more confidence
confident_book = {"Labels": {"$elemMatch": {"Name": "Book", "Confidence": {"$gte": 90.0}}}}
confident_person = {"Labels": {"$elemMatch": {"Name": "Person", "Confidence": {"$gte": 90.0}}}}

# Both conditions target the same "Labels" field, so they are combined with $and
result = coll.find(
    {"$and": [confident_book, confident_person]},
    {"_id": 0, "img": 1},
)
preview_imgs(result)

## Select images with minimum confidence threshold (with index)
You can also create an index to speed up the previous two queries. To create the index, run the following:

In [21]:
# Create a compound index on label name and label confidence to help identify labels in pictures
label_index_keys = [
    ("Labels.Name", ASCENDING),
    ("Labels.Confidence", ASCENDING),
]
coll.create_index(label_index_keys, name="idx_labels")

'idx_labels'

In [22]:
# Query for 'Book' label with 90% or more confidence.
# The $elemMatch condition already implies the two dotted-path conditions before it;
# they are presumably spelled out so that the planner can use the idx_labels index.
book_conditions = [
    {"Labels.Name": "Book"},
    {"Labels.Confidence": {"$gte": 90.0}},
    {"Labels": {"$elemMatch": {"Name": "Book", "Confidence": {"$gte": 90.0}}}},
]
query_book = coll.find({"$and": book_conditions}, {"_id": 0, "img": 1})

preview_imgs(query_book)

In [23]:
# Show the query plan for the 'Book' query.
# We can see that the planner has chosen an Index Scan (IXSCAN) for this query now
query_book.explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'db.coll',
  'winningPlan': {'stage': 'FETCH',
   'inputStage': {'stage': 'IXSCAN', 'indexName': 'idx_labels'}}},
 'serverInfo': {'host': 'documentdbinstancetwo-iulkk0vmfiln',
  'port': 27017,
  'version': '3.6.0'},
 'ok': 1.0}

In [24]:
# Query for 'Book' and 'Person' labels, both with 90% or more confidence
book_person_conditions = [
    {"Labels.Name": "Book"},
    {"Labels.Confidence": {"$gte": 90.0}},
    {"Labels.Name": "Person"},
    {"Labels.Confidence": {"$gte": 90.0}},  # unnecessary (repeats the condition above), but adding for clarity
    {"Labels": {"$elemMatch": {"Name": "Book", "Confidence": {"$gte": 90.0}}}},
    {"Labels": {"$elemMatch": {"Name": "Person", "Confidence": {"$gte": 90.0}}}},
]
query_book_person = coll.find({"$and": book_person_conditions}, {"_id": 0, "img": 1})

preview_imgs(query_book_person)

In [25]:
# Show the query plan for the 'Book' and 'Person' query.
# Again, we can see that the planner has chosen an Index Scan (IXSCAN) for this query now
query_book_person.explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'db.coll',
  'winningPlan': {'stage': 'FETCH',
   'inputStage': {'stage': 'IXSCAN', 'indexName': 'idx_labels'}}},
 'serverInfo': {'host': 'documentdbinstancetwo-iulkk0vmfiln',
  'port': 27017,
  'version': '3.6.0'},
 'ok': 1.0}

## Select images with a specified number of instances of a label (array queries)

In [26]:
# Find all images with at least 4 instances of a person, with 90% or more confidence.
# "Instances.3" is the fourth element of the Instances array (positions start at zero),
# so it exists only when the label has at least 4 instances.
at_least_four_people = {
    "Name": "Person",
    "Confidence": {"$gte": 90.0},
    "Instances.3": {"$exists": True},
}
result = coll.find({"Labels": {"$elemMatch": at_least_four_people}}, {"_id": 0, "img": 1})

preview_imgs(result)

In [27]:
# Find all images with at least 2 but fewer than 4 instances of a person, with 90% or more confidence
two_or_three_people = {
    "Name": "Person",
    "Confidence": {"$gte": 90.0},
    "Instances.1": {"$exists": True},   # a second instance exists: at least 2
    "Instances.3": {"$exists": False},  # no fourth instance: fewer than 4
}
result = coll.find({"Labels": {"$elemMatch": two_or_three_people}}, {"_id": 0, "img": 1})

preview_imgs(result)