{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Text detection using Amazon Rekognition"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***\n",
"This notebook provides a walkthrough of the [text detection API](https://docs.aws.amazon.com/rekognition/latest/dg/text-detection.html) in Amazon Rekognition. You can quickly identify text in your video and image libraries to catalog footage and photos for marketing, advertising, and media industry use cases.\n",
"***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Initialize stuff"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Update boto3 to current version\n",
"!conda upgrade -y boto3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Check to ensure that current version of boto3 is installed\n",
"import boto3\n",
"print(boto3.__version__)\n",
"\n",
"import botocore\n",
"print(botocore.__version__)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Initialise Notebook\n",
"import boto3\n",
"from IPython.display import HTML, display, Image as IImage\n",
"from PIL import Image, ImageDraw, ImageFont\n",
"import time\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Current AWS Region. Use this to choose corresponding S3 bucket with sample content\n",
"\n",
"mySession = boto3.session.Session()\n",
"awsRegion = mySession.region_name"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Init clients\n",
"rekognition = boto3.client('rekognition')\n",
"s3 = boto3.client('s3')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# S3 bucket that contains sample images and videos\n",
"\n",
"# We are providing sample images and videos in this bucket so\n",
"# you do not have to manually download/upload test images and videos.\n",
"bucketName = \"aws-workshops-\" + awsRegion"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create temporary directory\n",
"# This directory is not needed to call Rekognition APIs.\n",
"# We will only use this directory to download images from S3 bucket and draw bounding boxes\n",
"\n",
"!mkdir m1tmp\n",
"tempFolder = 'm1tmp/'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Detect text in image\n",
"***"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"imageName = \"content-moderation/media/coffee.jpg\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucketName, 'Key': imageName})))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Call Rekognition to detect text in the image"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Call Amazon Rekognition to detect text in the image\n",
"# https://docs.aws.amazon.com/rekognition/latest/dg/API_DetectText.html\n",
"\n",
"detectTextResponse = rekognition.detect_text(\n",
" Image={\n",
" 'S3Object': {\n",
" 'Bucket': bucketName,\n",
" 'Name': imageName,\n",
" }\n",
" },\n",
" Filters={\n",
" 'WordFilter': {\n",
" 'MinConfidence': 90\n",
" }\n",
" }\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Review the raw JSON response from Rekognition"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show JSON response returned by Rekognition Text API (Text Detection)\n",
"# In the JSON response below, you will see detected text, confidence score, and additional information.\n",
"\n",
"display(detectTextResponse)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Display list of detected unsafe text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import string\n",
"unsafeWords = [\"crap\", \"darn\", \"damm\"]\n",
"for textDetection in detectTextResponse[\"TextDetections\"]:\n",
" # strip punctuation before checking match\n",
" text = textDetection[\"DetectedText\"].translate(str.maketrans('', '', string.punctuation))\n",
" if(textDetection[\"Type\"] == \"WORD\" and text in unsafeWords):\n",
" print(\"Detected unsafe word: {}\".format(textDetection[\"DetectedText\"]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Show image with bounding boxes around detected objects"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Define a function that will display image with bounded boxes around recognized text\n",
"# We will call this function in next step\n",
" \n",
"def drawBoundingBoxes (sourceImage, boxes):\n",
" # white, white, light blue, green\n",
" colors = ((255,255,255),(255,255,255),(76,182,252),(52,194,123))\n",
" \n",
" # Download image locally\n",
" imageLocation = tempFolder+os.path.basename(sourceImage)\n",
" s3.download_file(bucketName, sourceImage, imageLocation)\n",
"\n",
" # Draws BB on Image\n",
" bbImage = Image.open(imageLocation)\n",
" draw = ImageDraw.Draw(bbImage)\n",
" width, height = bbImage.size\n",
" col = 0\n",
" maxcol = len(colors)\n",
" line= 3\n",
" for box in boxes:\n",
" x1 = int(box[1]['Left'] * width)\n",
" y1 = int(box[1]['Top'] * height)\n",
" x2 = int(box[1]['Left'] * width + box[1]['Width'] * width)\n",
" y2 = int(box[1]['Top'] * height + box[1]['Height'] * height)\n",
" \n",
" draw.text((x1,y1),box[0],colors[col])\n",
" for l in range(line):\n",
" draw.rectangle((x1-l,y1-l,x2+l,y2+l),outline=colors[col])\n",
" col = (col+1)%maxcol\n",
" \n",
" imageFormat = \"PNG\"\n",
" ext = sourceImage.lower()\n",
" if(ext.endswith('jpg') or ext.endswith('jpeg')):\n",
" imageFormat = 'JPEG'\n",
"\n",
" bbImage.save(imageLocation,format=imageFormat)\n",
"\n",
" display(bbImage)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Extract bounding box information from JSON response above and display image with bounding boxes around text.\n",
"\n",
"boxes = []\n",
"textDetections = detectTextResponse['TextDetections']\n",
"for textDetection in textDetections:\n",
" boxes.append ((textDetection['Type'], textDetection[\"Geometry\"]['BoundingBox']))\n",
" \n",
"drawBoundingBoxes(imageName, boxes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Detect text in image using Filters and Regions of Interest\n",
"***"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"imageName = \"content-moderation/media/coffee.jpg\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucketName, 'Key': imageName})))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Call Amazon Rekognition to detect text in the image\n",
"# https://docs.aws.amazon.com/rekognition/latest/dg/API_DetectText.html\n",
"\n",
"detectTextResponse = rekognition.detect_text(\n",
" Image={\n",
" 'S3Object': {\n",
" 'Bucket': bucketName,\n",
" 'Name': imageName,\n",
" }\n",
" },\n",
" Filters={\n",
" 'WordFilter': {\n",
" 'MinConfidence': 90,\n",
" 'MinBoundingBoxHeight': 0.05,\n",
" 'MinBoundingBoxWidth': 0.02\n",
" },\n",
" 'RegionsOfInterest': [\n",
" {\n",
" 'BoundingBox': {\n",
" 'Width': 0.1,\n",
" 'Height': 0.05,\n",
" 'Left': 0.01,\n",
" 'Top': 0.01\n",
" }\n",
" },\n",
" ]\n",
" }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show JSON response returned by Rekognition Text API (Text Detection)\n",
"# In the JSON response below, you will see detected text, confidence score, and additional information.\n",
"\n",
"display(detectTextResponse)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for textDetection in detectTextResponse[\"TextDetections\"]:\n",
" text = textDetection[\"DetectedText\"]\n",
" if(textDetection[\"Type\"] == \"WORD\"):\n",
" print(\"Word: {}\".format(textDetection[\"DetectedText\"]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Detect text in video\n",
" Text detection in video is an async operation. \n",
"https://docs.aws.amazon.com/rekognition/latest/dg/text-detecting-video-procedure.html.\n",
"\n",
"- First we start a text detection job which returns a Job Id.\n",
"- We can then call `get_text_detection` to get the job status and after job is complete, we can get object metadata.\n",
"- In production use cases, you would usually use StepFunction or SNS topic to get notified when job is complete.\n",
"***"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"videoName = \"content-moderation/media/serverless-bytes.mov\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Call Rekognition to start a job for text detection"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Start video text job\n",
"startTextDetection = rekognition.start_text_detection(\n",
" Video={\n",
" 'S3Object': {\n",
" 'Bucket': bucketName,\n",
" 'Name': videoName,\n",
" }\n",
" },\n",
")\n",
"\n",
"textJobId = startTextDetection['JobId']\n",
"display(\"Job Id: {0}\".format(textJobId))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Wait for text detection job to complete"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait for text detection job to complete\n",
"# In production use cases, you would usually use StepFunction or SNS topic to get notified when job is complete.\n",
"getTextDetection = rekognition.get_text_detection(\n",
" JobId=textJobId\n",
")\n",
"\n",
"while(getTextDetection['JobStatus'] == 'IN_PROGRESS'):\n",
" time.sleep(5)\n",
" print('.', end='')\n",
" \n",
" getTextDetection = rekognition.get_text_detection(\n",
" JobId=textJobId\n",
" )\n",
" \n",
"display(getTextDetection['JobStatus'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Review raw JSON response from Rekognition"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show JSON response returned by Rekognition Text Detection API\n",
"# In the JSON response below, you will see list of detected text.\n",
"# For each detected object, you will see information like Timestamp\n",
"\n",
"display(getTextDetection)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Display recognized text in the video"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"flaggedTextInVideo = [\"AWS\", \"Twitter\"]\n",
"\n",
"theLines = {}\n",
"\n",
"# Display timestamps and objects detected at that time\n",
"strDetail = \"Text detected in video<br>=======================================<br>\"\n",
"strOverall = \"Text in the overall video:<br>=======================================<br>\"\n",
"\n",
"# Objects detected in each frame\n",
"for obj in getTextDetection['TextDetections']:\n",
" if(obj['TextDetection']['Type'] == 'WORD'):\n",
" ts = obj [\"Timestamp\"]\n",
" cconfidence = obj['TextDetection'][\"Confidence\"]\n",
" oname = obj['TextDetection'][\"DetectedText\"]\n",
"\n",
" if(oname in flaggedTextInVideo):\n",
" print(\"Found flagged text at {} ms: {} (Confidence: {})\".format(ts, oname, round(cconfidence,2)))\n",
"\n",
" strDetail = strDetail + \"At {} ms: {} (Confidence: {})<br>\".format(ts, oname, round(cconfidence,2))\n",
" if oname in theLines:\n",
" cojb = theLines[oname]\n",
" theLines[oname] = {\"Text\" : oname, \"Count\": 1+cojb[\"Count\"]}\n",
" else:\n",
" theLines[oname] = {\"Text\" : oname, \"Count\": 1}\n",
"\n",
"# Unique objects detected in video\n",
"for theLine in theLines:\n",
" strOverall = strOverall + \"Name: {}, Count: {}<br>\".format(theLine, theLines[theLine][\"Count\"])\n",
"\n",
"# Display results\n",
"display(HTML(strOverall))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Show video in the player"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show video in a player\n",
"\n",
"s3VideoUrl = s3.generate_presigned_url('get_object', Params={'Bucket': bucketName, 'Key': videoName})\n",
"\n",
"videoTag = \"<video controls='controls' autoplay width='640' height='360' name='Video' src='{0}'></video>\".format(s3VideoUrl)\n",
"\n",
"videoui = \"<table><tr><td style='vertical-align: top'>{}</td></tr></table>\".format(videoTag)\n",
"\n",
"display(HTML(videoui))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"listui = \"<table><tr><td style='vertical-align: top'>{}</td></tr></table>\".format(strDetail)\n",
"display(HTML(listui))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***\n",
"### References\n",
"- https://docs.aws.amazon.com/rekognition/latest/dg/API_DetectText.html\n",
"- https://docs.aws.amazon.com/rekognition/latest/dg/API_StartTextDetection.html\n",
"- https://docs.aws.amazon.com/rekognition/latest/dg/API_GetTextDetection.html\n",
"\n",
"***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You have successfully used Amazon Rekognition to identify text in images and videos."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "conda_python3",
"language": "python",
"name": "conda_python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}