{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Text detection using Amazon Rekognition" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "***\n", "This notebook provides a walkthrough of the [text detection API](https://docs.aws.amazon.com/rekognition/latest/dg/text-detection.html) in Amazon Rekognition. You can quickly identify text in your video and image libraries to catalog footage and photos for marketing, advertising, and media industry use cases.\n", "***" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Initialize stuff" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Update boto3 to current version\n", "!conda upgrade -y boto3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Check to ensure that current version of boto3 is installed\n", "import boto3\n", "print(boto3.__version__)\n", "\n", "import botocore\n", "print(botocore.__version__)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Initialise Notebook\n", "import boto3\n", "from IPython.display import HTML, display, Image as IImage\n", "from PIL import Image, ImageDraw, ImageFont\n", "import time\n", "import os" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Curent AWS Region. Use this to choose corresponding S3 bucket with sample content\n", "\n", "mySession = boto3.session.Session()\n", "awsRegion = mySession.region_name" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Init clients\n", "rekognition = boto3.client('rekognition')\n", "s3 = boto3.client('s3')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# S3 bucket that contains sample images and videos\n", "\n", "# We are providing sample images and videos in this bucket so\n", "# you do not have to manually download/upload test images and videos.\n", "bucketName = \"aws-workshops-\" + awsRegion" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create temporary directory\n", "# This directory is not needed to call Rekognition APIs.\n", "# We will only use this directory to download images from S3 bucket and draw bounding boxes\n", "\n", "!mkdir m1tmp\n", "tempFolder = 'm1tmp/'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Detect text in image\n", "***" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "imageName = \"content-moderation/media/coffee.jpg\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucketName, 'Key': imageName})))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Call Rekognition to detect text in the image" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Call Amazon Rekognition to detect text in the image\n", "# https://docs.aws.amazon.com/rekognition/latest/dg/API_DetectText.html\n", "\n", "detectTextResponse = rekognition.detect_text(\n", " Image={\n", " 'S3Object': {\n", " 'Bucket': bucketName,\n", " 'Name': imageName,\n", " }\n", " },\n", " Filters={\n", " 'WordFilter': {\n", " 'MinConfidence': 90\n", " }\n", " }\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Review the raw JSON reponse from Rekognition" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Show JSON response returned by Rekognition Text API (Text Detection)\n", "# In the JSON response below, you will see detected text, confidence score, and additional information.\n", "\n", "display(detectTextResponse)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Display list of detected unsafe text" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import string\n", "unsafeWords = [\"crap\", \"darn\", \"damm\"]\n", "for textDetection in detectTextResponse[\"TextDetections\"]:\n", " # strip punctuation before checking match\n", " text = textDetection[\"DetectedText\"].translate(str.maketrans('', '', string.punctuation))\n", " if(textDetection[\"Type\"] == \"WORD\" and text in unsafeWords):\n", " print(\"Detected unsafe word: {}\".format(textDetection[\"DetectedText\"]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Show image with bounding boxes around detected objects" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Define a function that will display image with bounded boxes around recognized text\n", "# We will call this function in next step\n", " \n", "def drawBoundingBoxes (sourceImage, boxes):\n", " # blue, green, red, grey\n", " colors = ((255,255,255),(255,255,255),(76,182,252),(52,194,123))\n", " \n", " # Download image locally\n", " imageLocation = tempFolder+os.path.basename(sourceImage)\n", " s3.download_file(bucketName, sourceImage, imageLocation)\n", "\n", " # Draws BB on Image\n", " bbImage = Image.open(imageLocation)\n", " draw = ImageDraw.Draw(bbImage)\n", " width, height = bbImage.size\n", " col = 0\n", " maxcol = len(colors)\n", " line= 3\n", " for box in boxes:\n", " x1 = int(box[1]['Left'] * width)\n", " y1 = int(box[1]['Top'] * height)\n", " x2 = int(box[1]['Left'] * width + box[1]['Width'] * width)\n", " y2 = int(box[1]['Top'] * height + box[1]['Height'] * height)\n", " \n", " draw.text((x1,y1),box[0],colors[col])\n", " for l in range(line):\n", " draw.rectangle((x1-l,y1-l,x2+l,y2+l),outline=colors[col])\n", " col = (col+1)%maxcol\n", " \n", " imageFormat = \"PNG\"\n", " ext = sourceImage.lower()\n", " if(ext.endswith('jpg') or ext.endswith('jpeg')):\n", " imageFormat = 'JPEG'\n", "\n", " bbImage.save(imageLocation,format=imageFormat)\n", "\n", " display(bbImage)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Extract bounding box information from JSON response above and display image with bounding boxes around text.\n", "\n", "boxes = []\n", "textDetections = detectTextResponse['TextDetections']\n", "for textDetection in textDetections:\n", " boxes.append ((textDetection['Type'], textDetection[\"Geometry\"]['BoundingBox']))\n", " \n", "drawBoundingBoxes(imageName, boxes)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Detect text in image using Filters and Regions of Interest\n", "***" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "imageName = \"content-moderation/media/coffee.jpg\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "display(IImage(url=s3.generate_presigned_url('get_object', Params={'Bucket': bucketName, 'Key': imageName})))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Call Amazon Rekognition to detect text in the image\n", "# https://docs.aws.amazon.com/rekognition/latest/dg/API_DetectText.html\n", "\n", "detectTextResponse = rekognition.detect_text(\n", " Image={\n", " 'S3Object': {\n", " 'Bucket': bucketName,\n", " 'Name': imageName,\n", " }\n", " },\n", " Filters={\n", " 'WordFilter': {\n", " 'MinConfidence': 90,\n", " 'MinBoundingBoxHeight': 0.05,\n", " 'MinBoundingBoxWidth': 0.02\n", " },\n", " 'RegionsOfInterest': [\n", " {\n", " 'BoundingBox': {\n", " 'Width': 0.1,\n", " 'Height': 0.05,\n", " 'Left': 0.01,\n", " 'Top': 0.01\n", " }\n", " },\n", " ]\n", " }\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Show JSON response returned by Rekognition Text API (Text Detection)\n", "# In the JSON response below, you will see detected text, confidence score, and additional information.\n", "\n", "display(detectTextResponse)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for textDetection in detectTextResponse[\"TextDetections\"]:\n", " text = textDetection[\"DetectedText\"]\n", " if(textDetection[\"Type\"] == \"WORD\"):\n", " print(\"Word: {}\".format(textDetection[\"DetectedText\"]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Detect text in video\n", " Text detection in video is an async operation. \n", "https://docs.aws.amazon.com/rekognition/latest/dg/text-detecting-video-procedure.html.\n", "\n", "- First we start a text detection job which returns a Job Id.\n", "- We can then call `get_text_detection` to get the job status and after job is complete, we can get object metadata.\n", "- In production use cases, you would usually use StepFunction or SNS topic to get notified when job is complete.\n", "***" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "videoName = \"content-moderation/media/serverless-bytes.mov\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Call Rekognition to start a job for text detection" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Start video text job\n", "startTextDetection = rekognition.start_text_detection(\n", " Video={\n", " 'S3Object': {\n", " 'Bucket': bucketName,\n", " 'Name': videoName,\n", " }\n", " },\n", ")\n", "\n", "textJobId = startTextDetection['JobId']\n", "display(\"Job Id: {0}\".format(textJobId))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Wait for text detection job to complete" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Wait for text detection job to complete\n", "# In production use cases, you would usually use StepFunction or SNS topic to get notified when job is complete.\n", "getTextDetection = rekognition.get_text_detection(\n", " JobId=textJobId\n", ")\n", "\n", "while(getTextDetection['JobStatus'] == 'IN_PROGRESS'):\n", " time.sleep(5)\n", " print('.', end='')\n", " \n", " getTextDetection = rekognition.get_text_detection(\n", " JobId=textJobId\n", " )\n", " \n", "display(getTextDetection['JobStatus'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Review raw JSON reponse from Rekognition" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Show JSON response returned by Rekognition Text Detection API\n", "# In the JSON response below, you will see list of detected text.\n", "# For each detected object, you will see information like Timestamp\n", "\n", "display(getTextDetection)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Display recognized text in the video" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "flaggedTextInVideo = [\"AWS\", \"Twitter\"]\n", "\n", "theLines = {}\n", "\n", "# Display timestamps and objects detected at that time\n", "strDetail = \"Text detected in video
=======================================
\"\n", "strOverall = \"Text in the overall video:
=======================================
\"\n", "\n", "# Objects detected in each frame\n", "for obj in getTextDetection['TextDetections']:\n", " if(obj['TextDetection']['Type'] == 'WORD'):\n", " ts = obj [\"Timestamp\"]\n", " cconfidence = obj['TextDetection'][\"Confidence\"]\n", " oname = obj['TextDetection'][\"DetectedText\"]\n", "\n", " if(oname in flaggedTextInVideo):\n", " print(\"Found flagged text at {} ms: {} (Confidence: {})\".format(ts, oname, round(cconfidence,2)))\n", "\n", " strDetail = strDetail + \"At {} ms: {} (Confidence: {})
\".format(ts, oname, round(cconfidence,2))\n", " if oname in theLines:\n", " cojb = theLines[oname]\n", " theLines[oname] = {\"Text\" : oname, \"Count\": 1+cojb[\"Count\"]}\n", " else:\n", " theLines[oname] = {\"Text\" : oname, \"Count\": 1}\n", "\n", "# Unique objects detected in video\n", "for theLine in theLines:\n", " strOverall = strOverall + \"Name: {}, Count: {}
\".format(theLine, theLines[theLine][\"Count\"])\n", "\n", "# Display results\n", "display(HTML(strOverall))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Show video in the player" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Show video in a player\n", "\n", "s3VideoUrl = s3.generate_presigned_url('get_object', Params={'Bucket': bucketName, 'Key': videoName})\n", "\n", "videoTag = \"\".format(s3VideoUrl)\n", "\n", "videoui = \"
{}
\".format(videoTag)\n", "\n", "display(HTML(videoui))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "listui = \"
{}
\".format(strDetail)\n", "display(HTML(listui))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "***\n", "### References\n", "- https://docs.aws.amazon.com/rekognition/latest/dg/API_DetectText.html\n", "- https://docs.aws.amazon.com/rekognition/latest/dg/API_StartTextDetection.html\n", "- https://docs.aws.amazon.com/rekognition/latest/dg/API_GetTextDetection.html\n", "\n", "***" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You have successfully used Amazon Rekognition to identify text in images an videos." ] } ], "metadata": { "kernelspec": { "display_name": "conda_python3", "language": "python", "name": "conda_python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }