# ==================================================================================
# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================
#
# stvblogCreatePhrasesFromTranscription
# by: Rob Dachowski
# For questions or feedback, please contact robdac@amazon.com
#
# Purpose: The lambda creates a file that contains phrased groupings of words derived from
#          the Transcribe JSON output.  It delineates the phrases in a way that (hopefully)
#          translates without incident.  The format is {phrase_id}[phrase text], one phrase per line.
# For example, it might look like:  {1}[This is a test phrase]
#
# Change Log:
#          3/01/21: Initial version
#
# ==================================================================================

import json
import uuid
import datetime
import re

import boto3
from botocore.exceptions import ClientError

import ssmparms as sp
import stmparms as stm
import stverrors


# ==================================================================================
# Function: createNewPhrase
# Purpose: simply create an empty phrase record
# Parameters:
#                 None
# ==================================================================================
def createNewPhrase():
    """Return a new, empty phrase record.

    Every call returns a fresh dict with its own 'phrase_words' list, so
    records never share state.
    """
    return {
        'phrase_id': '',
        'start_time': '',
        'end_time': '',
        'time_code': '',
        'phrase_words': [],
    }


# ==================================================================================
# Function: getTimeCode
# Purpose: Format and return a string that contains the converted number of seconds into SRT or VTT format
# Parameters:
#                 seconds - the duration in seconds to convert to HH:MM:SS,mmm
#                 format - "srt" or "vtt"
# ==================================================================================
def getTimeCode( seconds, format ):
    """Convert a number of seconds into an SRT or VTT time code string.

    Parameters:
        seconds - offset in seconds (int or float)
        format  - "srt" gives HH:MM:SS,mmm and "vtt" gives HH:MM:SS.mmm

    Returns:
        The formatted time code, or None when `format` is neither "srt" nor
        "vtt" (the same result callers got before for an unknown format).
    """
    if format == "srt":
        separator = ","
    elif format == "vtt":
        separator = "."
    else:
        return None

    # Work in whole milliseconds.  Rounding (instead of truncating the float
    # remainder) keeps values such as 1.001 from losing a millisecond to
    # floating point representation error.
    total_ms = int(round(float(seconds) * 1000))

    # Split into hours / minutes / seconds / milliseconds so that offsets of
    # one hour or more roll over into the hours field instead of producing a
    # minutes value above 59.
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    secs, millis = divmod(remainder, 1000)

    return "%02d:%02d:%02d%s%03d" % (hours, minutes, secs, separator, millis)


# ==================================================================================
# Function: getPhrasesFromTranscript
# Purpose: Based on the JSON transcript provided by Amazon Transcribe, group the transcript
#          items into phrases
# Parameters:
#                 transcript - the JSON output from Amazon Transcribe
#                 format - "srt" or "vtt", the time code format to use for the phrases
# ==================================================================================
def getPhrasesFromTranscript( transcript, format, maxWords=10 ):
    """Group the items of an Amazon Transcribe result into phrases.

    Parameters:
        transcript - JSON text (str or bytes) whose ['results']['items'] list
                     holds the transcript items.  Each item is read for its
                     'type', 'alternatives'[0]['content'] and, for
                     "pronunciation" items, 'start_time' and 'end_time'.
        format     - "srt" or "vtt"; passed through to getTimeCode
        maxWords   - number of pronunciation items after which a phrase is
                     closed (default 10, the value that was hard-coded before)

    Returns:
        A list of phrase records as produced by createNewPhrase(), with
        'phrase_id', 'start_time', 'end_time' and 'phrase_words' filled in.
        'phrase_id' and the time codes stay empty for a phrase that contains
        no pronunciation item.
    """
    items = json.loads( transcript )['results']['items']
    totalItems = len(items)

    phrases = []
    phrase = createNewPhrase()
    wordCount = 0
    phraseCount = 1
    itemCount = 0

    print("\t---> Creating phrases from transcript...")
    print("\t\t---> Total Items to Process: ", len(items) )

    while itemCount < totalItems:
        item = items[itemCount]

        # Only pronunciation items carry timing information; punctuation
        # items are added to the phrase without touching the time codes.
        if item["type"] == "pronunciation":
            if not phrase['phrase_id']:
                # First spoken word of this phrase: it fixes the start time
                # and assigns the next sequential phrase id.
                phrase["start_time"] = getTimeCode( float(item["start_time"]), format )
                phrase['phrase_id'] = str(phraseCount)
                phraseCount += 1
            # The phrase ends where its most recent spoken word ends.
            phrase["end_time"] = getTimeCode( float(item["end_time"]), format )
            wordCount += 1

        phrase['phrase_words'].append(item['alternatives'][0]["content"])
        itemCount += 1

        # Close the phrase once it is full or the transcript is exhausted.
        if wordCount >= maxWords or itemCount == totalItems:
            # Look ahead: punctuation that directly follows the last word
            # belongs to this phrase rather than to the start of the next.
            # An explicit bounds check replaces the former catch-all
            # exception handler, which could also hide malformed items.
            if itemCount < totalItems and items[itemCount]['type'] == "punctuation":
                phrase['phrase_words'].append(items[itemCount]['alternatives'][0]["content"])
                itemCount += 1

            phrases.append(phrase)
            phrase = createNewPhrase()
            wordCount = 0

    print( "\t---> Phrases calculated" )

    return phrases


# Code point ranges of scripts that are written without spaces between words.
# Words starting in one of these ranges are joined to the preceding text
# directly, which is also what happened to them before this check existed.
_UNSPACED_SCRIPT_RANGES = (
    (0x0E00, 0x0E7F),   # Thai
    (0x3040, 0x30FF),   # Hiragana and Katakana
    (0x3400, 0x4DBF),   # CJK Unified Ideographs Extension A
    (0x4E00, 0x9FFF),   # CJK Unified Ideographs
)


def _startsSpacedWord( text ):
    """Return True when `text` starts a word that needs a space in front of it."""
    first = text[:1]
    # str.isalnum() is Unicode aware, so accented Latin, Cyrillic, Greek, ...
    # letters count as word characters, not as punctuation.
    if not first.isalnum():
        return False
    code = ord(first)
    return not any(low <= code <= high for low, high in _UNSPACED_SCRIPT_RANGES)


# ==================================================================================
# Function: getPhraseText
# Purpose: For a given phrase, return the string of words including punctuation
# Parameters:
#                 phrase - the phrase record whose 'phrase_words' are to be joined
# ==================================================================================
def getPhraseText( phrase ):
    """Join the words of a phrase into display text.

    A single space is inserted before every word except the first.  Items
    that do not start with a letter or digit (punctuation) are attached to
    the preceding text without a space.

    Parameters:
        phrase - dict with a 'phrase_words' list of strings

    Returns:
        The joined text; an empty string when the phrase has no words.
    """
    parts = []
    for word in phrase["phrase_words"]:
        # `parts` is non-empty exactly when this is not the first item.
        if parts and _startsSpacedWord(word):
            parts.append(" ")
        parts.append(word)
    return "".join(parts)


# ==================================================================================
# Function: getTranslatableText
# Purpose: For the transcribed phrases, return the list of phrases structured for easy
#          translation
# Parameters:
#                 phrases - the list of phrase records to turn into translatable text
# ==================================================================================
def getTranslatableText( phrases ):
    """Render phrases as translatable text, one "{phrase_id}[text]" per line.

    Parameters:
        phrases - list of phrase records; each is read for 'phrase_id' and is
                  passed to getPhraseText for its text

    Returns:
        The concatenated lines, each terminated by a newline.  An empty list
        yields an empty string (previously a stray "]" line was returned).
    """
    return "".join(
        "{" + str(phrase['phrase_id']) + "}[" + getPhraseText( phrase ) + "]\n"
        for phrase in phrases
    )


# ==================================================================================
# Function: lambda_handler
# Purpose: This is the "main" code for this lambda function
# Parameters:
#                 event - the JSON input structure containing the parameters from the step function process
#                 context - the Lambda context object (only logged)
# ==================================================================================
def lambda_handler(event, context):
    """Create the phrase file and the translatable text file for a transcript.

    Reads the Transcribe JSON output from S3, groups it into phrases, writes
    the phrases (as JSON) and the translatable text back to S3, records both
    keys under event['Outputs']['transcribe'] and stores the updated
    parameters through stm.update_stm_parms.

    Returns:
        The (mutated) event.

    Raises:
        stverrors.stvDynamoDBError - when the parameters cannot be loaded
        stverrors.stvError - when an S3 read/write fails or the parameters
                             cannot be stored
    """
    # NOTE(review): only the module `stverrors` is imported by this file, so the
    # previously bare names stvError / stvDynamoDBError could never resolve.  They
    # are assumed to be defined in stverrors -- confirm against that module.

    print("===> stvblogCreatePhrasesFromTranscription: " + "\nEvent:" + str(event) + "\nContext: " + str( context ) )
    print( "\t---> Boto Version: ", boto3.__version__ )

    # Load the parms from DynamoDB
    parms = stm.get_stm_parms( event['Outputs']['process']['ProcessName'])
    if not parms:
        # We have an issue, so get out
        raise stverrors.stvDynamoDBError( "*** Unable to load parms from DynamoDB ***")

    # set up some shortcuts
    pc = parms['Item']['Config']
    pi = parms['Item']['Inputs']
    po = parms['Item']['Outputs']

    sls = pi['sourceLanguageShort']
    processId = po['process']['uuid']

    s3 = boto3.resource( 's3')
    bucket = s3.Bucket( pc['baseBucketName'])

    # Open and read the transcript file
    tFileKey = processId + '/' + pc['transcriptionOutput'] + '/' + 'transcribe_' + processId + '.json'
    try:
        response = bucket.Object( key=tFileKey).get()
    except ClientError as e:
        print("*** Error opening on the Transcribe Job JSON", tFileKey + " ***")
        raise stverrors.stvError("*** Error Code: ", e.response['Error']['Message'] + " ***") from e

    # Get the bytes from the "body" section
    transcriptIn = response['Body'].read()

    # Get the phrases from the transcript
    phrases = getPhrasesFromTranscript( transcriptIn, "srt" )

    # Write the raw phrases out to S3 for use by the TranslateTranscription Step
    phraseKey = processId + '/' + pc['subtitleInput'] + '/' + 'phrases-' + sls + '_' + processId + '.txt'
    try:
        bucket.put_object(Body=json.dumps(phrases).encode(), ContentType="text/plain", Key=phraseKey )
    except ClientError as e:
        # The log line used to reference the misspelled name `phaseKey`, which
        # raised a NameError and masked the real S3 error.
        print("*** Error writing ", phraseKey + " ***")
        raise stverrors.stvError("*** Error Code: ", e.response['Error']['Message'] + " ***") from e

    # Now create the translatable input file from the phrases
    translatableText = getTranslatableText( phrases )

    # And write the translatable input to a file
    textKey = processId + '/' + pc['translationInput'] + '/' + 'translatableText-' + sls + '_' + processId + '.txt'
    try:
        bucket.put_object(Body=translatableText, ContentType="text/plain", Key=textKey )
    except ClientError as e:
        print("*** Error writing ", textKey + " ***")
        raise stverrors.stvError("*** Error Code: ", e.response['Error']['Message'] + " ***") from e

    # Set up the response; note that this is the incoming event, updated in place
    response = event
    response['Outputs']['transcribe']['phraseTextKey'] = phraseKey
    response['Outputs']['transcribe']['translatableTextKey'] = textKey

    # Put the output back into DynamoDB
    stored = stm.update_stm_parms(
        response['Outputs']['process']['ProcessName'],
        response['Config'],
        response['Inputs'],
        response['Outputs'],
        event['Targets'],
    )
    if not stored:
        raise stverrors.stvError( "*** Error writing to the stvblog table ***" )

    print('===>stvblogCreatePhrasesFromTranscription Complete')
    return response