# ==================================================================================
# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================
#
# stvblogCombinePhrasesAndTranslation
# by: Rob Dachowski
# For questions or feedback, please contact robdac@amazon.com
#
# Purpose: This lambda merges the original time-encoded phrases from the Transcribe output
#          with the translated, boundary-marked phrases.
#
# Change Log:
#          3/1/2021: Initial version
#
# ==================================================================================

import json
import uuid
import datetime
import boto3
from botocore.exceptions import ClientError
import ssmparms as sp
import stmparms as stm
import stverrors


# ==================================================================================
# Function: lambda_handler
# Purpose: This is the "main" code for this lambda function
# Parameters:
#          event   - the JSON input structure containing the parameters from the step function process
#          context - the Lambda context object (not used)
# ==================================================================================
def lambda_handler(event, context):

    # debugging message
    print("===> stvblogCombinePhrasesAndTranslation: " + "\nEvent: " + str(event))
    print("\t---> Boto Version: ", boto3.__version__)

    # Load the parms from DynamoDB
    parms = stm.get_stm_parms(event['input']['Outputs']['process']['ProcessName'])

    if not parms:
        # We have an issue, so get out
        raise stverrors.stvDynamoDBError("*** Unable to load parms from DynamoDB ***")

    # Set up shortcuts into the parms structure
    pc = parms['Item']['Config']
    pi = parms['Item']['Inputs']
    po = parms['Item']['Outputs']
    ptgts = parms['Item']['Targets']
    pt = event['item']['translate']

    sls = pi['sourceLanguageShort']
    slf = pi['sourceLanguageFull']
    tls = event['item']['translate']['targetLanguageShort']
    tlf = event['item']['translate']['targetLanguageFull']

    # Set up the AWS resource for S3
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(pc['baseBucketName'])

    # === STEP 1 ===
    # Set up the file key
    phraseFileKey = po['transcribe']['phraseTextKey']

    try:
        # Get the phrases file from S3 and read it into memory
        print('\t---> Getting phrases from: ', phraseFileKey)
        phraseFile = bucket.Object(key=phraseFileKey)
        phraseResponse = phraseFile.get()
    except ClientError as e:
        print("***Exception***: Issue reading phrase file: ", phraseFileKey)
        raise stverrors.stvError("*** Error Code: " + e.response['Error']['Message'] + " ***")

    # Get the bytes from the "body" section
    phrasesIn = phraseResponse['Body'].read()

    # === STEP 2 ===
    # First, split off the s3://baseBucket info from the S3 URI
    leftPart = "s3://" + pc['baseBucketName'] + '/'
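    # Illustrative example (the exact key layout is an assumption, not taken from the
    # Translate output specification): if the translate job's s3Uri were
    #     s3://my-base-bucket/1234-uuid/translated/es/translatableText-en_1234-uuid.txt
    # then leftPart would be "s3://my-base-bucket/", and splitting the full URI on it
    # would leave rightPart[1] as
    #     "1234-uuid/translated/es/translatableText-en_1234-uuid.txt"
    # i.e. the object key relative to the base bucket, which is what bucket.Object() needs.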
    fullURI = event['item']['translate']['s3Uri']
    rightPart = fullURI.split(leftPart)

    print("\t---> Translation Key Info: ")
    print("\t---> full: ", fullURI)
    print("\t---> left: ", leftPart)
    print("\t---> right: ", rightPart)

    # Set up the key to the Full Translation File (FTF) and read it into memory
    # translatedFileKey = rightPart[1] + targetLanguageShort + ".translatableText-" + event['transcribeJobParms']['sourceLanguage'] + "_" + pp['uuid'] + ".txt"
    translatedFileKey = rightPart[1]

    try:
        print("\t---> Opening file: ", translatedFileKey)
        translatedFile = bucket.Object(key=translatedFileKey)
        translatedResponse = translatedFile.get()
    except ClientError as e:
        print("***Exception***: Issue reading translation file: ", translatedFileKey)
        raise stverrors.stvError("*** Error Code: " + e.response['Error']['Message'] + " ***")

    # Get the bytes from the "body" section
    translationIn = translatedResponse['Body'].read()

    # === STEP 3 ===
    # Now convert the byte streams from the PF and FTF into the appropriate structures

    # The FTF is a list of boundary-marked entries, so split on the end-of-entry marker
    translatedPhrases = translationIn.decode('utf-8').split("]\n")

    # The original phrase file is JSON, so decode it into a list of phrase dictionaries
    originalPhrases = json.loads(phrasesIn.decode('utf-8'))

    # Set up the new structure to hold the list of combined phrases
    newPhrases = []

    # debugging
    print("tphrase length: ", len(translatedPhrases))

    # Walk through each translated phrase, split it up to remove the markers, then combine the
    # originalPhrase fields (start_time, end_time, etc.) with the translated phrase that aligns to
    # the phrase id marker. Finally, add the combined phrase to the new structure.
    for i in range(0, len(translatedPhrases)):

        tphrase = translatedPhrases[i]

        if len(tphrase) == 0:
            print("\t---> Phrase doesn't contain text. Breaking.")
            break

        # Split off the leading "{n}" phrase id marker from the bracketed phrase text
        p1 = tphrase.split('[')
        num = p1[0].lstrip(' {')
        num = num.rstrip('} ')

        if len(p1) < 2:
            break

        p2 = p1[1].strip('[')

        phr = {}
        phr['phrase_id'] = num
        phr['start_time'] = originalPhrases[i]['start_time']
        phr['end_time'] = originalPhrases[i]['end_time']
        phr['timecode'] = originalPhrases[i]['start_time'] + " --> " + originalPhrases[i]['end_time']
        phr['phrase_words'] = originalPhrases[i]['phrase_words']
        phr['words'] = p2

        # Now add the new phrase to the structure
        newPhrases.append(phr)

    # debugging
    # print("===>newPhrases:", newPhrases)

    # Write the combined structure to S3 for further processing
    combinedPhrasesKey = po['process']['uuid'] + '/' + pc['subtitleInput'] + '/' + tls + '/' + 'combinedPhrases-' + tls + '_' + po['process']['uuid'] + '.txt'

    # Convert the JSON structure to a string so that we can write it out to S3
    combinedPhrasesString = json.dumps(newPhrases)

    # Put the new S3 object
    bucket.put_object(Body=combinedPhrasesString, ContentType="text/plain", Key=combinedPhrasesKey)

    # Generate and return the response
    response = event
    response['item']['subtitle']['combinedPhrasesKey'] = combinedPhrasesKey
    ptgts[event['index']]['subtitle']['combinedPhrasesKey'] = combinedPhrasesKey

    # Put the output back into DynamoDB
    if stm.update_stm_target(event['input']['Outputs']['process']['ProcessName'], ptgts[event['index']], event['index']):
        print('===> stvblogCombinePhrasesAndTranslation Complete')
        return response
    else:
        raise stverrors.stvError("*** Error writing to the stvblog table ***")
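
# ==================================================================================
# Illustrative sketch (not executed by the lambda): the parsing in STEP 3 assumes each
# translated phrase arrives as "{<phrase_id>} [<translated text>]". The sample strings
# below are hypothetical; only the marker convention is taken from the parsing code.
#
#     sample = " {7} [hola y bienvenidos"              # closing "]" already removed by the split on "]\n"
#     parts = sample.split('[')                        # -> [' {7} ', 'hola y bienvenidos']
#     phrase_id = parts[0].lstrip(' {').rstrip('} ')   # -> '7'
#     words = parts[1]                                 # -> 'hola y bienvenidos'
#
# The combined record then carries the original phrase's timing plus the translated words,
# e.g. {"phrase_id": "7", "timecode": "<start> --> <end>", "words": "hola y bienvenidos", ...}
# ==================================================================================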