import os

import boto3
import pandas as pd


def lambda_handler(event, context):
    # Splits the remarks file into chunks and stages them in S3 for Comprehend.

    # Set up the necessary client and resource variables.
    s3client = boto3.client('s3')
    s3 = boto3.resource('s3')
    input_bucket = os.environ['S3Bucket']
    DeploymentType = os.environ['Type']
    input_key = 'inputs/mv_war_main_prop_remark.txt'

    # Change the working directory to '/tmp/', the only writable path in Lambda.
    os.chdir('/tmp/')

    # Split the key prefix from the file name.
    input_file = input_key.split('/')[1]

    # Path where the downloaded file will live inside '/tmp/'.
    input_file_path = '/tmp/' + input_file
    print("File: " + input_file)
    print("File Path: " + input_file_path)

    # Download the input object into '/tmp/' and load it as a DataFrame.
    s3client.download_file(input_bucket, input_key, input_file_path)
    df = pd.read_csv(input_file_path, encoding='latin-1', header=0)

    # List the files currently in /tmp.
    print("OS File List: " + str(os.listdir('/tmp/')))

    # Chunk size: roughly 90 MB worth of rows per output file.
    Limit = 85000
    totalRows = len(df)
    numFiles = totalRows // Limit + 1

    # Counters for the output file number, rows in the current chunk, and row-level errors.
    file_number = 1
    rowTracker = 0
    errors = 0

    if DeploymentType == 'Test':
        # Test deployments write a single small file of about 1% of the rows.
        numFiles = 1
        Limit = totalRows // 100

    print('Expected number of chunk files: ' + str(numFiles))

    def flush_chunk(lines, number):
        # Write the buffered lines to a local file, upload it to S3, then delete the local copy.
        chunk_name = str(number) + '-' + input_file
        with open(chunk_name, 'w+') as fileWriter:
            fileWriter.writelines(lines)
        s3.meta.client.upload_file('/tmp/' + chunk_name, input_bucket,
                                   'model-input/' + chunk_name)
        os.remove('/tmp/' + chunk_name)

    # Iterate through the DataFrame once, flushing a chunk file whenever the row limit is reached.
    toWrite = []
    for index, row in df.iterrows():
        if rowTracker >= Limit:
            flush_chunk(toWrite, file_number)
            rowTracker = 0
            toWrite = []
            file_number += 1
            if DeploymentType == 'Test':
                # Test deployments stop after the first chunk.
                break
        try:
            # Drop any characters that cannot be encoded as UTF-8 and strip newlines
            # so each remark occupies exactly one line in the output file.
            remark = row['TEXT_REMARK'].encode('utf-8', 'ignore').decode('utf-8')
            writeStr = str(row['SN_WAR']) + ',' + remark.replace('\n', ' ').replace('\r', '') + '\n'
            toWrite.append(writeStr)
        except Exception:
            errors += 1
            if errors <= 10:
                print(str(row['TEXT_REMARK']))
        rowTracker += 1

    # Write one last time for the final, partially filled chunk.
    if DeploymentType != 'Test' and toWrite:
        flush_chunk(toWrite, file_number)

    print('Number of Errors: ' + str(errors))
    print('Total length of df: ' + str(len(df)))
    print("OS File List: " + str(os.listdir('.')))
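

# --- Local invocation sketch (not part of the Lambda deployment) ---
# A minimal way to exercise lambda_handler outside AWS, assuming valid AWS
# credentials and that the input object 'inputs/mv_war_main_prop_remark.txt'
# already exists in the target bucket. The bucket name below is a placeholder,
# not the real deployment value; the real values come from the Lambda
# environment variables 'S3Bucket' and 'Type'.
if __name__ == '__main__':
    os.environ.setdefault('S3Bucket', 'my-example-bucket')  # hypothetical bucket name
    os.environ.setdefault('Type', 'Test')                   # 'Test' limits output to one small chunk
    # This handler does not read the event or context, so empty values suffice.
    lambda_handler(event={}, context=None)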