import numpy as np
from math import sqrt, ceil
import cv2
import time
import os
import pandas as pd
import boto3
import pathlib
import zlib
import awswrangler as wr
import io

from malware_detection_utils.utils import logMessage

LOGTYPE_ERROR = 'ERROR'
LOGTYPE_INFO = 'INFO'
LOGTYPE_DEBUG = 'DEBUG'


def upload_file_using_client(bucket_name, bucket_prefix, object_name):
    """
    Uploads a file to an S3 bucket using the S3 client object.

    :return: None
    """
    s3 = boto3.client("s3")
    file_name = os.path.join(pathlib.Path(object_name).parent.resolve(), object_name)
    s3.upload_file(file_name, bucket_name, bucket_prefix + "/" + object_name)


def main():
    start_time = time.time()
    array_index = int(os.environ.get('AWS_BATCH_JOB_ARRAY_INDEX', '0'))
    aws_region = os.environ.get('AWS_REGION')
    input_bucket_csv = os.environ.get('INPUT_BUCKET_CSV')
    input_bucket_csv_prefix = os.environ.get('INPUT_BUCKET_CSV_PREFIX')
    input_bucket = os.environ.get('INPUT_BUCKET')
    image_bucket = os.environ.get('IMAGE_BUCKET')
    metadatadb_bucket = os.environ.get('METADATADB_BUCKET')
    metadata_key = os.environ.get('METADATA_KEY')

    logMessage(f"AWS_REGION {aws_region}", LOGTYPE_INFO)
    logMessage(f"INPUT_BUCKET_CSV {input_bucket_csv}", LOGTYPE_INFO)
    logMessage(f"INPUT_BUCKET_CSV_PREFIX {input_bucket_csv_prefix}", LOGTYPE_INFO)
    logMessage(f"INPUT_BUCKET {input_bucket}", LOGTYPE_INFO)
    logMessage(f"IMAGE_BUCKET {image_bucket}", LOGTYPE_INFO)
    logMessage(f"METADATADB_BUCKET {metadatadb_bucket}", LOGTYPE_INFO)

    # Read the metadata DB CSV into a DataFrame
    logMessage("Reading metadatadb CSV", LOGTYPE_INFO)
    s3 = boto3.client("s3")
    obj = s3.get_object(Bucket=metadatadb_bucket, Key=metadata_key)
    data = io.BytesIO(obj["Body"].read())
    colnames = ["id", "md5", "sha1", "sha256", "total", "positives", "list",
                "filetype", "submitted", "user_id", "length", "entropy"]
    metadatadf = pd.read_csv(data, sep=",", names=colnames, header=0)

    # Initialize variables
    height = 9999
    width = 9999
    malware_type = ""

    # Read the group file for this array job index
    logMessage(f"Reading s3://{input_bucket_csv}/{input_bucket_csv_prefix}-{array_index}.csv", LOGTYPE_INFO)
    df = pd.read_csv(f"s3://{input_bucket_csv}/{input_bucket_csv_prefix}-{array_index}.csv")
    # Make sure indexes pair with the number of rows
    df = df.reset_index()

    s3_resource = boto3.resource('s3', region_name=aws_region)
    for index, row in df.iterrows():
        start_time = time.time()
        file_name = str(row['Key'])
        df_list = metadatadf.loc[metadatadf.id == row['Key']]
        logMessage(f"Processing {file_name}", LOGTYPE_INFO)
        s3_object = s3_resource.Bucket(input_bucket).Object(file_name)
        data = s3_object.get()['Body'].read()
        # Data length in bytes
        data_len = len(data)
        if data_len < 15000000:
            # d is a vector of data_len bytes
            d = np.frombuffer(data, dtype=np.uint8)
            # Assume the image shape should be close to square:
            # compute the square root and round up
            sqrt_len = int(ceil(sqrt(data_len)))
            # Required length in bytes
            new_len = sqrt_len * sqrt_len
            # Number of bytes to pad (zeros appended to the end of d)
            pad_len = new_len - data_len
            # Pad d with zeros at the end.
            # padded_d = np.pad(d, (0, pad_len))
            padded_d = np.hstack((d, np.zeros(pad_len, np.uint8)))
            # Reshape the 1D array into a sqrt_len x sqrt_len 2D array
            # (im is a grayscale image).
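            # Each byte now maps to one grayscale pixel (0-255), producing a
            # square "byte plot" of the binary, presumably for downstream
            # image-based malware classification.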
            im = np.reshape(padded_d, (sqrt_len, sqrt_len))
            # Save the image, then re-read it to confirm its dimensions
            cv2.imwrite(f'{file_name}.png', im)
            img = cv2.imread(f'{file_name}.png', 0)
            height, width = img.shape[:2]
            # Upload to S3 only if the image is at most 4096 x 4096
            # and the metadata marks the file as 'Whitelist' (benign)
            if height <= 4096 and width <= 4096 and (df_list['list'] == 'Whitelist').any():
                upload_file_using_client(image_bucket, 'benign', f'{file_name}.png')
            else:
                logMessage(f"Image Size {height} X {width}", LOGTYPE_INFO)
                logMessage(f"List is {df_list['list']}", LOGTYPE_INFO)
        end_time = time.time()
        time_spent = (end_time - start_time) * 1000
        print(f"Time Spent Processing {file_name} {time_spent} ms")

    print("Python file invoked")


if __name__ == '__main__':
    main()
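
# ---------------------------------------------------------------------------
# Minimal local sketch, not part of the batch pipeline above: it reuses the
# module's numpy/math imports to demonstrate the same bytes -> square
# grayscale conversion on synthetic data, so the transformation can be
# sanity-checked without any S3 access. The helper name `demo_bytes_to_image`
# is illustrative and does not exist in the original codebase.
# ---------------------------------------------------------------------------
def demo_bytes_to_image(data: bytes) -> np.ndarray:
    """Zero-pad a byte string and fold it into a square uint8 array."""
    d = np.frombuffer(data, dtype=np.uint8)
    side = int(ceil(sqrt(len(d))))  # smallest square side that fits the data
    padded = np.hstack((d, np.zeros(side * side - len(d), np.uint8)))
    return np.reshape(padded, (side, side))

# Example: 10 input bytes pad up to a 4 x 4 image (6 zero bytes appended).
# assert demo_bytes_to_image(bytes(range(10))).shape == (4, 4)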