import io
import os
import pathlib
import time
import zlib
from math import sqrt, ceil

import awswrangler as wr
import boto3
import cv2
import numpy as np
import pandas as pd

from malware_detection_utils.utils import logMessage
from malware_detection_utils.utils import upload_file_using_client

LOGTYPE_ERROR = 'ERROR'
LOGTYPE_INFO = 'INFO'
LOGTYPE_DEBUG = 'DEBUG'

# Binaries whose decompressed size is at or above this many bytes are not
# converted to an image.
MAX_BINARY_BYTES = 15000000
# An image is only uploaded when both of its sides are at most this long.
MAX_IMAGE_SIDE = 4096
# Height/width recorded when no image was produced; it is larger than
# MAX_IMAGE_SIDE, so such files always fail the size check and are discarded.
NO_IMAGE_SIDE = 9999
# Distance between the group-file indexes one job reads when MULTIPLIER > 1.
GROUP_INDEX_STRIDE = 1000

# The fields of the metadata db are specific to the sorel dataset.
# They must be customized when using another dataset.
METADATA_COLUMNS = [
    "index", "sha256", "is_malware", "rl_fs_t", "rl_ls_const_positives",
    "adware", "flooder", "ransomware", "dropper", "spyware", "packed",
    "crypto_miner", "file_infector", "installer", "worm", "downloader",
]
MALWARE_TAGS = [
    "adware", "flooder", "ransomware", "dropper", "spyware", "packed",
    "crypto_miner", "file_infector", "installer", "worm", "downloader",
]


def _load_metadata(bucket, key):
    """Download the header-less metadata CSV from S3.

    Returns a DataFrame holding the MALWARE_TAGS columns plus ``sha256``.
    """
    s3 = boto3.client("s3")
    obj = s3.get_object(Bucket=bucket, Key=key)
    data = io.BytesIO(obj["Body"].read())
    metadf = pd.read_csv(data, sep=",", names=METADATA_COLUMNS, header=None)
    return metadf[MALWARE_TAGS + ["sha256"]]


def _bytes_to_square_image(data):
    """Lay the bytes of *data* out as a square 2-D uint8 (grayscale) array.

    The side length is ceil(sqrt(len(data))); the tail of the last rows is
    padded with zeros.
    """
    data_len = len(data)
    side = int(ceil(sqrt(data_len)))
    pixels = np.zeros(side * side, dtype=np.uint8)
    pixels[:data_len] = np.frombuffer(data, dtype=np.uint8)
    return pixels.reshape(side, side)


def _rank_malware_types(scores):
    """Return (top_type, top_value, second_type, second_value) for *scores*.

    *scores* is a Series indexed by malware tag. Ties resolve to the first
    tag in index order, because ``idxmax`` returns the first maximum.
    """
    top_type = scores.idxmax()
    remaining = scores.drop(top_type)
    second_type = remaining.idxmax()
    return (str(top_type), scores[top_type],
            str(second_type), remaining[second_type])


def _process_file(s3_resource, input_bucket, image_bucket, metadatadf, file_key):
    """Convert one compressed binary to a PNG and upload it if it qualifies.

    The image is uploaded to *image_bucket* under its top-ranked malware type
    only when that type's score is at least twice the runner-up's score and
    the image is at most MAX_IMAGE_SIDE on both sides. The local PNG is
    removed afterwards in every case.
    """
    started = time.time()
    # The file name is taken from the third '/'-separated segment of the key.
    file_name = file_key.split('/')[2]
    png_name = f'{file_name}.png'
    logMessage(f"Processing {file_name}", LOGTYPE_INFO)

    try:
        s3_object = s3_resource.Bucket(input_bucket).Object(file_key)
        data = zlib.decompress(s3_object.get()['Body'].read())

        height = width = NO_IMAGE_SIDE
        # Empty payloads cannot be encoded as an image, so they are skipped
        # together with oversized ones.
        if 0 < len(data) < MAX_BINARY_BYTES:
            im = _bytes_to_square_image(data)
            if cv2.imwrite(png_name, im):
                # The written image has exactly the array's shape, so there
                # is no need to decode the PNG again to learn its size.
                height, width = im.shape[:2]
            else:
                logMessage(f"Failed to write {png_name}", LOGTYPE_ERROR)

        # Look up the malware family scores for the file.
        matches = metadatadf.loc[metadatadf.sha256 == file_name, MALWARE_TAGS]
        if matches.empty:
            logMessage(f"No metadata found for {file_name}. Discarding this file", LOGTYPE_ERROR)
            return
        scores = matches.iloc[0]

        malware_type = ""
        top_type = second_type = ""
        top_value = second_value = 0
        # A sample whose tag scores are all zero has no usable category.
        if (scores == 0).all():
            malware_type = "not_classified"
        else:
            top_type, top_value, second_type, second_value = _rank_malware_types(scores)

        logMessage(f"Checking if {file_name}.png meeting all criteria", LOGTYPE_INFO)
        logMessage(f"highest_ranked_malware_type_value for {file_name}.png is {top_value}", LOGTYPE_INFO)
        logMessage(f"second_highest_ranked_malware_type_value for {file_name}.png is {second_value}", LOGTYPE_INFO)

        if malware_type == "not_classified":
            logMessage(f"Malware Type is {malware_type}", LOGTYPE_INFO)
            return

        clear_winner = int(float(top_value)) >= 2 * int(float(second_value))
        if clear_winner and height <= MAX_IMAGE_SIDE and width <= MAX_IMAGE_SIDE:
            logMessage(f"{top_type} value {top_value} for {file_name}.png is 2X greater than {second_type} {second_value}", LOGTYPE_INFO)
            malware_type = top_type
            logMessage(f"malware_type for {file_name}.png is {malware_type}", LOGTYPE_INFO)
            upload_file_using_client(image_bucket, top_type, png_name)
            logMessage(f"{file_name}.png upload complete", LOGTYPE_INFO)
        else:
            logMessage(f"Image Size {height} X {width}", LOGTYPE_INFO)
            logMessage(f"Malware Type is {malware_type}", LOGTYPE_INFO)
            logMessage("Malware classification is too close to call. Discarding this file", LOGTYPE_INFO)
    finally:
        # Do not let generated images pile up on local disk across files.
        if os.path.exists(png_name):
            os.remove(png_name)
        time_spent = (time.time() - started) * 1000
        print(f"Time Spent Processing {file_name} {time_spent} ms")


def main():
    """Entry point of the batch job; configuration is read from environment variables.

    Reads the metadata CSV, then for each of MULTIPLIER group files
    (``<prefix>-<index>.csv`` with index = AWS_BATCH_JOB_ARRAY_INDEX +
    GROUP_INDEX_STRIDE * i) processes every S3 key in its ``Key`` column.
    """
    array_index = int(os.environ.get('AWS_BATCH_JOB_ARRAY_INDEX', '0'))
    aws_region = os.environ.get('AWS_REGION')
    input_bucket_csv = os.environ.get('INPUT_BUCKET_CSV')
    input_bucket_csv_prefix = os.environ.get('INPUT_BUCKET_CSV_PREFIX')
    input_bucket = os.environ.get('INPUT_BUCKET')
    image_bucket = os.environ.get('IMAGE_BUCKET')
    metadatadb_bucket = os.environ.get('METADATADB_BUCKET')
    metadata_key = os.environ.get('METADATA_KEY')
    multiplier = int(os.environ.get('MULTIPLIER', 1))

    logMessage(f"AWS_REGION {aws_region}", LOGTYPE_INFO)
    logMessage(f"INPUT_BUCKET_CSV {input_bucket_csv}", LOGTYPE_INFO)
    logMessage(f"INPUT_BUCKET_CSV_PREFIX {input_bucket_csv_prefix}", LOGTYPE_INFO)
    logMessage(f"INPUT_BUCKET {input_bucket}", LOGTYPE_INFO)
    logMessage(f"IMAGE_BUCKET {image_bucket}", LOGTYPE_INFO)
    logMessage(f"METADATADB_BUCKET {metadatadb_bucket}", LOGTYPE_INFO)
    logMessage(f"METADATA_KEY {metadata_key}", LOGTYPE_INFO)

    # Reading metadatadb CSV
    logMessage("Reading metadatadb CSV", LOGTYPE_INFO)
    metadatadf = _load_metadata(metadatadb_bucket, metadata_key)

    s3_resource = boto3.resource('s3', aws_region)

    count = 0
    for i in range(multiplier):
        # Always offset from the job's own array index; accumulating the
        # offset into array_index would skip group files once multiplier > 2.
        group_index = array_index + GROUP_INDEX_STRIDE * i
        group_uri = f"s3://{input_bucket_csv}/{input_bucket_csv_prefix}-{group_index}.csv"

        # Reading Group File
        logMessage(f"Reading {group_uri}", LOGTYPE_INFO)
        group_df = pd.read_csv(group_uri)

        for file_key in group_df['Key']:
            count += 1
            _process_file(s3_resource, input_bucket, image_bucket,
                          metadatadf, str(file_key))

    logMessage(f"Number of files processed {count}", LOGTYPE_INFO)


print("Python file invoked")

if __name__ == '__main__':
    main()