#
# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
# its licensors.
#
# For complete copyright and license terms please see the LICENSE at the root of this
# distribution (the "License"). All use of this software is governed by the License,
# or, if provided, by the license below or the license accompanying this file. Do not
# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#
from __future__ import print_function
from s3 import S3
from keyparts import KeyParts
import metric_constant as c
import time
import datetime
import sensitivity

VALID_ROOT_PATHS = [sensitivity.SENSITIVITY_TYPE.NONE, sensitivity.SENSITIVITY_TYPE.ENCRYPT]


class Crawler(object):
    def __init__(self, context, bucket):
        self.__context = context
        self.__s3 = S3(context, bucket)
        self.__bucket = bucket
        self.__sep = context[c.KEY_SEPERATOR_PARTITION]

    @property
    def valid_root_paths(self):
        return VALID_ROOT_PATHS

    def exists(self, prefix):
        # Return True as soon as any object is found under the prefix.
        for page in self.__s3.list(prefix=prefix):
            if "Contents" in page:
                for obj in page["Contents"]:
                    return True
        return False

    def crawl(self, prefix, lambda_pool, func, depth=10):
        # Identify each unique path (truncated to `depth` partitions) under the prefix and
        # hand it to `func` together with a lambda drawn from the pool in round-robin order.
        print("Locating paths to crawl on bucket '{}' with prefix '{}'".format(self.__bucket, prefix))
        crawled_paths = {}
        idx = 0
        count = 0
        for page in self.__s3.list(prefix=prefix):
            if "Contents" in page:
                for obj in page["Contents"]:
                    key = obj['Key']
                    parts = KeyParts(key, self.__sep)
                    path = self.__sep.join(parts.path.split(self.__sep)[:depth])
                    if path not in crawled_paths:
                        crawled_paths[path] = True
                        func(self.__context, path, lambda_pool[idx])
                        count += 1
                        idx += 1
                        if idx >= len(lambda_pool):
                            idx = 0
        print("Path scouting complete on bucket '{}'".format(self.__bucket))
        return count

    def crawl_from_relative(self, prefix):
        # Combine only files for the past two days. Older files should already be fully aggregated.
        start = datetime.datetime.utcnow() - datetime.timedelta(days=2)
        crawl_paths = {}
        for page in self.__s3.list(prefix):
            if "Contents" in page:
                for obj in page["Contents"]:
                    key = obj['Key']
                    parts = KeyParts(key, self.__sep)
                    event_date = datetime.datetime(parts.year, parts.month, parts.day, parts.hour)
                    if event_date >= start:
                        path = self.__sep.join(parts.path.split(self.__sep)[:-1])
                        if path not in crawl_paths:
                            crawl_paths[path] = []
                        crawl_paths[path].append(parts.filename)
        # The caller assigns an amoeba generator per identified path.
        return crawl_paths
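
# Usage sketch (illustrative only, not part of the original module). It assumes the Cloud Gem
# context dict supplies the partition separator under c.KEY_SEPERATOR_PARTITION, that AWS
# credentials and the target bucket are available to the S3 wrapper, and that the lambda pool
# holds the names of consumer lambdas. "example-metrics-bucket", "example/prefix",
# "consumer-lambda-a/b" and the `dispatch` callback are hypothetical placeholders.
#
#   context = {c.KEY_SEPERATOR_PARTITION: "/"}
#   crawler = Crawler(context, "example-metrics-bucket")
#
#   def dispatch(ctx, path, lambda_name):
#       # In the real pipeline this would invoke `lambda_name` to process `path`;
#       # here it only prints what would happen.
#       print("Would invoke '{}' for path '{}'".format(lambda_name, path))
#
#   if crawler.exists("example/prefix"):
#       total = crawler.crawl("example/prefix", ["consumer-lambda-a", "consumer-lambda-b"], dispatch)
#       recent_files_by_path = crawler.crawl_from_relative("example/prefix")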