import json
import re
import sys
from datetime import datetime, timezone
from urllib.parse import urlparse

import boto3
import pandas as pd
from boto3 import client
from botocore.exceptions import ClientError


def bucket_dict(bucket_url):
    """Parse an S3 URL into its bucket name and analysis prefix."""
    bucket, path = None, None
    try:
        parsed = urlparse(bucket_url)
        bucket, path = parsed.netloc, parsed.path
    except ValueError as e:
        print(f"{e}\nPlease enter a valid S3 URL of the form: s3://bucket-name/key-name")
        sys.exit(1)
    if path.startswith("/"):
        path = path[1:]
    if path != "" and not path.endswith("/"):
        path = f"{path}/"
    if path == "replays/":
        path = ""
    return {"url": bucket_url, "bucket_name": bucket, "prefix": path + "analysis/"}


def list_replays(bucket_url, session):
    """Iterates through S3 and aggregates list of successful replays

    @param bucket_url: str, S3 bucket location
    @param session: boto3.Session or None; falls back to the default credentials chain
    @return: (bucket_name, list of replay metadata dicts), or (None, error) on failure
    """
    table = []
    bucket = bucket_dict(bucket_url)
    try:
        if not session:
            resp = client("s3").list_objects_v2(
                Bucket=bucket.get("bucket_name"), Delimiter="/", Prefix=bucket.get("prefix")
            )
            s3_resource = boto3.resource("s3")
        else:
            resp = session.client("s3").list_objects_v2(
                Bucket=bucket.get("bucket_name"), Delimiter="/", Prefix=bucket.get("prefix")
            )
            s3_resource = session.resource("s3")
        if resp["KeyCount"] == 0:
            print(
                "No replays available in S3. Please run a replay with replay analysis to "
                "access replays from the command line."
            )
            return None, table
    except ClientError as e:
        # covers NoSuchBucket and any other S3 API error; the specific handler
        # must come before the generic one or it can never fire
        return None, e
    except Exception as e:
        return None, e

    for x in resp.get("CommonPrefixes", []):
        try:
            obj = s3_resource.Object(bucket.get("bucket_name"), f'{x.get("Prefix")}info.json')
            obj.load()  # HEAD request; raises a 404 ClientError if info.json is absent
            file_content = obj.get()["Body"].read().decode("utf-8")
            json_content = json.loads(file_content)
            json_content["bucket"] = bucket.get("bucket_name")
            json_content["s3_prefix"] = x.get("Prefix")
            table.append(json_content)
        except ClientError as e:
            if e.response["Error"]["Code"] == "404":
                # if info.json does not exist in the folder, do not add it to the list
                continue
            else:
                print(f"Unable to access replay. {e}")

    # use tabulate lib to format output
    return bucket.get("bucket_name"), table


def remove_comments(string):
    pattern = r"(\".*?\"|\'.*?\')|(/\*.*?\*/|//[^\r\n]*$)"
    # first group captures quoted strings (double or single)
    # second group captures comments (// single-line or /* multi-line */)
    regex = re.compile(pattern, re.MULTILINE | re.DOTALL)

    def _replacer(match):
        # if the 2nd group (capturing comments) is not None,
        # it means we have captured a non-quoted (real) comment string
        if match.group(2) is not None:
            return ""  # so we return empty to remove the comment
        else:  # otherwise, return the 1st group
            return match.group(1)  # captured quoted-string

    return regex.sub(_replacer, string)


def hash_query(st):
    # hash the slice of the query text from its "xid" field up to (but not
    # including) the "replay_start" field, so the replay timestamp does not
    # affect the hash; the -3 offset presumably drops the separator characters
    # preceding "replay_start"
    if "xid" in st:
        return hash(st[(st.find("xid")) : (st.find("replay_start") - 3)])
    return 0


def calc_diff(replay_start, timestamp):
    """Return the offset of timestamp from replay_start, in milliseconds."""
    # pad the fractional-seconds field to 6 digits so fromisoformat() accepts it
    seconds_frac = timestamp.split(".")[-1]
    if len(seconds_frac) < 6:
        timestamp += "0" * (6 - len(seconds_frac))
    start = datetime.fromisoformat(replay_start)
    stamp = datetime.fromisoformat(timestamp)
    if start.tzinfo is None:
        start = start.replace(tzinfo=timezone.utc)
    if stamp.tzinfo is None:
        stamp = stamp.replace(tzinfo=timezone.utc)
    return (stamp - start).total_seconds() * 1000


def filter_data(data, replay, query_types=None, users=None, duration=None):
    """Filter a workload DataFrame down to one replay, optionally by user,
    query type, and a [start, end] window of millisecond offsets."""
    if users is None:
        users = []
    if query_types is None:
        query_types = []
    replay_data = data.loc[data["sid"] == replay["sid"]]
    if users:
        replay_data = replay_data.loc[replay_data["user_name"].isin(users)]
    if query_types:
        replay_data = replay_data.loc[replay_data["query_type"].isin(query_types)]
    if duration is not None and duration != [0, 0]:
        if {"start_diff", "end_diff"}.issubset(data.columns):
            start = replay_data[replay_data["start_diff"].between(duration[0], duration[1])]
            end = replay_data[replay_data["end_diff"].between(duration[0], duration[1])]
            replay_data = pd.concat(
                [start, end], join="inner", ignore_index=True
            ).drop_duplicates()
        elif {"time_diff"}.issubset(data.columns):
            # filter on time_diff, the offset column this branch checks for
            replay_data = replay_data[replay_data["time_diff"].between(duration[0], duration[1])]
        else:
            start_x = replay_data[replay_data["start_diff_x"].between(duration[0], duration[1])]
            start_y = replay_data[replay_data["start_diff_y"].between(duration[0], duration[1])]
            end_x = replay_data[replay_data["end_diff_x"].between(duration[0], duration[1])]
            end_y = replay_data[replay_data["end_diff_y"].between(duration[0], duration[1])]
            replay_data = pd.concat(
                [start_x, start_y, end_x, end_y], join="inner", ignore_index=True
            ).drop_duplicates()
    return replay_data
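
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module; the bucket URL and the sample
# DataFrame below are hypothetical). It shows how the helpers above might be
# wired together: list the replays stored under a bucket, then filter a
# workload DataFrame down to one replay's queries inside a millisecond window.
# Assumes each replay's info.json carries the "sid" field that filter_data()
# keys on.
if __name__ == "__main__":
    session = boto3.Session()  # or boto3.Session(profile_name="my-profile")
    bucket_name, replays = list_replays("s3://my-replay-bucket/replays", session)
    if bucket_name is None or not isinstance(replays, list) or not replays:
        sys.exit(f"Could not list replays: {replays}")
    replay = replays[0]

    # remove_comments() strips /* ... */ and // comments while preserving
    # comment-like text inside quoted literals:
    print(remove_comments("SELECT '/* keep me */' AS col /* drop me */"))

    # calc_diff() pads sub-second precision before diffing, so a timestamp
    # 1.5 s after the replay start comes back as 1500.0 ms:
    print(calc_diff("2023-01-01T00:00:00", "2023-01-01T00:00:01.5"))

    # Hypothetical workload frame with the offset columns filter_data() expects.
    data = pd.DataFrame(
        {
            "sid": [replay["sid"], replay["sid"]],
            "user_name": ["awsuser", "etl_user"],
            "query_type": ["SELECT", "COPY"],
            "start_diff": [1500.0, 9000.0],
            "end_diff": [2500.0, 9500.0],
        }
    )
    # keep only SELECTs whose start or end falls within the first 5 seconds
    print(filter_data(data, replay, query_types=["SELECT"], duration=[0, 5000]))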