#! /usr/bin/env python3
import re
import sys

import numpy as np
import pandas as pd

# Matches a header line of the form "... (<something>)  YYYY-MM-DD ...".
# The date is captured in the named group 'start', which parse_start_date()
# reads back by name.
_START_DATE_RE = re.compile(r'''.*?\)\s+(?P<start>\d+\-\d+\-\d+).*''')


def parse_start_date(line):
    """Get the start date from a line, in ISO 8601 (YYYY-MM-DD) format.

    Args:
        line: A single line of text to inspect.

    Returns:
        The date string found after the first ')' followed by whitespace,
        or None when the line does not match.
    """
    match_hdr = _START_DATE_RE.match(line)
    if match_hdr:
        return match_hdr['start']
    return None


class ParseInterface(object):
    """Base parser for one section of the input.

    The attributes regex_hdr, regex_data and fields are left empty here and
    have to be filled in (e.g. by a subclass) before parsing.
    """

    def __init__(self, start_date):
        """Store the start date and set up the default parser state.

        Args:
            start_date: Date string (YYYY-MM-DD) that is combined with the
                time captured on each data line.
        """
        # Pattern that identifies the section header line.
        self.regex_hdr = None
        # Pattern for a data line; must define a 'time' group plus one group
        # per entry of self.fields.
        self.regex_data = None
        # A line starting with "Average:" ends the section.
        self.regex_footer = re.compile(r'''Average:.*''')
        self.start = start_date
        # Timestamp of the most recently parsed data line (np.datetime64),
        # or None before the first one.
        self.last_date = None
        self.parquet_name = "sar.parquet"
        # Sequence of (name, converter) pairs. parse_data() fills the 'time'
        # column itself and applies converter to the captured text for every
        # entry after the first.
        self.fields = []

    def parse_time(self, s, last_date):
        """Build a timestamp that never goes backwards relative to last_date.

        Args:
            s: Dict with the keys
                'date': string in YYYY-MM-DD
                'time': string in hh:mm:ss
            last_date: Previous timestamp as an np.datetime64 object, or None.

        Returns:
            An np.datetime64 object. When last_date is given and the parsed
            value lies before it, whole days are added until it does not
            (the time of day wrapped past midnight).
        """
        d = np.datetime64(f"{s['date']} {s['time']}")
        # Compare against None explicitly so the check does not depend on
        # the truthiness of the np.datetime64 value itself.
        if last_date is not None:
            one_day = np.timedelta64(1, 'D')
            zero = np.timedelta64(0, 's')
            while (d - last_date) < zero:
                d = d + one_day
        return d

    def parse_data(self, f, save_parquet=True):
        """Read data lines from f until the footer line or end of file.

        Lines that match neither regex_data nor regex_footer are skipped.
        The footer line itself is consumed.

        Args:
            f: Object with a readline() method that returns an empty value
                at end of input.
            save_parquet: When true, also write the result to
                self.parquet_name (gzip compressed).

        Returns:
            A pandas DataFrame indexed by 'time' with one column per
            remaining entry of self.fields.
        """
        data = {key[0]: [] for key in self.fields}

        line = f.readline()
        while line:
            match_data = self.regex_data.match(line)
            if match_data:
                s = {'date': self.start, 'time': match_data['time']}
                d = self.parse_time(s, self.last_date)
                data['time'].append(d)
                self.last_date = d
                # Every other field is not special
                for key in self.fields[1:]:
                    data[key[0]].append(key[1](match_data[key[0]]))
            elif self.regex_footer.match(line):
                break
            line = f.readline()

        df = pd.DataFrame(data)
        df = df.set_index('time')
        if save_parquet:
            df.to_parquet(self.parquet_name, compression='gzip')
        return df

    # Look for the header, if we find it, read until we hit the end of the section
    # Return the data frame if we get one.
def parse_for_header(self, line, f, save_parquet=True): match = self.regex_hdr.match(line) if match: return self.parse_data(f, save_parquet) return None class ParseIfaceUtil(ParseInterface): def __init__(self, start_date, parquet=None): super().__init__(start_date) self.regex_hdr = re.compile(r'''(?P