import config import os import pandas as pd import numpy as np import itertools import datetime from sklearn.metrics import mean_squared_error from scipy.signal import savgol_filter from matplotlib import pyplot as plt plt.rcParams["figure.figsize"] = (12,4) import warnings warnings.filterwarnings("ignore") class GeoTransmissionStats: def __init__(self, country_code, n_population, wave1_case_rates, wave1_rel_case_rates, wave1_start_dt, wave1_peak_dt, wave2_case_rates, wave2_rel_case_rates, wave2_start_dt, wave2_peak_dt): self.country_code = country_code self.n_population = n_population self.w1_case_rates = wave1_case_rates self.w1_rel_case_rates = wave1_rel_case_rates if wave1_start_dt is None: return self.w1_start_dt = wave1_start_dt self.w1_peak_dt = wave1_peak_dt self.w1_days_to_peak = None if wave1_peak_dt is None else (wave1_peak_dt - wave1_start_dt).days self.w1_mean_case_rate = np.mean(wave1_case_rates) self.w1_median_case_rate = np.median(wave1_case_rates) self.w1_max_case_rate = np.max(wave1_case_rates) self.w1_10_pctl_case_rate = np.percentile(wave1_case_rates, 10) self.w1_25_pctl_case_rate = np.percentile(wave1_case_rates, 25) self.w1_case_rate_std_dev = np.std(wave1_case_rates) self.w1_mean_rel_case_rate = np.mean(wave1_rel_case_rates) self.w1_rel_case_rate_std_dev = np.std(wave1_rel_case_rates) self.w2_case_rates = wave2_case_rates self.w2_rel_case_rates = wave2_rel_case_rates self.w2_start_dt = wave2_start_dt self.w2_peak_dt = wave2_peak_dt self.w2_days_to_peak = None if wave2_start_dt is not None and wave2_peak_dt is not None and wave2_peak_dt > wave2_start_dt: self.w2_days_to_peak = (wave2_peak_dt - wave2_start_dt).days self.days_between_disease_waves = None if self.w2_peak_dt is not None: self.days_between_disease_waves = (self.w2_peak_dt - self.w1_peak_dt).days elif self.w2_start_dt is not None and self.w1_peak_dt is not None: self.days_between_disease_waves = (self.w2_start_dt - self.w1_start_dt).days + (self.w1_peak_dt - self.w1_start_dt).days self.w2_mean_case_rate = None if wave2_case_rates is None else np.mean(wave2_case_rates) self.w2_mean_rel_case_rate = None if wave2_rel_case_rates is None else np.mean(wave2_rel_case_rates) class geo_transmission_analyzer: ### Primary data source DATA_SRC = os.path.join(config.base_data_dir, config.oxcgrt_intervention_data_offline) ### Case count threshold to consider countries CONF_CASES_THRESHOLD = config.min_country_conf_case_threshold COUNTRY_POPULATION_SRC = os.path.join(config.base_data_dir, config.country_populations_data) def __init__(self, src_population, country_level_projection=False): self.target_country = None self.df = None self.country_level_projection = country_level_projection self.country_dict = None self.country_populations = None self.src_population = src_population self.period = config.period #time-span length in days # Get confirmed covid19 cases data for countries def get_conf_cases (self, start_date=None): data_extended = pd.read_csv(self.__class__.DATA_SRC) data_extended = data_extended.loc[(data_extended['RegionCode'] == '') | (data_extended['RegionCode'].isnull())] data_extended['Date'] = pd.to_datetime(data_extended['Date'], format='%Y%m%d') if start_date is not None: data_extended = data_extended.loc[data_extended['Date'] >= start_date] df_populations = pd.read_csv(self.__class__.COUNTRY_POPULATION_SRC) #might choose population from later years as well based on the population data used for simulation df_populations['2011 [YR2011]'] = df_populations['2011 [YR2011]'].astype(int) selected_countries = data_extended.loc[ (data_extended['ConfirmedCases'] > self.__class__.CONF_CASES_THRESHOLD), 'CountryCode'].unique() data_extended = data_extended.loc[data_extended['CountryCode'].isin(selected_countries)] df = data_extended[ ['Date', 'CountryCode', 'CountryName', 'ConfirmedCases', 'ConfirmedDeaths', 'StringencyIndex']] # df = df.fillna(method='ffill') df = df.fillna(0) ### Remove last few entries for each country - due to possible data error / deplayed reporting for c in df['CountryCode'].unique().tolist(): df.loc[df['CountryCode'] == c] = df.loc[df['CountryCode'] == c].iloc[:-2] return df def get_population_data(self, countries): df_populations = pd.read_csv(self.__class__.COUNTRY_POPULATION_SRC) df_populations['2011 [YR2011]'] = df_populations['2011 [YR2011]'].astype(int) country_dict = dict() country_populations = dict() for cc in countries['CountryCode']: if cc in df_populations['Country Code'].tolist(): country_dict[cc] = countries.loc[countries['CountryCode'] == cc, 'CountryName'].tolist()[0] country_populations[cc] = df_populations.loc[df_populations['Country Code'] == cc, '2011 [YR2011]'].iloc[0] return country_dict, country_populations # Measure relative rate of change def measure_rel_changes (self, conf_cases): period = self.period periodic_changes = list() #print (conf_cases.tolist()) for i in range(int(len(conf_cases) / period) + 1): end_index = i * period + period end_index = end_index if end_index <= len(conf_cases) else len(conf_cases) if (end_index - i * period) < np.ceil(period / 2): continue conf_cases_subset = conf_cases[i * period:end_index] mean_change = np.mean(np.diff(conf_cases_subset)) if period > 1 else conf_cases_subset[0] periodic_changes.append(mean_change) #print (conf_cases_subset.tolist(), mean_change) periodic_changes = np.array(periodic_changes) periodic_changes[(periodic_changes == -np.inf) | (periodic_changes == np.inf) | (periodic_changes == np.nan) | (np.isnan(periodic_changes))] = 0 periodic_changes[periodic_changes < 0] = 0 periodic_changes_rel = periodic_changes[1:] / periodic_changes[:-1] periodic_changes_rel[(periodic_changes_rel == -np.inf) | (periodic_changes_rel == np.inf) | (periodic_changes_rel == np.nan) | (np.isnan(periodic_changes_rel))] = 0 return periodic_changes, periodic_changes_rel # Optional function to plot disease waves, peaks etc. def plot_waves (self, country, df_country, periodic_changes, periodic_changes_smoothed_src, rising_idx, flattenning_idx, second_rising_idx, second_flattenning_idx): try: offset = len(df_country) % len(periodic_changes_smoothed_src) offset = 0 if rising_idx > -1: start = rising_idx * config.period + offset if flattenning_idx > -1: end = flattenning_idx * config.period - offset print ('\n* * * {}: Time to reach wave-1 peak: {} days | Start Dt:{} | End Dt:{} | Offset: {}'.format( self.country_dict[country], end - start, df_country['Date'].iloc[start].date(), df_country['Date'].iloc[end].date(), offset) ) else: print ('\n* * * {}: Time so far: {} days | Start Dt:{}'.format( self.country_dict[country], len(df_country['ConfirmedCases']) - start, df_country['Date'].iloc[start].date()) ) ### Plotting peaks/troughs against changes plt.plot(np.arange(len(periodic_changes)), periodic_changes) plt.plot(np.arange(len(periodic_changes_smoothed_src)), periodic_changes_smoothed_src) if flattenning_idx > -1: plt.plot(flattenning_idx, periodic_changes[flattenning_idx], 'o') if rising_idx > -1: plt.plot(rising_idx, periodic_changes[rising_idx], 'X') if second_rising_idx > -1: plt.plot(second_rising_idx, periodic_changes[second_rising_idx], 'o') if second_flattenning_idx > -1: plt.plot(second_flattenning_idx, periodic_changes[second_flattenning_idx], 'X') plt.show(); except: print ('Error for country: ' + country) # Detect disease waves and respective starts & peaks def get_disease_wave_details (self, df_country, country=None): periodic_changes, periodic_changes_rel = self.measure_rel_changes(df_country['ConfirmedCases']) def consecutive(data, stepsize=1, min_length=10): splits = np.split(data, np.where(np.diff(data) > stepsize)[0] + 1) splits = [split for split in splits if len(split) >= min_length] splits = splits if len(splits) > 0 else list() return splits #Smoothen the rates using savgol_filter before finding the peak window_len = int(len(periodic_changes)/2)-1 if int(len(periodic_changes)/2)%2==0 else int(len(periodic_changes)/2) poly_smoothing_degree = 6 periodic_changes_smoothed_src = savgol_filter(periodic_changes, window_len, poly_smoothing_degree) periodic_changes_smoothed_src[periodic_changes_smoothed_src < 0] = 0 ### To ensure that small upward oscillations are not picked as start of a wave periodic_changes_smoothed_cutoff = np.percentile(periodic_changes_smoothed_src, 95) * 0.02 periodic_changes_smoothed_src[periodic_changes_smoothed_src < periodic_changes_smoothed_cutoff] = 0 periodic_changes_smoothed = np.diff(periodic_changes_smoothed_src) rising_diffs_idx = np.where(periodic_changes_smoothed > 0.5)[0] consecutive_rises = consecutive(rising_diffs_idx) rising_diffs_idx = consecutive_rises[0] if len(consecutive_rises) > 0 else list() # Capture start of 1st wave rising_idx = -1 if len(rising_diffs_idx) > 0: rising_idx = rising_diffs_idx[1] if len(rising_diffs_idx) > 1 else rising_diffs_idx[0] rising_idx = min (len(periodic_changes_smoothed)-1, rising_idx + 1) declining_diffs_idx = np.where(periodic_changes_smoothed <= 0)[0] consecutive_drops = consecutive(declining_diffs_idx) # Capture peak of 1st wave flattenning_idx = -1 for k in range(len(consecutive_drops)): if consecutive_drops[k][0] > rising_idx: flattenning_idx = consecutive_drops[k][1] if len(consecutive_drops[k]) > 1 else consecutive_drops[k][0] break if rising_idx > flattenning_idx > -1: flattenning_idx, rising_idx = -1, -1 # Capture start of 2nd wave second_rising_idx = -1 if len(consecutive_rises) > 1 and flattenning_idx > -1 \ and rising_idx < flattenning_idx < consecutive_rises[1][0] \ and periodic_changes[consecutive_rises[1][0]] < 0.9 * periodic_changes[flattenning_idx]: second_rising_idx = consecutive_rises[1][0] # Capture peak of 2nd wave second_flattenning_idx = -1 for k in range(len(consecutive_drops)): if consecutive_drops[k][0] > flattenning_idx and second_rising_idx > -1 and second_rising_idx < consecutive_drops[k][0]: second_flattenning_idx = consecutive_drops[k][0] break offset = len(df_country) % len(periodic_changes_smoothed) if country is not None: self.plot_waves (country, df_country, periodic_changes, periodic_changes_smoothed_src, rising_idx, flattenning_idx, second_rising_idx, second_flattenning_idx) return rising_idx, flattenning_idx, second_rising_idx, second_flattenning_idx, offset def get_date (self, dates, index): if index > -1: return dates[index] else: return None # Get various stats on disease transmission patterns (rate, start dates, peaks, etc.) def get_disease_transmission_stats (self, df_country, n_population): w1_rising_idx, w1_flattenning_idx, w2_rising_idx, w2_flattenning_idx, offset = self.get_disease_wave_details (df_country) w1_start_idx = w1_rising_idx * config.period + offset w1_end_idx = w1_flattenning_idx * config.period - offset if w1_start_idx >= 0: if w1_end_idx > 0: w1_case_rates, w1_rel_case_rates = self.measure_rel_changes( df_country['ConfirmedCases'].iloc[w1_start_idx:w1_end_idx+1]) else: w1_case_rates, w1_rel_case_rates = self.measure_rel_changes( df_country['ConfirmedCases'].iloc[w1_start_idx:]) w2_start_idx, w2_end_idx = -1, -1 w2_case_rates, w2_rel_case_rates = None, None if w2_rising_idx >= 0: w2_start_idx = w2_rising_idx * config.period + offset w2_end_idx = w2_flattenning_idx * config.period - offset if w2_end_idx > 0: w2_case_rates, w2_rel_case_rates = self.measure_rel_changes( df_country['ConfirmedCases'].iloc[w2_start_idx:w2_end_idx+1]) else: w2_case_rates, w2_rel_case_rates = self.measure_rel_changes( df_country['ConfirmedCases'].iloc[w2_start_idx:]) w1_start_dt = self.get_date (df_country['Date'], w1_start_idx) if w1_start_idx > -1 else None w1_peak_dt = self.get_date (df_country['Date'], w1_end_idx) if w1_end_idx > -1 else None w2_start_dt = self.get_date (df_country['Date'], w2_start_idx) if w2_start_idx > -1 else None w2_peak_dt = self.get_date (df_country['Date'], w2_end_idx) if w2_end_idx > -1 else None geoTransmissionStats = GeoTransmissionStats(df_country, n_population, w1_case_rates, w1_rel_case_rates, w1_start_dt, w1_peak_dt, w2_case_rates, w2_rel_case_rates, w2_start_dt, w2_peak_dt) return geoTransmissionStats else: return None