# MIT License # # Copyright (c) 2020 Nguyen Ngo # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
#
# https://github.com/mnguyenngo/ab-framework
#
import numpy as np
import scipy.stats as scs


def pooled_prob(N_A, N_B, X_A, X_B):
    """Returns pooled probability for two samples

    Parameters:
        N_A, N_B: sizes of sample A and sample B
        X_A, X_B: success counts in sample A and sample B

    Returns:
        (X_A + X_B) / (N_A + N_B)
    """
    return (X_A + X_B) / (N_A + N_B)


def pooled_SE(N_A, N_B, X_A, X_B):
    """Returns the pooled standard error for two samples

    Parameters:
        N_A, N_B: sizes of sample A and sample B
        X_A, X_B: success counts in sample A and sample B

    Returns:
        sqrt(p_hat * (1 - p_hat) * (1 / N_A + 1 / N_B)), where p_hat is the
        pooled probability of the two samples
    """
    p_hat = pooled_prob(N_A, N_B, X_A, X_B)
    SE = np.sqrt(p_hat * (1 - p_hat) * (1 / N_A + 1 / N_B))
    return SE


def confidence_interval(sample_mean=0, sample_std=1, sample_size=1,
                        sig_level=0.05):
    """Returns the confidence interval as a tuple

    Parameters:
        sample_mean (float): center of the interval
        sample_std (float): standard deviation of the sample
        sample_size (int): number of observations in the sample
        sig_level (float): significance level; the two-tailed z value for
            this level is used

    Returns:
        (left, right): sample_mean -/+ z * sample_std / sqrt(sample_size)
    """
    z = z_val(sig_level)
    # half-width of the interval: z times the standard error of the mean
    margin = z * sample_std / np.sqrt(sample_size)

    left = sample_mean - margin
    right = sample_mean + margin

    return (left, right)


def z_val(sig_level=0.05, two_tailed=True):
    """Returns the z value for a given significance level

    Parameters:
        sig_level (float): significance level
        two_tailed (bool): when True, sig_level is split evenly between the
            two tails before the quantile is looked up

    Returns:
        z: the standard normal quantile at 1 - sig_level (one-tailed) or
            1 - sig_level / 2 (two-tailed)
    """
    z_dist = scs.norm()
    if two_tailed:
        sig_level = sig_level / 2

    area = 1 - sig_level
    z = z_dist.ppf(area)

    return z


def ab_dist(stderr, d_hat=0, group_type='control'):
    """Returns a distribution object depending on group type

    Parameters:
        stderr (float): pooled standard error of two independent samples
        d_hat (float): the mean difference between two independent samples
        group_type (string): 'control' and 'test' are supported

    Returns:
        dist (scipy.stats distribution object): normal distribution with
            standard deviation stderr, centered at 0 for 'control' and at
            d_hat for 'test'

    Raises:
        ValueError: if group_type is neither 'control' nor 'test'
    """
    if group_type == 'control':
        sample_mean = 0
    elif group_type == 'test':
        sample_mean = d_hat
    else:
        # Without this branch sample_mean would be unbound below and the
        # caller would get an opaque UnboundLocalError.
        raise ValueError(
            "group_type must be 'control' or 'test', got {!r}".format(
                group_type))

    # create a normal distribution which is dependent on mean and std dev
    dist = scs.norm(sample_mean, stderr)
    return dist


def min_sample_size(bcr, mde, power=0.8, sig_level=0.05):
    """Returns the minimum sample size to set up a split test

    Arguments:
        bcr (float): probability of success for control, sometimes
            referred to as baseline conversion rate
        mde (float): minimum change in measurement between control
            group and test group if alternative hypothesis is true, sometimes
            referred to as minimum detectable effect
        power (float): probability of rejecting the null hypothesis when the
            null hypothesis is false, typically 0.8
        sig_level (float): significance level often denoted as alpha,
            typically 0.05

    Returns:
        min_N: minimum sample size (float)

    References:
        Stanford lecture on sample sizes
        http://statweb.stanford.edu/~susan/courses/s141/hopower.pdf
    """
    # standard normal distribution to determine z-values
    standard_norm = scs.norm(0, 1)

    # find Z_beta from desired power
    Z_beta = standard_norm.ppf(power)

    # find Z_alpha (two-tailed: alpha is split between both tails)
    Z_alpha = standard_norm.ppf(1 - sig_level / 2)

    # average of probabilities from both groups: bcr and (bcr + mde).
    # Named avg_prob so it does not shadow the module-level pooled_prob().
    avg_prob = (bcr + bcr + mde) / 2

    min_N = (2 * avg_prob * (1 - avg_prob) * (Z_beta + Z_alpha)**2
             / mde**2)

    return min_N


def p_val(N_A, N_B, p_A, p_B):
    """Returns the p-value for an A/B test

    Evaluates the Binomial(N_A, p_A) probability mass function at the
    success count of group B, reconstructed as p_B * N_B.

    Parameters:
        N_A: number of trials for the binomial distribution (size of A)
        N_B: size of sample B
        p_A: success probability for the binomial distribution (rate of A)
        p_B: success rate of sample B

    Returns:
        the binomial probability mass at round(p_B * N_B)

    NOTE(review): this is a point probability, not a tail area; confirm
    with callers whether a cumulative (tail) p-value is what they expect.
    """
    # p_B * N_B is computed in floating point and can land a hair off the
    # intended integer count (e.g. 0.07 * 100 == 7.000000000000001). The
    # discrete pmf is 0 for non-integer arguments, so snap to the nearest
    # whole count before evaluating it.
    successes_B = np.round(p_B * N_B)
    return scs.binom(N_A, p_A).pmf(successes_B)