import numpy as np


class StarOversampler:
    """
    Implementation of the oversampler proposed in [1] using the `star`
    topology.

    Every minority sample is connected to the mean of the minority class and
    synthetic samples are placed at evenly spaced positions on that segment.

    The implementation is based on the implementation of
    https://github.com/analyticalmindsltd/smote_variants

    Parameters
    ----------
    proportion: float (default = 1)
        proportion of the difference of n_maj and n_min to sample
        e.g. 1.0 means that after sampling the number of minority samples
        will be equal to the number of majority samples

    References
    ----------
    .. [1] Gazzah, S. and Amara, N. E. B.
        "New Oversampling Approaches Based on Polynomial Fitting for
        Imbalanced Data Sets"
        The Eighth IAPR International Workshop on Document Analysis Systems
    """

    def __init__(self, proportion=1.0):
        self.proportion = proportion

    def fit(self, X, y=None):
        """
        No-op: the sampler has nothing to learn before resampling.

        Parameters
        ----------
        X: ignored
        y: ignored
        """
        pass

    def resample(self, X, y, verbose=False):
        """
        Generate synthetic minority samples

        Parameters
        ----------
        X: array-like, 2-dimensional
            feature matrix, one row per sample
        y: array-like, 1-dimensional
            class labels, one entry per row of ``X``. Only the first two
            distinct labels (in sorted order) are considered when picking
            the minority and majority class.
        verbose: bool (default = False)
            if True, print a message when no sampling is needed

        Returns
        -------
        (np.ndarray, np.ndarray)
            the original samples followed by the synthetic ones, and the
            matching labels. When no sampling is needed, copies of the
            inputs are returned.

        Raises
        ------
        ValueError
            if ``y`` contains fewer than two distinct labels

        Notes
        -----
        ``k = max(1, round(n_to_sample / n_min))`` samples are generated for
        every minority sample, so the number of synthetic samples is
        ``k * n_min`` and can differ from the exact ``n_to_sample`` target.
        """
        # Accept lists/tuples as well as arrays; boolean masking below
        # requires real ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)

        unique, counts = np.unique(y, return_counts=True)
        if len(unique) < 2:
            raise ValueError(
                "StarOversampler: at least two classes are required, "
                "found %d" % len(unique)
            )

        # On a tie the second label is treated as the minority class; the
        # difference is zero in that case, so nothing is sampled anyway.
        if counts[0] < counts[1]:
            min_label, n_min, n_maj = unique[0], counts[0], counts[1]
        else:
            min_label, n_min, n_maj = unique[1], counts[1], counts[0]

        # determine the number of samples to generate
        n_to_sample = self.det_n_to_sample(self.proportion, n_maj, n_min)

        if n_to_sample == 0:
            if verbose:
                print("StarOversampler: Sampling is not needed")
            return X.copy(), y.copy()

        # Implementation of the star topology
        X_min = X[y == min_label]
        X_mean = np.mean(X_min, axis=0)

        # number of synthetic points placed on each sample-to-mean segment
        k = max(1, int(np.rint(n_to_sample / len(X_min))))

        # Fractions i / (k + 1) for i = 1..k lie strictly between the sample
        # (0) and the class mean (1), so neither endpoint is duplicated.
        samples = [
            x + float(i) / (k + 1) * diff
            for x, diff in zip(X_min, X_mean - X_min)
            for i in range(1, k + 1)
        ]

        return (
            np.vstack([X, np.vstack(samples)]),
            np.hstack([y, np.repeat(min_label, len(samples))]),
        )

    def det_n_to_sample(self, proportion, n_maj, n_min):
        """
        Determines the number of samples to generate

        Parameters
        ----------
        proportion: float
            proportion of the difference of n_maj and n_min to sample
            e.g. 1.0 means that after sampling the number of minority
            samples will be equal to the number of majority samples
        n_maj: int
            number of majority samples
        n_min: int
            number of minority samples

        Returns
        -------
        int
            ``(n_maj - n_min) * proportion`` truncated to an integer and
            clipped at zero
        """
        return max(0, int((n_maj - n_min) * proportion))