/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.ad.ml;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.math3.special.Erf;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import com.google.gson.annotations.Expose;
import com.google.gson.annotations.JsonAdapter;
import com.yahoo.sketches.kll.KllFloatsSketch;
import com.yahoo.sketches.kll.KllFloatsSketchIterator;
/**
* A model for converting raw anomaly scores into anomaly grades.
*
* The hybrid thresholding model combines a log-normal distribution model/CDF as
* well as an empirical model/CDF for determining anomalous scores. The
* log-normal CDF is used to initialize the empirical CDF. This is done because
* the training set often does not include anomalies and predictions need to be
* made as to how often a large anomaly score will occur given the training
* data. The empirical model uses the technique described in "Optimal Quantile
 * Approximation in Streams" by Karnin, Lang, and Liberty. The KLL sketch is
 * serialized and deserialized via {@code KllFloatsSketchSerDe}.
*
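 * <p>A minimal usage sketch; the argument values are illustrative only, though
 * several mirror the recommendations documented on the constructor:
 * <pre>{@code
 * double[] trainingScores = {0.5, 0.8, 1.2, 0.9, 1.1};
 * HybridThresholdingModel model =
 *     new HybridThresholdingModel(0.995, 1e-4, 40.0, 10_000, 10_000, 1_000_000);
 * model.train(trainingScores);          // fit a log-normal and seed the sketch
 * model.update(1.3);                    // stream in subsequent scores
 * double grade = model.grade(7.2);      // 0 means not anomalous, up to 1 for severe
 * }</pre>
 *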
* @see KllFloatsSketchSerDe
*/
public class HybridThresholdingModel implements ThresholdingModel {
/**
* The minimum anomaly score for labeling anomalies.
     * Scores at or below this value result in an anomaly grade of 0.
*/
public static final double MIN_SCORE = 0.4;
private static final boolean USE_DOUBLE_SIDED_ERROR = true;
private static final double CONFIDENCE = 0.99;
@Expose
@JsonAdapter(KllFloatsSketchSerDe.class)
private KllFloatsSketch quantileSketch;
private double maxScore;
private int numLogNormalQuantiles;
private double minPvalueThreshold;
private int downsampleNumSamples;
private long downsampleMaxNumObservations;
/**
* Initializes a HybridThresholdingModel.
*
* The primary parameters to a HybridThresholdingModel are {@code
* minPvalueThreshold} and {@code maxRankError}. These two parameters define
* the p-value threshold at which anomalies are reported and how accurately
* this threshold can be measured. Note that in order to accurately measure
* the given minimum p-value threshold the maximum rank error needs to be
     * small enough. In particular, {@code maxRankError} should be less than
     * 1 - {@code minPvalueThreshold}.
*
* The maximum possible anomaly score is provided so that large anomaly
     * scores can be estimated during training, even when they are not available
* in the training set. The quantile sketch is initialized using {@code
* numLogNormalQuantiles} quantile results from fitting a log-normal
* distribution to the training data.
*
* The size of the empirical CDF, as modeled by the KLL algorithm, is
     * determined by the parameter {@code maxRankError}. When a certain number
* of updates/observations have been processed, as set by {@code
* downsampleMaxNumObservations}, the ECDF model will be automatically
* downsampled to a number of scores equal to {@code downsampleNumSamples}.
*
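     * For example, with {@code minPvalueThreshold = 0.995} the threshold to be
     * measured is 1 - 0.995 = 0.005, so {@code maxRankError} must not exceed
     * 0.005; the recommended value of 0.0001 satisfies this comfortably.
     *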
* @param minPvalueThreshold the p-value threshold beyond which an
* anomaly score is classified as an
     * anomaly. A value of 0.995 is
* recommended. (Between: 0 and 1)
* @param maxRankError desired maximum double-sided normalized
* rank error to use in the quantile
* sketch approximation. A value of 0.0001
* is recommended.
* @param maxScore the largest observable anomaly score
* @param numLogNormalQuantiles number of quantiles of the log-normal
     * distribution to compute during training.
* (Min: 0)
* @param downsampleNumSamples the number of scores to keep when
* downsampling the model. A value of
* 10_000 is recommended.
* @param downsampleMaxNumObservations the threshold number of observations /
* updates at which the model will be
* automatically downsampled. A value of
* 1_000_000 is recommended.
* @throws IllegalArgumentException if {@code minPvalueThreshold} is not
* strictly between 0 and 1, if {@code
* maxRankError} is larger than 1 -
* {@code minPvalueThreshold}, or if
* {@code numLogNormalQuantiles}
     * is negative.
*
* @see KllFloatsSketchSerDe
*/
public HybridThresholdingModel(
double minPvalueThreshold,
double maxRankError,
double maxScore,
int numLogNormalQuantiles,
int downsampleNumSamples,
long downsampleMaxNumObservations
) {
if ((minPvalueThreshold <= 0.0) || (1.0 <= minPvalueThreshold)) {
throw new IllegalArgumentException("minPvalueThreshold must be strictly between 0 and 1.");
}
if (maxRankError > (1.0 - minPvalueThreshold)) {
throw new IllegalArgumentException(
"maxRankError must be smaller than 1 - minPvalueThreshold in order to accurately " + "estimate that threshold."
);
}
if (maxRankError <= 0.0) {
throw new IllegalArgumentException("maxRankError must be positive.");
}
if (maxScore <= 0.0) {
throw new IllegalArgumentException("maxScore must be positive.");
}
if (numLogNormalQuantiles < 0) {
throw new IllegalArgumentException("The maximum number of log-normal quantiles to compute must be non-negative.");
}
if (downsampleNumSamples <= 1) {
throw new IllegalArgumentException("Number of downsamples must be greater than one.");
}
if (downsampleNumSamples >= downsampleMaxNumObservations) {
throw new IllegalArgumentException(
"The number of samples to downsample to must be less than the number of observations " + "before downsampling is triggered."
);
}
this.minPvalueThreshold = minPvalueThreshold;
this.quantileSketch = new KllFloatsSketch(KllFloatsSketch.getKFromEpsilon(maxRankError, USE_DOUBLE_SIDED_ERROR));
this.maxScore = maxScore;
this.numLogNormalQuantiles = numLogNormalQuantiles;
this.downsampleNumSamples = downsampleNumSamples;
this.downsampleMaxNumObservations = downsampleMaxNumObservations;
}
/**
* Empty constructor only for serialization purpose - DO NOT USE.
*
     * WARNING: all clients should avoid using this constructor
     * because objects created by it have undefined behavior.
* This constructor is exclusively used for serialization.
*/
public HybridThresholdingModel() {}
/**
* Returns the minimum p-value threshold for anomaly classification.
*
* @return minPvalueThreshold
*/
public double getMinPvalueThreshold() {
return minPvalueThreshold;
}
/**
* Returns the approximate double-sided normalized rank error of the quantile sketch.
*
* @return MaxRankError
*/
public double getMaxRankError() {
return quantileSketch.getNormalizedRankError(USE_DOUBLE_SIDED_ERROR);
}
/**
* Returns the maximum possible anomaly score of the thresholding model.
*
* @return maxScore
*/
public double getMaxScore() {
return maxScore;
}
/**
* Returns the number of log-normal quantiles used to initialize the
* quantile sketch.
*
* @return numLogNormalQuantiles
*/
public int getNumLogNormalQuantiles() {
return numLogNormalQuantiles;
}
/**
* Returns the number of samples to retain when downsampling the model.
*
* @return downsampleNumSamples
*/
public int getDownsampleNumSamples() {
return downsampleNumSamples;
}
/**
* Returns the number of observations that triggers a model downsampling.
*
* @return downsampleMaxNumObservations
*/
public long getDownsampleMaxNumObservations() {
return downsampleMaxNumObservations;
}
/**
* Initializes the model using a training set of anomaly scores.
*
* The hybrid model initialization has several steps. First, a log-normal
* distribution is fit to the training set scores. Next, the quantile sketch
     * is initialized with {@code numLogNormalQuantiles} samples from the
* log-normal model up to {@code maxScore}.
*
* @param anomalyScores an array of anomaly scores with which to train the model.
*/
@Override
public void train(double[] anomalyScores) {
/*
We assume the anomaly scores are fit to a log-normal distribution.
Equivalent to fitting a Gaussian to the logs of the anomaly scores.
*/
SummaryStatistics stats = new SummaryStatistics();
for (int i = 0; i < anomalyScores.length; i++) {
stats.addValue(Math.log(anomalyScores[i]));
}
final double mu = stats.getMean();
final double sigma = stats.getStandardDeviation();
/*
Compute the 1/R quantiles for R = `numLogNormalQuantiles` of the
corresponding log-normal distribution and use these to initialize the
model. We only compute p-values up to the p-value of the known maximum
possible score. Finally, we do not compute the p=0.0 quantile because
raw anomaly scores are positive and non-zero.
*/
final double maxScorePvalue = computeLogNormalCdf(maxScore, mu, sigma);
final double pvalueStep = maxScorePvalue / (numLogNormalQuantiles + 1.0);
for (double pvalue = pvalueStep; pvalue < maxScorePvalue; pvalue += pvalueStep) {
double currentScore = computeLogNormalQuantile(pvalue, mu, sigma);
update(currentScore);
}
}
/**
* The log-normal cumulative distribution function.
*
     * Given an anomaly score, compute the corresponding p-value.
*
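     * Concretely, this evaluates CDF(x) = (1 + erf((ln(x) - mu) / (sigma * sqrt(2)))) / 2.
     *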
* @param anomalyScore an anomaly score
* @param mu mean parameter of the log-normal distribution
* @param sigma standard deviation of the log-normal distribution
* @return the p-value of the input anomaly score
*/
private double computeLogNormalCdf(double anomalyScore, double mu, double sigma) {
return (1.0 + Erf.erf((Math.log(anomalyScore) - mu) / (Math.sqrt(2.0) * sigma))) / 2.0;
}
/**
* The log-normal quantile function.
*
     * Given a p-value and log-normal distribution parameters, compute the
* corresponding anomaly score.
*
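     * Concretely, this evaluates Q(p) = exp(mu + sqrt(2) * sigma * erfInv(2p - 1)),
     * the inverse of the CDF above.
     *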
* @param pvalue a p-value between 0 and 1
* @param mu mean parameter of the log-normal distribution
* @param sigma standard deviation of the log-normal distribution
* @return anomaly score at the given p-value quantile
*/
private double computeLogNormalQuantile(double pvalue, double mu, double sigma) {
return Math.exp(mu + Math.sqrt(2.0) * sigma * Erf.erfInv(2.0 * pvalue - 1.0));
}
/**
* Updates the model with a new anomaly score.
*
* Note that once we initialize the hybrid model we only update the
* empirical CDF. The model is downsampled when the total number of
* observations/updates exceeds {@code downsampleMaxNumObservations}.
*
* @param anomalyScore an anomaly score.
* @see HybridThresholdingModel
*/
@Override
public void update(double anomalyScore) {
quantileSketch.update((float) anomalyScore);
long totalNumObservations = quantileSketch.getN();
if (totalNumObservations >= downsampleMaxNumObservations) {
downsample();
}
}
/**
* Computes the anomaly grade associated with the given anomaly score. A
* non-zero grade implies that the given score is anomalous. The magnitude
* of the grade, a value between 0 and 1, indicates the severity of the
* anomaly.
*
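     * For example, with {@code minPvalueThreshold = 0.995}, a score whose empirical
     * rank in the sketch is 0.999 receives grade (0.999 - 0.995) / (1 - 0.995) = 0.8.
     *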
* @param anomalyScore an anomaly score
* @return the associated anomaly grade
*/
@Override
public double grade(double anomalyScore) {
double anomalyGrade = 0.;
if (anomalyScore > MIN_SCORE) {
final double scale = 1.0 / (1.0 - minPvalueThreshold);
final double pvalue = quantileSketch.getRank((float) anomalyScore);
anomalyGrade = scale * (pvalue - minPvalueThreshold);
anomalyGrade = Double.isNaN(anomalyGrade) ? 0. : Math.max(0., anomalyGrade);
}
return anomalyGrade;
}
/**
* Returns the confidence of the model in predicting anomaly grades; that
* is, the probability that the reported anomaly grade is correct according
* to the underlying model.
*
     * For the HybridThresholdingModel the confidence reflects the accuracy of the
     * underlying quantile sketch and is reported as a fixed constant.
     *
     * @return the model confidence.
*/
@Override
public double confidence() {
return CONFIDENCE;
}
/**
* Replaces the model's ECDF sketch with a downsampled version.
*
* Periodic downsampling of the sketch is primarily useful for allowing the
* model to more easily adapt to changes in the score distribution. This is
* because, with fewer retained points in the sketch, new scores will have a
* larger impact on the distribution. A secondary benefit to the
     * downsampling process is to prevent out-of-memory errors; although the
     * memory requirements grow like log(log(N)), there is a chance of an
     * allocation bug which we wish to mitigate.
*
* Uses the initialization parameter {@code downsampleNumSamples}.
*
*/
private void downsample() {
KllFloatsSketch downsampledQuantileSketch = new KllFloatsSketch(quantileSketch.getK());
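        // Re-seed a fresh sketch with quantiles of the old sketch taken at evenly
        // spaced ranks, then add maxScore so the upper tail remains represented.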
double pvalueStep = 1.0 / (downsampleNumSamples - 1.0);
for (double pvalue = 0.0; pvalue < 1.0; pvalue += pvalueStep) {
float score = quantileSketch.getQuantile(pvalue);
downsampledQuantileSketch.update(score);
}
downsampledQuantileSketch.update((float) maxScore);
this.quantileSketch = downsampledQuantileSketch;
}
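    /**
     * Returns the anomaly scores currently retained by the quantile sketch.
     *
     * @return the retained scores
     */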
@Override
    public List<Double> extractScores() {
        KllFloatsSketchIterator iter = quantileSketch.iterator();
        List<Double> scores = new ArrayList<>();
while (iter.next()) {
scores.add((double) iter.getValue());
}
return scores;
}
}