// -------------------------------------------------------------------------------------------
// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
// This file is part of the AWS CDI-SDK, licensed under the BSD 2-Clause "Simplified" License.
// License details at: https://github.com/aws/aws-cdi-sdk/blob/mainline/LICENSE
// -------------------------------------------------------------------------------------------

/**
 * @file
 * @brief
 * An implementation of the t-digest percentile estimation algorithm developed by Ted Dunning and Otmar Ertl and
 * available here: https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf
 *
 * This algorithm gathers samples of a given metric and stores them in clusters of samples such that each cluster
 * contains a mean and a sample count and no other information. As clusters grow and the number of clusters grows,
 * clusters can be combined or created in order to meet algorithm requirements for the maximum number of clusters and
 * cluster weight (number of samples per cluster). Clusters near the edges of the distribution of samples are scaled
 * such that they contain fewer samples, and clusters near the center of the distribution are scaled such that they
 * contain more samples. Such scaling has the effect of keeping estimation error low. (Nearly) exact values for a
 * given percentile can be calculated from this set of clusters by interpolating between cluster means.
 */

// Include headers in the following order: Related header, C system headers, other libraries' headers, your project's
// headers.
#include "t_digest.h"

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// The configuration.h file must be included first since it can have defines which affect subsequent files.
#include "configuration.h"

#include "cdi_logger_api.h"
#include "cdi_os_api.h"
#include "utilities_api.h"

//*********************************************************************************************************************
//***************************************** START OF DEFINITIONS AND TYPES *******************************************
//*********************************************************************************************************************

/// @brief The maximum value for uint32_t type. Used to initialize the minimum sample value in a digest.
#define MAX_POSSIBLE_SAMPLE_VALUE (UINT32_MAX)

/// @brief The amount of the distribution tail to force to be single-sample clusters.
#define TAIL_PERCENT_FOR_SINGLE_SAMPLE (2)

/// @brief The number of times to retry merging before giving up.
#define MAX_FAILED_MERGE_COUNT (5)

/**
 * @brief This data structure represents a cluster. Each cluster in the t-digest contains a mean and a weight (number
 * of samples).
 */
typedef struct {
    /// @brief The average value of samples in this cluster. This is a cached value that is updated whenever clusters
    /// are merged. Mean is used during the sorting of clusters and this cached value greatly improves sort speed.
    uint32_t mean;
    uint64_t sum; ///< The sum of all values in this cluster.
    int samples;  ///< The number of samples in this cluster.
} Cluster;

/**
 * @brief The main structure of the t-digest.
 */
typedef struct {
    uint32_t max_sample_value; ///< The maximum sample value. Only used in the last cluster.
    uint32_t min_sample_value; ///< The minimum sample value. Only used in the first cluster.
    int total_samples;         ///< The total number of samples in the digest. This is the sum of all cluster weights.
    int total_clusters;        ///< The total number of clusters that have been created.
    bool fully_merged;         ///< True if the digest is fully merged; false if it is not.
    int failed_count;          ///< Counter for the number of consecutive failed merges.
    Cluster clusters[MAX_CLUSTERS]; ///< Array of all clusters in the t-digest.
} TDigest;

//*********************************************************************************************************************
//*********************************************** START OF VARIABLES *************************************************
//*********************************************************************************************************************

//*********************************************************************************************************************
//******************************************* START OF STATIC FUNCTIONS **********************************************
//*********************************************************************************************************************

#ifdef DEBUG_T_DIGEST_ARRAYS
/**
 * @brief Function used to pretty print a t-Digest.
 *
 * @param td_ptr Pointer to the TDigest object to use.
 */
static void TDigestPrint(TDigest* td_ptr)
{
    printf("==== TDigest State ====\n");
    printf("Max Value: %u\n", td_ptr->max_sample_value);
    printf("Min Value: %u\n", td_ptr->min_sample_value);
    printf("Samples: %d\n", td_ptr->total_samples);
    printf("Clusters: %d [", td_ptr->total_clusters);
    for (int i = 0; i < td_ptr->total_clusters; i++) {
        printf("(%d, %lu, %u, %d)", i, td_ptr->clusters[i].sum,
               (uint32_t)(td_ptr->clusters[i].sum / td_ptr->clusters[i].samples), td_ptr->clusters[i].samples);
        if (i == (td_ptr->total_clusters - 1)) {
            printf("]\n\n");
        } else {
            printf(", ");
        }
    }
}
#endif

/**
 * @brief Function used to compare two clusters by looking at their mean values. This is an input to the qsort
 * function.
 *
 * @param operand1_ptr Pointer to the first cluster to compare.
 * @param operand2_ptr Pointer to the second cluster to compare.
 *
 * @return A negative value if the first cluster's mean is smaller, a positive value if it is larger, or zero if the
 *         means are equal.
 */
static int TDigestClusterCompare(const void* operand1_ptr, const void* operand2_ptr)
{
    Cluster* cluster1_ptr = (Cluster*)(operand1_ptr);
    Cluster* cluster2_ptr = (Cluster*)(operand2_ptr);
    if (cluster1_ptr->mean < cluster2_ptr->mean) {
        return -1;
    } else if (cluster1_ptr->mean > cluster2_ptr->mean) {
        return 1;
    } else {
        return 0;
    }
}

/**
 * @brief Function used to interpolate from input_sample1 to input_sample2 by sample_index/total_interpolation_points.
 *
 * @param input_sample1 The starting sample point for the interpolation.
 * @param input_sample2 The end sample point for the interpolation.
 * @param sample_index The numerator of the interpolation ratio (sample_index out of total_interpolation_points).
 * @param total_interpolation_points The denominator of the interpolation ratio (sample_index out of
 *                                   total_interpolation_points).
 *
 * @return The resulting value from the interpolation.
 */
static uint32_t TDigestInterpolate(const uint32_t input_sample1, const uint32_t input_sample2, int sample_index,
                                   int total_interpolation_points)
{
    uint32_t result;

    // Simple interpolation between the means of the two adjacent clusters based on the expected index of the
    // desired sample.
    uint32_t mean_delta = input_sample2 - input_sample1;
    if (mean_delta == 0) {
        result = input_sample1;
    } else {
        uint32_t left_mean_addition_num = (mean_delta * sample_index);
        uint32_t left_mean_addition = left_mean_addition_num / total_interpolation_points +
                                      ((left_mean_addition_num % total_interpolation_points != 0) ? 1 : 0);
        result = input_sample1 + left_mean_addition;
    }
    return result;
}
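
// For a concrete (purely illustrative) sense of the ceiling interpolation above, using arbitrary example values:
// TDigestInterpolate(100, 200, 1, 4) returns 100 + ceil(100 * 1 / 4) = 125, and TDigestInterpolate(100, 200, 3, 7)
// returns 100 + ceil(100 * 3 / 7) = 143. Rounding up rather than truncating means a nonzero ratio always moves the
// result at least one unit away from input_sample1.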
/**
 * @brief Function used to sort a TDigest structure.
 *
 * @param td_ptr Pointer to the TDigest object to use.
 */
static void TDigestSort(TDigest* td_ptr)
{
    qsort(td_ptr->clusters, td_ptr->total_clusters, sizeof(Cluster), &TDigestClusterCompare);
}

/**
 * @brief Function used to find the cluster index where a given sample number resides. This function walks through all
 * clusters counting samples in each one until it finds the cluster with the requested sample number.
 *
 * @param td_ptr Pointer to the TDigest object to use.
 * @param desired_sample The sample number for which we want to find the host cluster.
 * @param total_samples_ptr Pointer to a running sample count. On return it holds the total number of samples in all
 *                          clusters before the cluster selected by this function.
 *
 * @return The zero-based cluster index that is host to the desired sample.
 */
static int TDigestFindCluster(const TDigest* td_ptr, int desired_sample, int* total_samples_ptr)
{
    // Search through the clusters until we find the one that contains our desired sample.
    int i;
    for (i = 0; i < td_ptr->total_clusters; i++) {
        int next_total = *total_samples_ptr + td_ptr->clusters[i].samples;
        if (next_total >= desired_sample) {
            TDIGEST_LOG_THREAD(kLogInfo, "Found sample at cluster[%d], which has [%d] samples.", i,
                               td_ptr->clusters[i].samples);
            break;
        }
        *total_samples_ptr = next_total;
    }
    return i;
}

/**
 * @brief Function used to determine the maximum number of samples that the cluster at the given index is allowed to
 * hold.
 *
 * @param td_ptr Pointer to the TDigest object to use.
 * @param cluster_index The current cluster index.
 *
 * @return The maximum samples allowed for the given cluster index.
 */
static int TDigestGetClusterLimit(const TDigest* td_ptr, int cluster_index)
// The above-referenced whitepaper indicates that "It is also possible to avoid evaluation of k and k^−1 by
// estimating the maximum number of samples that can be in each candidate cluster directly from q. Such estimates
// typically under-estimate the number of samples allowed, especially near the tails, but the size of the t-digest
// is not substantially increased and accuracy can be somewhat increased." We therefore take this approach and
// avoid the high-cycle-cost of calculating sin or log functions for every single new cluster. The method being
// used here is a simple triangle distribution, where the center clusters are allowed to have the most samples, and
// the end clusters are allowed to have the least. The slope on either side of the center cluster is constant,
// forming a triangle.
//
//        ^          H = total samples at middle cluster
//       /|\         B = max clusters
//      / | \   H    S = total samples across all clusters
//     /__+__\       b = current cluster index: 0 .. (B-1)
//        B          h = sample limit at current cluster
//
// 1/2 * B * H = S  ==>  H = 2*S/B
// The slope of the triangle from 0 to B/2 is m = h/b = H/(B/2) = 2*H/B
// Note the slope is negative after B/2.
// Therefore, for any cluster b:
//
//     h = 4*S/(B^2) * {b      if b <  B/2
//                     {(B-b)  if b >= B/2
//
// Note that this algorithm gives some special treatment to the tails of the distribution by forcing 2% of the tail
// samples to live in single-sample clusters. This greatly improves accuracy for percentile measurements close to
// 0 or 100.
//
// Also, if we encounter a situation where all clusters are already in use, then the ability to merge can sometimes
// be limited if many clusters are already at or beyond 50% full.
// For example, if we have the following cluster sizes in adjacent clusters where the cluster size limit is 10, then
// no merging will occur:
//     { ... 5, 6, 5, 7, 5, ...}
// To remedy this situation, every time a failed merge attempt occurs, we become more aggressive about merging by
// allowing larger cluster sizes. To do this we simply use the failed_count member of TDigest as a cluster size
// multiplier.
{
    // NOTE: In this function we are using the current cluster index and MAX_MERGED_CLUSTERS to compute our location
    // within the distribution, but after a merge we may not have MAX_MERGED_CLUSTERS clusters. This can result in
    // end clusters being too full, reducing accuracy at the high end of the distribution. Logic could be improved if
    // we used 'q' (percentage of the way through the distribution) instead.

    // Multiply by 1 on the first merge attempt, then become more aggressive the more retries we have done.
    int factor_multiplier = td_ptr->failed_count + 1;
    int factor_num = 4 * td_ptr->total_samples * factor_multiplier;
    int factor_den = MAX_MERGED_CLUSTERS * MAX_MERGED_CLUSTERS;
    int cluster_limit = 1;

    // Find the maximum number of samples for this cluster index.
    int cluster_limit_num = 0;
    if (cluster_index < MAX_MERGED_CLUSTERS / 2) {
        cluster_limit_num = factor_num * (cluster_index + 1); // +1 to convert from zero-based.
    } else {
        cluster_limit_num = factor_num * (MAX_MERGED_CLUSTERS - cluster_index);
    }
    cluster_limit = CDI_MAX(cluster_limit_num / factor_den, 1);

    // Keep the tails (+/-2%) limited to 1 sample per cluster; all other clusters use the triangle-distribution limit
    // computed above.
    int tail_limit = (TAIL_PERCENT_FOR_SINGLE_SAMPLE * MAX_MERGED_CLUSTERS / 100);
    int abs_cluster_position_from_center = abs(cluster_index - (MAX_MERGED_CLUSTERS / 2));
    int abs_cluster_position_from_tail = MAX_MERGED_CLUSTERS / 2 - abs_cluster_position_from_center;
    if (abs_cluster_position_from_tail < tail_limit) {
        return 1;
    } else {
        return cluster_limit;
    }
}

/**
 * @brief Function used to merge all clusters of the t-digest. This function follows Algorithm 1 from the
 * above-referenced white paper by Dunning and Ertl, which provides a means to merge a t-digest with a list of
 * additional samples.
 *
 * @param td_ptr Pointer to the TDigest object to use.
 *
 * @return True if merge was successful; false if not.
 */
static bool TDigestMerge(TDigest* td_ptr)
{
    bool ret = true;

    // We keep trying the merge until we are successful or until we reach the maximum retry count. Each time we retry
    // merging we become more generous about how many samples we allow each cluster to contain in an effort to make
    // merging easier. See TDigestGetClusterLimit() for more details. Don't do anything if there aren't any clusters.
    while (td_ptr->total_clusters > 0 && !td_ptr->fully_merged && td_ptr->failed_count <= MAX_FAILED_MERGE_COUNT) {
#ifdef DEBUG_T_DIGEST_ARRAYS
        TDIGEST_LOG_THREAD(kLogInfo, "Unmerged Digest");
        TDigestPrint(td_ptr);
#endif
        // Sort all merged and non-merged clusters by mean. Only sort on the first merge try because we know the
        // clusters are already sorted on subsequent attempts.
        if (td_ptr->failed_count == 0) {
            TDigestSort(td_ptr);
        }

        int max_cluster_samples = 0; // The allowable samples for a given cluster.
        int accumulated_samples = 0; // The total accumulated samples so far.
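
        // As a hedged illustration of the per-cluster limits that TDigestGetClusterLimit() feeds the loop below
        // (the numbers assume, purely for illustration, that MAX_MERGED_CLUSTERS were 100, total_samples were 1000
        // and failed_count were 0; the real macro values come from the included headers):
        //     factor_num = 4 * 1000 = 4000, factor_den = 100 * 100 = 10000
        //     cluster index 0, 1, 98, 99 (near the tails) -> limit 1
        //     cluster index 10                            -> max(4000 * 11 / 10000, 1) = 4
        //     cluster index 50 (center)                   -> max(4000 * 50 / 10000, 1) = 20
        // On retries, factor_num (and therefore every limit) is scaled by failed_count + 1.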
#ifdef DEBUG_T_DIGEST_ARRAYS
        TDIGEST_LOG_THREAD(kLogInfo, "Sorted Digest");
        TDigestPrint(td_ptr);
        printf("Cluster size limits: [");
        for (int i = 0; i < MAX_MERGED_CLUSTERS; i++) {
            int max_cluster_samples = TDigestGetClusterLimit(td_ptr, i);
            if (i != MAX_MERGED_CLUSTERS - 1) {
                printf("%d, ", max_cluster_samples);
            } else {
                printf("%d]\n", max_cluster_samples);
            }
        }
#endif

        // Initialize max_cluster_samples for the first cluster.
        int cluster_index = 0; // Cluster index points to the current new cluster being built.
        max_cluster_samples = TDigestGetClusterLimit(td_ptr, cluster_index);

        // Loop through all clusters and rebuild the cluster array by combining as many input clusters as possible
        // for each output cluster. The number of output clusters will be less than or equal to the number of input
        // clusters, so we just overwrite input clusters as we create new output clusters. Note that we skip the
        // first cluster in the looping since it is our starting output cluster.
        // NOTE: We choose to always loop forward to simplify the algorithm, but the white paper
        // referenced above discusses a potential improvement to error rates near q=0 if looping is alternated
        // between forward and reverse from merge to merge.
        for (int i = 1; i < td_ptr->total_clusters; i++) {
            int proposed_cluster_samples = td_ptr->clusters[cluster_index].samples + td_ptr->clusters[i].samples;
            TDIGEST_LOG_THREAD(kLogInfo, "The current new cluster[%d] already has [%d] samples and a limit of [%d].",
                               cluster_index, td_ptr->clusters[cluster_index].samples, max_cluster_samples);
            TDIGEST_LOG_THREAD(kLogInfo, "Proposing adding [%d] more samples from cluster[%d].",
                               td_ptr->clusters[i].samples, i);
            TDIGEST_LOG_THREAD(kLogInfo, "Cluster would hold samples[%d] through[%d] of[%d], or[%d] percent of total.",
                               accumulated_samples, accumulated_samples + td_ptr->clusters[i].samples,
                               td_ptr->total_samples, accumulated_samples * 100 / td_ptr->total_samples);
            if (proposed_cluster_samples <= max_cluster_samples) {
                TDIGEST_LOG_THREAD(kLogInfo, "Adding cluster[%d] to new cluster[%d].", i, cluster_index);
                // Basically, merge this old cluster into the current new cluster by adding the old count to the new
                // count and recalculating the mean from the combined sum and count of the two clusters.
                td_ptr->clusters[cluster_index].samples += td_ptr->clusters[i].samples;
                td_ptr->clusters[cluster_index].sum += td_ptr->clusters[i].sum;
                td_ptr->clusters[cluster_index].mean = td_ptr->clusters[cluster_index].sum /
                                                       td_ptr->clusters[cluster_index].samples;
            } else {
                // Otherwise, we should create a new cluster from this input cluster.
                accumulated_samples += td_ptr->clusters[cluster_index].samples;

                // Increment the new cluster index and assign the current input cluster to this new output cluster.
                cluster_index++;
                TDIGEST_LOG_THREAD(kLogInfo, "Creating new cluster[%d].", cluster_index);
                td_ptr->clusters[cluster_index] = td_ptr->clusters[i];

                // Now look up the sample limit for this new output cluster.
                max_cluster_samples = TDigestGetClusterLimit(td_ptr, cluster_index);
            }
            TDIGEST_LOG_THREAD(kLogInfo, "Cluster[%d] now has num samples[%d], sum[%lu], and mean[%u].", cluster_index,
                               td_ptr->clusters[cluster_index].samples, td_ptr->clusters[cluster_index].sum,
                               (uint32_t)(td_ptr->clusters[cluster_index].sum /
                                          td_ptr->clusters[cluster_index].samples));
        }

        if ((cluster_index + 1) > MAX_MERGED_CLUSTERS) {
            TDIGEST_LOG_THREAD(kLogInfo, "Digest was not fully merged - attempt[%d].", td_ptr->failed_count);
            ret = false;
            td_ptr->fully_merged = false;
            td_ptr->failed_count++;
        } else {
            TDIGEST_LOG_THREAD(kLogInfo, "Merged Digest in [%d] attempts.", td_ptr->failed_count + 1);
            ret = true;
            td_ptr->fully_merged = true;
            td_ptr->failed_count = 0;
        }
        td_ptr->total_clusters = cluster_index + 1;
#ifdef DEBUG_T_DIGEST_ARRAYS
        TDIGEST_LOG_THREAD(kLogInfo, "Merged Digest");
        TDigestPrint(td_ptr);
#endif
    }

    return ret;
}

/**
 * @brief Function to run the calculation for a percentile value.
 *
 * @param td_ptr Pointer to the TDigest object to use.
 * @param percentile Integer representing the percentile being calculated.
 *
 * @return The calculated value at the given percentile.
 */
static uint32_t TDigestCalculatePercentile(TDigest* td_ptr, int percentile)
{
    uint32_t value_at_percentile = 0;
    TDIGEST_LOG_THREAD(kLogInfo, "Scanning over [%d] clusters for percentile[%d].", td_ptr->total_clusters,
                       percentile);

    bool add_one = ((percentile * td_ptr->total_samples) % 100) != 0; // Ceiling function.
    int desired_sample = (percentile * td_ptr->total_samples / 100) + (add_one ? 1 : 0);
    int total_samples = 0;
    int cluster_index = TDigestFindCluster(td_ptr, desired_sample, &total_samples);

    // Cluster found. Now interpolate.
    // If our sample is below the mean of the cluster that was found, then we interpolate between this mean and the
    // mean of the cluster before it. If our sample is above this cluster's mean, then we interpolate between the
    // mean of this cluster and the cluster after it. If the sample is in an end cluster and is on the side of the
    // mean closest to the end, then we simply use the max/min value that was saved with the digest.
    // To interpolate, find the delta between the mean of this cluster and the chosen neighbor cluster, and then find
    // the offset of the desired sample between those means. Then, simply use that ratio to find out how much to
    // add to or subtract from the mean of the cluster hosting our desired sample. If the chosen cluster is at the
    // edge of the distribution (i.e. the first or last cluster) and our desired sample is on the half of that cluster
    // nearest the tail, we use the max or min as the neighbor value for the interpolation.
    // NOTE: This all works under the assumption that samples are more or less evenly distributed around the mean...
    // hint, this is where error gets introduced. This is explained thoroughly in the white paper mentioned above.
    Cluster* this_cluster_ptr = &td_ptr->clusters[cluster_index];
    uint32_t left_mean = 0;
    uint32_t right_mean = 0;
    int num_samples_of_interest = 0;
    int sample_index = 0;
    bool is_odd = (this_cluster_ptr->samples % 2) == 1;
    bool lower_half = desired_sample <= (total_samples + ((this_cluster_ptr->samples + 1) / 2));
    bool first_cluster = (cluster_index == 0);
    TDIGEST_LOG_THREAD(kLogInfo, "The desired sample [%d/%d] for percentile[%d] is in the %s half of cluster[%d].",
                       desired_sample, td_ptr->total_samples, percentile, lower_half ?
"lower" : "upper", cluster_index); if ((this_cluster_ptr->samples == 1) || (is_odd && (desired_sample == (total_samples + (this_cluster_ptr->samples + 1) / 2)))) { // If we're pointing right at the mean then use it. For example, say we have 7 samples in clusters before // this one and we have 3 samples in this cluster and our desired sample is 9.0. If this is true then we // will get 7+(3+1)/2 = 9. This only works for odd-sized clusters. While it is possible to have an even // number of samples average out to the mean, our algorithm doesn't allow us to think of an even-sized // cluster as having any samples exactly at the mean value. Also, if the current cluster only has one // sample, then we know what the value is supposed to be, so use the mean. TDIGEST_LOG_THREAD(kLogInfo, "Selecting the cluster's mean sample[%u].", this_cluster_ptr->mean); value_at_percentile = this_cluster_ptr->mean; } else if (lower_half) { // The sample is in lower half of the cluster. // If the desired sample is in the lower half of the cluster, then we are interpolating between this one // and the previous one. if (first_cluster) { if (this_cluster_ptr->samples < 3) { // If this is the first cluster, and the number of samples is below 3, we can assume the one to the // left of center is the t-digest's min_sample_value. TDIGEST_LOG_THREAD(kLogInfo, "Selecting the digest's minimum sample."); value_at_percentile = td_ptr->min_sample_value; } else { // If this is the first cluster, and the number of samples is more than 2, then we interpolate as // usual, but use the min_sample_value as the left mean. left_mean = td_ptr->min_sample_value; right_mean = td_ptr->clusters[cluster_index].mean; num_samples_of_interest = td_ptr->clusters[cluster_index].samples / 2; sample_index = desired_sample; TDIGEST_LOG_THREAD(kLogInfo, "Interpolating to find sample[%d] over [%d] samples between[%u] " "and[%u].", sample_index, num_samples_of_interest, left_mean, right_mean); } } else { num_samples_of_interest = (td_ptr->clusters[cluster_index-1].samples + this_cluster_ptr->samples) / 2; left_mean = td_ptr->clusters[cluster_index-1].mean; right_mean = this_cluster_ptr->mean; sample_index = desired_sample - (total_samples - td_ptr->clusters[cluster_index-1].samples / 2); TDIGEST_LOG_THREAD(kLogInfo, "Interpolating to find sample[%d] over [%d] samples between[%u] " "and[%u].", sample_index, num_samples_of_interest, left_mean, right_mean); } } else { // The sample is in the upper half of the cluster. // If the desired sample is in the upper half of the cluster, then we are interpolating between this one // and the next one. if ((cluster_index + 1) == td_ptr->total_clusters) { // Last cluster. TDIGEST_LOG_THREAD(kLogInfo, "Sample is in the last cluster."); if (this_cluster_ptr->samples < 3) { // If this is the last cluster, and the number of samples is below 3, we can assume the one to the // right of center is the t-digest's max_sample_value. TDIGEST_LOG_THREAD(kLogInfo, "Selecting the digest's maximum sample."); value_at_percentile = td_ptr->max_sample_value; } else { // If this is the last cluster, and the number of samples is more than 2, then we interpolate as // usual, but use the min_sample_value as the left mean. 
                left_mean = this_cluster_ptr->mean;
                right_mean = td_ptr->max_sample_value;
                num_samples_of_interest = td_ptr->clusters[cluster_index].samples / 2;
                sample_index = desired_sample - (total_samples + td_ptr->clusters[cluster_index].samples / 2);
                TDIGEST_LOG_THREAD(kLogInfo, "Interpolating to find sample[%d] over [%d] samples between[%u] "
                                   "and[%u].", sample_index, num_samples_of_interest, left_mean, right_mean);
            }
        } else { // Not the last cluster.
            num_samples_of_interest = (td_ptr->clusters[cluster_index+1].samples + this_cluster_ptr->samples) / 2;
            left_mean = this_cluster_ptr->mean;
            right_mean = td_ptr->clusters[cluster_index+1].mean;
            sample_index = desired_sample - (total_samples + td_ptr->clusters[cluster_index].samples / 2);
            TDIGEST_LOG_THREAD(kLogInfo, "Interpolating to find sample[%d] over [%d] samples between[%u] "
                               "and [%u].", sample_index, num_samples_of_interest, left_mean, right_mean);
        }
    }

    // For some simple cases, value_at_percentile has already been stored. For the rest, do it here.
    // Also, never divide by zero! Both cases are covered by the num_samples_of_interest check, since that
    // variable is set above only when value_at_percentile is not already set.
    if (num_samples_of_interest != 0) {
        value_at_percentile = TDigestInterpolate(left_mean, right_mean, sample_index, num_samples_of_interest);
    }

    // Make sure we don't go beyond the min or max.
    if (value_at_percentile < td_ptr->min_sample_value) {
        value_at_percentile = td_ptr->min_sample_value;
    } else if (value_at_percentile > td_ptr->max_sample_value) {
        value_at_percentile = td_ptr->max_sample_value;
    }

    return value_at_percentile;
}

//*********************************************************************************************************************
//******************************************* START OF PUBLIC FUNCTIONS **********************************************
//*********************************************************************************************************************

bool TDigestCreate(TDigestHandle* ret_td_handle_ptr)
{
    TDigest* td_temp_ptr;
    td_temp_ptr = CdiOsMemAllocZero(sizeof(TDigest));
    if (NULL == td_temp_ptr) {
        return false;
    }

    TDigestClear((TDigestHandle)(td_temp_ptr));
    *ret_td_handle_ptr = (TDigestHandle)td_temp_ptr;
    return true;
}

void TDigestDestroy(TDigestHandle td_handle)
{
    if (td_handle) {
        CdiOsMemFree(td_handle);
    }
}

void TDigestClear(TDigestHandle td_handle)
{
    if (td_handle) {
        TDigest* td_ptr = (TDigest*)td_handle;
        td_ptr->min_sample_value = MAX_POSSIBLE_SAMPLE_VALUE;
        td_ptr->max_sample_value = 0;
        td_ptr->total_samples = 0;
        td_ptr->total_clusters = 0;
        td_ptr->fully_merged = true;
        td_ptr->failed_count = 0;
    }
}

void TDigestAddSample(TDigestHandle td_handle, uint32_t value)
{
    if (td_handle) {
        TDigest* td_ptr = (TDigest*)td_handle;
        TDIGEST_LOG_THREAD(kLogInfo, "Adding new value[%u] to digest.", value);

        // Can't add samples if there are no unused clusters available. This should never happen unless a merge has
        // failed.
        if (td_ptr->total_clusters < MAX_CLUSTERS) {
            td_ptr->clusters[td_ptr->total_clusters] = (Cluster) {
                .mean = value,
                .sum = value,
                .samples = 1,
            };

            // Check max and min values and update digest accordingly.
            if (value > td_ptr->max_sample_value) {
                TDIGEST_LOG_THREAD(kLogInfo, "Found new digest maximum[%u].", value);
                td_ptr->max_sample_value = value;
            }
            if (value < td_ptr->min_sample_value) {
                TDIGEST_LOG_THREAD(kLogInfo, "Found new digest minimum[%u].", value);
                td_ptr->min_sample_value = value;
            }
            td_ptr->total_clusters++;
            td_ptr->total_samples++;
            td_ptr->fully_merged = false;
        } else {
            TDIGEST_LOG_THREAD(kLogFatal, "Failed to add new value[%u] to digest because there's no more space.",
                               value);
        }

        // If we have now used all clusters, merge what can be merged to make space for more samples.
        if (td_ptr->total_clusters >= MAX_CLUSTERS) {
            if (!TDigestMerge(td_ptr)) {
                TDIGEST_LOG_THREAD(kLogFatal, "Failed to merge digest.");
            }
        }
    }
}

bool TDigestGetPercentileValue(TDigestHandle td_handle, int percentile, uint32_t* value_at_percentile_ptr)
{
    if (NULL == td_handle) {
        return false;
    }
    TDigest* td_ptr = (TDigest*)td_handle;
    bool return_val = true;

    // Make sure the t-digest is fully merged before proceeding. If it is not fully merged, then there has been at
    // least one single-sample cluster added to the end of the clusters array.
    if (!td_ptr->fully_merged) {
        if (!TDigestMerge(td_ptr)) {
            TDIGEST_LOG_THREAD(kLogFatal, "Failed to merge digest.");
        }
    }

    if (td_ptr->total_clusters == 0) {
        // If no clusters have been added yet, then we have nothing to compute and no valid value to return.
        return_val = false;
    } else if (percentile == 0) {
        // Skip computation and just return the minimum sample.
        *value_at_percentile_ptr = td_ptr->min_sample_value;
    } else if (percentile == 100) {
        // Skip computation and just return the maximum sample.
        *value_at_percentile_ptr = td_ptr->max_sample_value;
    } else if (percentile < 0 || percentile > 100) {
        // Don't do anything if an illegal percentile value is passed in (percentile < 0 or percentile > 100).
        CDI_LOG_THREAD(kLogError, "Illegal percentile request[%d]. Valid requests are between 0 and 100.", percentile);
        return_val = false;
    } else {
        // Compute the percentile value.
        *value_at_percentile_ptr = TDigestCalculatePercentile(td_ptr, percentile);
    }

    return return_val;
}

int TDigestGetCount(TDigestHandle td_handle)
{
    TDigest* td_ptr = (TDigest*)td_handle;
    return td_ptr->total_samples;
}
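
/*
 * The sketch below is an illustrative usage example only; it is excluded from the build and is not part of the
 * library. It shows the intended calling sequence of the public API defined above: create a digest, add samples,
 * query a percentile value and the sample count, then destroy the digest. The example function name, the sample
 * values, and the chosen percentile are arbitrary assumptions made for the example.
 */
#if 0
static void TDigestUsageExampleSketch(void)
{
    TDigestHandle digest_handle = NULL;
    if (!TDigestCreate(&digest_handle)) {
        return; // Allocation failed.
    }

    // Add some arbitrary example samples (e.g. latency measurements in microseconds).
    for (uint32_t value = 1; value <= 1000; value++) {
        TDigestAddSample(digest_handle, value);
    }

    // Query a percentile. The function returns false if the digest is empty or the percentile is out of range.
    uint32_t p90_value = 0;
    if (TDigestGetPercentileValue(digest_handle, 90, &p90_value)) {
        printf("Digest holds [%d] samples; estimated P90 value is [%u].\n", TDigestGetCount(digest_handle),
               p90_value);
    }

    // TDigestClear() could be used here instead to reuse the digest for a new measurement window.
    TDigestDestroy(digest_handle);
}
#endif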