use std::cmp::min;
use std::ops::Index;
use std::slice::SliceIndex;
use crate::types::Result;
use crate::errors;
use crate::util::check_argument;
use rand::{Rng, SeedableRng};
use rand_chacha::ChaCha20Rng;
use rayon::range;
use crate::common::cluster::{Center, multi_cluster_as_weighted_obj, multi_cluster_as_weighted_ref, single_centroid_cluster_weighted_vec_with_distance_over_slices};

/// The goal of the summarization below is as follows: given a collection of sampled weighted points
/// represented by a slice &[(Vec, f32)], where each of the Vec has the same length/dimension
/// and the f32 in the pair is the corresponding weight.
/// The algorithm uses the philosophy of RCFs, in repeatedly using randomization. It proceeds as follows:
/// 1. It uses an initial sampling which serves as a basis of efficiency as well as denoising, borrowing from
/// https://en.wikipedia.org/wiki/CURE_algorithm, in that algorithm's robustness to outliers.
/// 2. It uses a sampling mechanism to initialize some clusters based on https://en.wikipedia.org/wiki/Data_stream_clustering
/// where the random sampling achieves half of the same effects as hierarchical compression.
/// 3. It repeatedly merges the most overlapping clusters and, failing that, eliminates the least weighted cluster to achieve
/// the same effect as hierarchical compression.
///
/// The algorithm takes a distance function as an input, and tends to produce spherical (measured in the input
/// distance function) clusters. These types of algorithms are unlikely to be useful for a large number of output clusters.
/// The output is the SampleSummary, which provides basic statistics of mean, median and deviation /// in addition it performs a grouping/clustering, assuming that the maximum number of clusters are not large /// the routine below bounds the number to be max_number_per_dimension times the dimension of Vec /// and a smaller number can also be provided in the summarize() function /// /// const MAX_NUMBER_PER_DIMENSION: usize = 5; const PHASE2_THRESHOLD: usize = 2; const LENGTH_BOUND: usize = 1000; const UPPER_FRACTION : f64 = 0.9; const LOWER_FRACTION : f64 = 0.1; #[repr(C)] pub struct SampleSummary { pub summary_points: Vec>, // a measure of comparison among the typical points; pub relative_weight: Vec, // number of samples, often the number of summary, but can handle weighted points // (possibly indicating confidence or othe measure) in the future pub total_weight: f32, // the global mean, median pub mean: Vec, pub median: Vec, // percentiles and bounds pub upper: Vec, pub lower : Vec, // This is the global deviation, pub deviation: Vec, } impl SampleSummary { pub fn new( total_weight: f32, summary_points: Vec>, relative_weight: Vec, median: Vec, mean: Vec, upper: Vec, lower: Vec, deviation: Vec, ) -> Self { SampleSummary { total_weight, summary_points: summary_points.clone(), relative_weight: relative_weight.clone(), median: median.clone(), mean: mean.clone(), upper: upper.clone(), lower: lower.clone(), deviation: deviation.clone(), } } pub fn add_typical(&mut self, summary_points: Vec>, relative_weight: Vec) { self.summary_points = summary_points.clone(); self.relative_weight = relative_weight.clone(); } pub fn pick(weighted_points : &[(f32,f32)], weight: f64, start: usize, initial_weight : f64) -> (usize,f64) { let mut running = initial_weight; let mut index = start; while index + 1 < weighted_points.len() && weighted_points[index].1 as f64 + running < weight { running += weighted_points[index].1 as f64; index += 1; } (index, running) } pub fn 
from_points(dimensions: usize,points: &[(Q, f32)], lower_fraction: f64, upper_fraction:f64) -> Result where Q: Index { check_argument(points.len() > 0, "cannot be empty list")?; check_argument(lower_fraction < 0.5, " has to be less than half")?; check_argument(upper_fraction > 0.5, "has to be larger than half")?; check_argument(dimensions > 0, " cannot have 0 dimensions")?; let total_weight: f64 = points.iter().map(|x| x.1 as f64).sum(); check_argument(total_weight > 0.0, "weights cannot be all zero")?; check_argument(total_weight.is_finite(), " cannot have infinite weights")?; let mut mean = vec![0.0f32; dimensions]; let mut deviation = vec![0.0f32; dimensions]; let mut sum_values_sq = vec![0.0f64; dimensions]; let mut sum_values = vec![0.0f64; dimensions]; for i in 0..points.len() { check_argument(points[i].1 >= 0.0, "point weights have to be non-negative")?; for j in 0..dimensions { check_argument( points[i].0[j].is_finite() && !points[i].0[j].is_nan(), " cannot have NaN or infinite values" )?; sum_values[j] += points[i].1 as f64 * points[i].0[j] as f64; sum_values_sq[j] += points[i].1 as f64 * points[i].0[j] as f64 * points[i].0[j] as f64; } } for j in 0..dimensions { mean[j] = (sum_values[j] / total_weight) as f32; let t: f64 = sum_values_sq[j] / total_weight - sum_values[j] * sum_values[j] / (total_weight * total_weight); deviation[j] = f64::sqrt(if t > 0.0 { t } else { 0.0 }) as f32; } let mut median = vec![0.0f32; dimensions]; let mut upper_vec = vec![0.0f32;dimensions]; let mut lower_vec = vec![0.0f32;dimensions]; let num = total_weight/2.0; let lower = total_weight * lower_fraction; let upper = total_weight * upper_fraction; for j in 0..dimensions { let mut y: Vec<(f32,f32)> = points.iter().map(|x| (x.0[j],x.1)).collect(); y.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap()); let first = Self::pick(&y,lower,0,0.0); lower_vec[j] = y[first.0].0; let second = Self::pick(&y,num,first.0,first.1); median[j] = y[second.0].0; let third = 
Self::pick(&y,upper,second.0,second.1); upper_vec[j] = y[third.0].0; } Ok(SampleSummary { summary_points: Vec::new(), relative_weight: Vec::new(), total_weight: total_weight as f32, mean, upper: upper_vec, lower: lower_vec, median, deviation, }) } pub fn from_references(dimensions: usize, points: &[(&Q, f32)], lower_fraction: f64, upper_fraction:f64) -> Result where Q:?Sized + Index { check_argument(points.len() > 0, "cannot be empty list")?; check_argument(lower_fraction < 0.5, " has to be less than half")?; check_argument(upper_fraction > 0.5, "has to be larger than half")?; check_argument(dimensions > 0, " cannot have 0 dimensions")?; let total_weight: f64 = points.iter().map(|x| x.1 as f64).sum(); check_argument(total_weight > 0.0, "weights cannot be all zero")?; check_argument(total_weight.is_finite(), " cannot have infinite weights")?; let mut mean = vec![0.0f32; dimensions]; let mut deviation = vec![0.0f32; dimensions]; let mut sum_values_sq = vec![0.0f64; dimensions]; let mut sum_values = vec![0.0f64; dimensions]; for i in 0..points.len() { check_argument(points[i].1 >= 0.0, "point weights have to be non-negative")?; for j in 0..dimensions { check_argument( points[i].0[j].is_finite() && !points[i].0[j].is_nan(), " cannot have NaN or infinite values" )?; sum_values[j] += points[i].1 as f64 * points[i].0[j] as f64; sum_values_sq[j] += points[i].1 as f64 * points[i].0[j] as f64 * points[i].0[j] as f64; } } for j in 0..dimensions { mean[j] = (sum_values[j] / total_weight) as f32; let t: f64 = sum_values_sq[j] / total_weight - sum_values[j] * sum_values[j] / (total_weight * total_weight); deviation[j] = f64::sqrt(if t > 0.0 { t } else { 0.0 }) as f32; } let mut median = vec![0.0f32; dimensions]; let mut upper_vec = vec![0.0f32;dimensions]; let mut lower_vec = vec![0.0f32;dimensions]; let num = total_weight/2.0; let lower = total_weight * lower_fraction; let upper = total_weight * upper_fraction; for j in 0..dimensions { let mut y: Vec<(f32,f32)> = 
points.iter().map(|x| (x.0[j],x.1)).collect(); y.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap()); let first = Self::pick(&y,lower,0,0.0); lower_vec[j] = y[first.0].0; let second = Self::pick(&y,num,first.0,first.1); median[j] = y[second.0].0; let third = Self::pick(&y,upper,second.0,second.1); upper_vec[j] = y[third.0].0; } Ok(SampleSummary { summary_points: Vec::new(), relative_weight: Vec::new(), total_weight: total_weight as f32, mean, upper: upper_vec, lower: lower_vec, median, deviation, }) } } pub fn summarize( points: &[(Vec, f32)], distance: fn(&[f32], &[f32]) -> f64, max_number: usize, parallel_enabled: bool, ) -> Result { let dimensions = points[0].0.len(); let mut summary = SampleSummary::from_points(dimensions,&points,LOWER_FRACTION,UPPER_FRACTION)?; if max_number > 0 { let max_allowed = min(dimensions * MAX_NUMBER_PER_DIMENSION, max_number); let mut list: Vec
= single_centroid_cluster_weighted_vec_with_distance_over_slices( &points, distance, max_allowed, parallel_enabled, )?; list.sort_by(|o1, o2| o2.weight().partial_cmp(&o1.weight()).unwrap()); // decreasing order let mut summary_points: Vec> = Vec::new(); let mut relative_weight: Vec = Vec::new(); let center_sum: f64 = list.iter().map(|x| x.weight()).sum(); for i in 0..list.len() { summary_points.push(list[i].representative().clone()); relative_weight.push((list[i].weight() / center_sum) as f32); } summary.add_typical(summary_points, relative_weight); } return Ok(summary); } pub fn multi_summarize_ref( points: &[(&[f32], f32)], distance: fn(&[f32], &[f32]) -> f64, number_of_representatives: usize, shrinkage : f32, max_number: usize, parallel_enabled: bool, ) -> Result { let dimensions = points[0].0.len(); let mut summary = SampleSummary::from_references(dimensions,points,LOWER_FRACTION,UPPER_FRACTION)?; if max_number > 0 { let max_allowed = min(dimensions * MAX_NUMBER_PER_DIMENSION, max_number); let mut list= multi_cluster_as_weighted_ref( &points, distance, number_of_representatives, shrinkage, false, max_allowed, parallel_enabled, )?; list.sort_by(|o1, o2| o2.weight().partial_cmp(&o1.weight()).unwrap()); // decreasing order let mut summary_points: Vec> = Vec::new(); let mut relative_weight: Vec = Vec::new(); let center_sum: f64 = list.iter().map(|x| x.weight()).sum(); for i in 0..list.len() { summary_points.push(Vec::from(list[i].representatives()[0].0)); relative_weight.push((list[i].weight() / center_sum) as f32); } summary.add_typical(summary_points, relative_weight); } return Ok(summary); }