diff --git a/crates/geo_filters/evaluation/accuracy.md b/crates/geo_filters/evaluation/accuracy.md index 2aeb646..ed7da65 100644 --- a/crates/geo_filters/evaluation/accuracy.md +++ b/crates/geo_filters/evaluation/accuracy.md @@ -4,6 +4,8 @@ ![geo_diff_7](./accuracy/geo_diff_7.png) +![geo_diff_10](./accuracy/geo_diff_10.png) + ![geo_diff_13](./accuracy/geo_diff_13.png) ## Geometric Distinct Count diff --git a/crates/geo_filters/evaluation/accuracy.rs b/crates/geo_filters/evaluation/accuracy.rs index 6248f07..1638bb6 100644 --- a/crates/geo_filters/evaluation/accuracy.rs +++ b/crates/geo_filters/evaluation/accuracy.rs @@ -9,7 +9,7 @@ use once_cell::sync::Lazy; use regex::{Captures, Regex}; use std::sync::Arc; -use geo_filters::diff_count::{GeoDiffCount, GeoDiffCount13, GeoDiffCount7}; +use geo_filters::diff_count::{GeoDiffCount, GeoDiffCount10, GeoDiffCount13, GeoDiffCount7}; use geo_filters::distinct_count::{GeoDistinctCount, GeoDistinctCount13, GeoDistinctCount7}; use geo_filters::evaluation::hll::{Hll, Hll14, Hll8, VariableHllConfig}; use geo_filters::evaluation::simulation::{ @@ -30,6 +30,7 @@ fn main() { #[clap(after_help = "\x1b[1;4;37mConfigurations:\x1b[0;37m geo_diff/BUCKET_TYPE/b=N/bytes=N/msb=N geo_diff_7 + geo_diff_10 geo_diff_13 geo_distinct/BUCKET_TYPE/b=N/bytes=N/msb=N geo_distinct_7 @@ -44,6 +45,7 @@ fn main() { #[clap(after_long_help = "\x1b[1;4;37mConfigurations:\x1b[0;37m geo_diff/BUCKET_TYPE/b=N/bytes=N/msb=N Diff count with the given parameters geo_diff_7 Predefined configuration for b=7 + geo_diff_10 Predefined configuration for b=10 geo_diff_13 Predefined configuration for b=13 geo_distinct/BUCKET_TYPE/b=N/bytes=N/msb=N Distinct count with the given parameters @@ -180,6 +182,9 @@ static SIMULATION_CONFIG_FROM_STR: Lazy> = Lazy::new SimulationConfigParser::new(r#"geo_diff_7"#, |_| { Box::new(|| Box::new(GeoDiffCount7::default())) }), + SimulationConfigParser::new(r#"geo_diff_10"#, |_| { + Box::new(|| Box::new(GeoDiffCount10::default())) + }), SimulationConfigParser::new(r#"geo_diff_13"#, |_| { Box::new(|| Box::new(GeoDiffCount13::default())) }), diff --git a/crates/geo_filters/evaluation/accuracy/geo_diff_10.png b/crates/geo_filters/evaluation/accuracy/geo_diff_10.png new file mode 100644 index 0000000..879758e Binary files /dev/null and b/crates/geo_filters/evaluation/accuracy/geo_diff_10.png differ diff --git a/crates/geo_filters/evaluation/accuracy/geo_diff_13.png b/crates/geo_filters/evaluation/accuracy/geo_diff_13.png index 1fd11d1..a6fa1cc 100644 Binary files a/crates/geo_filters/evaluation/accuracy/geo_diff_13.png and b/crates/geo_filters/evaluation/accuracy/geo_diff_13.png differ diff --git a/crates/geo_filters/evaluation/accuracy/geo_diff_7.png b/crates/geo_filters/evaluation/accuracy/geo_diff_7.png index b00b1c8..8f002cc 100644 Binary files a/crates/geo_filters/evaluation/accuracy/geo_diff_7.png and b/crates/geo_filters/evaluation/accuracy/geo_diff_7.png differ diff --git a/crates/geo_filters/evaluation/accuracy/geo_distinct_13.png b/crates/geo_filters/evaluation/accuracy/geo_distinct_13.png index ebeee17..e7530cb 100644 Binary files a/crates/geo_filters/evaluation/accuracy/geo_distinct_13.png and b/crates/geo_filters/evaluation/accuracy/geo_distinct_13.png differ diff --git a/crates/geo_filters/evaluation/accuracy/geo_distinct_7.png b/crates/geo_filters/evaluation/accuracy/geo_distinct_7.png index ac89387..6218bd4 100644 Binary files a/crates/geo_filters/evaluation/accuracy/geo_distinct_7.png and b/crates/geo_filters/evaluation/accuracy/geo_distinct_7.png differ diff --git a/crates/geo_filters/evaluation/accuracy/hll_14.png b/crates/geo_filters/evaluation/accuracy/hll_14.png index a4d6e78..2ed6947 100644 Binary files a/crates/geo_filters/evaluation/accuracy/hll_14.png and b/crates/geo_filters/evaluation/accuracy/hll_14.png differ diff --git a/crates/geo_filters/evaluation/accuracy/hll_8.png b/crates/geo_filters/evaluation/accuracy/hll_8.png index 184b682..84b78e8 100644 Binary files a/crates/geo_filters/evaluation/accuracy/hll_8.png and b/crates/geo_filters/evaluation/accuracy/hll_8.png differ diff --git a/crates/geo_filters/scripts/generate-accuracy-plots b/crates/geo_filters/scripts/generate-accuracy-plots index b63a423..6f07dae 100755 --- a/crates/geo_filters/scripts/generate-accuracy-plots +++ b/crates/geo_filters/scripts/generate-accuracy-plots @@ -7,14 +7,14 @@ set -eu plots_dir=evaluation/accuracy cargo run --release --features evaluation --bin accuracy -- \ - -o accuracy.csv -n 10000 -m 5000000 geo_diff_{7,13} geo_distinct_{7,13} hll_{8,14} "$@" + -o accuracy.csv -n 10000 -m 5000000 geo_diff_{7,10,13} geo_distinct_{7,13} hll_{8,14} "$@" evaluation/plot-accuracy.r accuracy.csv rm -f "$plots_dir"/* idx=0 -for c in geo_diff_{7,13} geo_distinct_{7,13} hll_{8,14}; do +for c in geo_diff_{7,10,13} geo_distinct_{7,13} hll_{8,14}; do echo "plot $c" convert -density 300 accuracy.pdf[$idx] -resize 1024x1024 -alpha remove -alpha off "$plots_dir/$c.png" idx=$(($idx + 1)) diff --git a/crates/geo_filters/src/diff_count.rs b/crates/geo_filters/src/diff_count.rs index 448589e..9084eaf 100644 --- a/crates/geo_filters/src/diff_count.rs +++ b/crates/geo_filters/src/diff_count.rs @@ -17,12 +17,15 @@ mod config; mod sim_hash; use bitvec::*; -pub use config::{GeoDiffConfig13, GeoDiffConfig7}; +pub use config::{GeoDiffConfig10, GeoDiffConfig13, GeoDiffConfig7}; pub use sim_hash::SimHash; /// Diff count filter with a relative error standard deviation of ~0.125. pub type GeoDiffCount7<'a> = GeoDiffCount<'a, GeoDiffConfig7>; +/// Diff count filter with a relative error standard deviation of ~0.04. +pub type GeoDiffCount10<'a> = GeoDiffCount<'a, GeoDiffConfig10>; + /// Diff count filter with a relative error standard deviation of ~0.015. pub type GeoDiffCount13<'a> = GeoDiffCount<'a, GeoDiffConfig13>; diff --git a/crates/geo_filters/src/diff_count/config.rs b/crates/geo_filters/src/diff_count/config.rs index 365c04d..689004b 100644 --- a/crates/geo_filters/src/diff_count/config.rs +++ b/crates/geo_filters/src/diff_count/config.rs @@ -20,6 +20,18 @@ use crate::Diff; // pub type GeoDiffConfig7 = FixedConfig; +/// Diff count configuration with a relative error standard deviation of ~0.04. +// +// Precision evaluation: +// +// scripts/accuracy -n 5000 geo_diff/u32/b=10/bytes={768,832,896,960,1024}/msb=64 +// +// Most-significant bytes evaluation: +// +// scripts/accuracy -n 5000 geo_diff/u32/b=10/bytes=896/msb={32,48,64,80,96,128} +// +pub type GeoDiffConfig10 = FixedConfig; + /// Diff count configuration with a relative error standard deviation of ~0.015. // // Precision evaluation: @@ -187,6 +199,25 @@ mod tests { ); } + #[test] + fn test_estimation_lut_10() { + let c = GeoDiffConfig10::::default(); + let err = (0..5000) + .step_by(10) + .map(|i| { + let a = c.expected_items(i); // uses estimation lookup + let b = estimate_count(c.phi_f64(), i as f64, expected_diff_buckets).0 as f32; + (a - b).abs() / a.max(1.0) + }) + .reduce(f32::max) + .expect("a value"); + let bound = 0.00035; + assert!( + (err - bound).abs() < 0.5e-4, + "found max error {err}, expected {bound}" + ); + } + #[test] fn test_estimation_lut_13() { let c = GeoDiffConfig13::::default();