diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 1c661744e0867..e1e669688184d 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -1206,23 +1206,23 @@ async fn window_using_aggregates() -> Result<()> { | | | | | | | | 1 | -85 | | -85 | -101 | 14 | -12 | -12 | 83 | -101 | 4 | -54 | | -85 | -101 | 17 | -25 | -25 | 83 | -101 | 5 | -31 | - | -85 | -12 | 10 | -32 | -34 | 83 | -85 | 3 | 13 | + | -85 | -12 | 10 | -43 | -34 | 83 | -85 | 3 | 13 | | -85 | -25 | 3 | -56 | -56 | -25 | -85 | 1 | -5 | - | -85 | -31 | 18 | -29 | -28 | 83 | -101 | 5 | 36 | + | -85 | -31 | 18 | -31 | -28 | 83 | -101 | 5 | 36 | | -85 | -38 | 16 | -25 | -25 | 83 | -101 | 4 | 65 | | -85 | -43 | 7 | -43 | -43 | 83 | -85 | 2 | 45 | - | -85 | -48 | 6 | -35 | -36 | 83 | -85 | 2 | -43 | - | -85 | -5 | 4 | -37 | -40 | -5 | -85 | 1 | 83 | - | -85 | -54 | 15 | -17 | -18 | 83 | -101 | 4 | -38 | - | -85 | -56 | 2 | -70 | -70 | -56 | -85 | 1 | -25 | + | -85 | -48 | 6 | -48 | -36 | 83 | -85 | 2 | -43 | + | -85 | -5 | 4 | -56 | -40 | -5 | -85 | 1 | 83 | + | -85 | -54 | 15 | -25 | -18 | 83 | -101 | 4 | -38 | + | -85 | -56 | 2 | -85 | -70 | -56 | -85 | 1 | -25 | | -85 | -72 | 9 | -43 | -43 | 83 | -85 | 3 | -12 | | -85 | -85 | 1 | -85 | -85 | -85 | -85 | 1 | -56 | - | -85 | 13 | 11 | -17 | -18 | 83 | -85 | 3 | 14 | + | -85 | 13 | 11 | -25 | -18 | 83 | -85 | 3 | 14 | | -85 | 13 | 11 | -25 | -25 | 83 | -85 | 3 | 13 | | -85 | 14 | 12 | -12 | -12 | 83 | -85 | 3 | 17 | - | -85 | 17 | 13 | -11 | -8 | 83 | -85 | 4 | -101 | - | -85 | 45 | 8 | -34 | -34 | 83 | -85 | 3 | -72 | - | -85 | 65 | 17 | -17 | -18 | 83 | -101 | 5 | -101 | + | -85 | 17 | 13 | -12 | -8 | 83 | -85 | 4 | -101 | + | -85 | 45 | 8 | -43 | -34 | 83 | -85 | 3 | -72 | + | -85 | 65 | 17 | -25 | -18 | 83 | -101 | 5 | -101 | | -85 | 83 | 5 | -25 | -25 | 83 | -85 | 2 | -48 | +-------------+----------+-----------------+---------------+--------+-----+------+----+------+ " diff --git a/datafusion/functions-aggregate-common/src/tdigest.rs b/datafusion/functions-aggregate-common/src/tdigest.rs index a7450f0eb52e9..77b527f2ef7b4 100644 --- a/datafusion/functions-aggregate-common/src/tdigest.rs +++ b/datafusion/functions-aggregate-common/src/tdigest.rs @@ -434,6 +434,10 @@ impl TDigest { return 0.0; } + // No compression happened since each centroid = one data point, so use exact percentile instead of interpolation + if self.count as usize == self.centroids.len() { + return self.exact_quantile(q); + } let rank = q * self.count; let mut pos: usize; @@ -509,6 +513,20 @@ impl TDigest { Self::clamp(value, min, max) } + fn exact_quantile(&self, q: f64) -> f64 { + if q <= 0.0 { + return self.min(); + } + if q >= 1.0 { + return self.max(); + } + + let n = self.centroids.len(); + let idx = (q * n as f64).ceil() as usize; + let idx = idx.saturating_sub(1).min(n - 1); + self.centroids[idx].mean() + } + /// This method decomposes the [`TDigest`] and its [`Centroid`] instances /// into a series of primitive scalar values. /// diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 02323671638c7..7157298c39429 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -1351,7 +1351,7 @@ DROP TABLE median_window_test; query RT select approx_median(arrow_cast(col_f32, 'Float16')), arrow_typeof(approx_median(arrow_cast(col_f32, 'Float16'))) from median_table; ---- -2.75 Float16 +2.1992188 Float16 # This shouldn't be NaN, see: # https://github.com/apache/datafusion/issues/18945 @@ -1369,7 +1369,7 @@ select arrow_typeof(approx_percentile_cont(0.5) within group (order by arrow_cast(col_f32, 'Float16'))) from median_table; ---- -2.75 Float16 +2.1992188 Float16 query ?T select approx_median(NULL), arrow_typeof(approx_median(NULL)) from median_table; @@ -1388,7 +1388,7 @@ select median(c), arrow_typeof(median(c)) from t; query RT select approx_median(c), arrow_typeof(approx_median(c)) from t; ---- -0.00035 Float64 +0.0003 Float64 statement ok drop table t; @@ -2319,33 +2319,33 @@ b NULL NULL 7732.315789473684 query TI SELECT c1, approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 73 +a 65 b 68 -c 122 -d 124 -e 115 +c 118 +d 125 +e 112 # csv_query_approx_percentile_cont_with_weight (should be the same as above) query TI SELECT c1, approx_percentile_cont(c3, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 73 +a 65 b 68 -c 122 -d 124 -e 115 +c 118 +d 125 +e 112 # using approx_percentile_cont on 2 columns with same signature query TII SELECT c1, approx_percentile_cont(c2, 0.95) AS c2, approx_percentile_cont(c3, 0.95) AS c3 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 5 73 +a 5 65 b 5 68 -c 5 122 -d 5 124 -e 5 115 +c 5 118 +d 5 125 +e 5 112 # error is unique to this UDAF query TRR @@ -2363,50 +2363,50 @@ query TI SELECT c1, approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c3 DESC) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- a -101 -b -114 -c -109 -d -98 -e -93 +b -117 +c -107 +d -99 +e -86 # csv_query_approx_percentile_cont_with_weight (2) query TI SELECT c1, approx_percentile_cont_with_weight(1, 0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 73 +a 65 b 68 -c 122 -d 124 -e 115 +c 118 +d 125 +e 112 # csv_query_approx_percentile_cont_with_weight alternate syntax query TI SELECT c1, approx_percentile_cont_with_weight(c3, 1, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 73 +a 65 b 68 -c 122 -d 124 -e 115 +c 118 +d 125 +e 112 query TI SELECT c1, approx_percentile_cont_with_weight(1, 0.95) WITHIN GROUP (ORDER BY c3 DESC) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- a -101 -b -114 -c -109 -d -98 -e -93 +b -117 +c -107 +d -99 +e -86 # csv_query_approx_percentile_cont_with_histogram_bins query TI SELECT c1, approx_percentile_cont(0.95, 200) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 73 +a 65 b 68 -c 122 -d 124 -e 115 +c 118 +d 125 +e 112 query TI SELECT c1, approx_percentile_cont_with_weight(c2, 0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 @@ -8804,9 +8804,9 @@ FROM (SELECT * FROM stream_test ORDER BY g LIMIT 10000) GROUP BY g ORDER BY g; ---- -1 2 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5 -2 2 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5 -3 2 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5 +1 2 1 1.5 1.5 1 1 1.5 1 1 +2 2 1 1.5 1.5 1 1 1.5 1 1 +3 2 1 1.5 1.5 1 1 1.5 1 1 # Config reset diff --git a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt index e12ac5782e3a4..941787c7dc9a3 100644 --- a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt +++ b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt @@ -321,11 +321,11 @@ SELECT c2, median(c5), median(c11) FROM aggregate_test_100 GROUP BY c2 ORDER BY query IIR SELECT c2, approx_median(c5), approx_median(c11) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2; ---- -1 191655437 0.59926736 -2 -587831330 0.43230486 +1 -335410409 0.5780736 +2 -587831330 0.4279275 3 240273900 0.40199697 4 762932956 0.48515016 -5 593204320 0.5156586 +5 586844478 0.44318348 # Test approx_distinct for varchar / int query III @@ -405,8 +405,8 @@ SELECT c2, approx_median(c3), approx_median(c11) FROM aggregate_test_100_null GR 1 12 0.6067944 2 1 0.46076488 3 14 0.40154034 -4 -7 0.48515016 -5 -39 0.5536642 +4 -38 0.48515016 +5 -40 0.5536642 # Test approx_distinct with nullable fields query II @@ -526,7 +526,7 @@ FROM aggregate_test_100 GROUP BY c2 ORDER BY c2; 1 57 -56 2 52 -60 3 71 -76 -4 65 -64 +4 65 -79 5 64 -59 # Test count with nullable fields and filter @@ -669,10 +669,10 @@ SELECT c2, FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2; ---- 1 -5 0.6623719 -2 12 0.52930677 +2 1 0.52930677 3 13 0.32792538 4 -38 0.49774808 -5 -21 0.47652745 +5 -31 0.44318348 # Test approx_median with nullable fields and nullable filter query II @@ -680,11 +680,11 @@ SELECT c2, approx_median(c3) FILTER (WHERE c11 > 0.5) FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2; ---- -1 35 +1 29 2 -29 3 22 4 -90 -5 -32 +5 -40 statement ok DROP TABLE aggregate_test_100_null; diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt index f3836b23ec321..d1596266f1395 100644 --- a/datafusion/sqllogictest/test_files/metadata.slt +++ b/datafusion/sqllogictest/test_files/metadata.slt @@ -86,7 +86,7 @@ select count(distinct name) from table_with_metadata; query I select approx_median(distinct id) from table_with_metadata; ---- -2 +1 # Regression test: prevent field metadata loss per https://github.com/apache/datafusion/issues/12687 statement ok