Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions datafusion/core/tests/dataframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1206,23 +1206,23 @@ async fn window_using_aggregates() -> Result<()> {
| | | | | | | | 1 | -85 |
| -85 | -101 | 14 | -12 | -12 | 83 | -101 | 4 | -54 |
| -85 | -101 | 17 | -25 | -25 | 83 | -101 | 5 | -31 |
| -85 | -12 | 10 | -32 | -34 | 83 | -85 | 3 | 13 |
| -85 | -12 | 10 | -43 | -34 | 83 | -85 | 3 | 13 |
| -85 | -25 | 3 | -56 | -56 | -25 | -85 | 1 | -5 |
| -85 | -31 | 18 | -29 | -28 | 83 | -101 | 5 | 36 |
| -85 | -31 | 18 | -31 | -28 | 83 | -101 | 5 | 36 |
| -85 | -38 | 16 | -25 | -25 | 83 | -101 | 4 | 65 |
| -85 | -43 | 7 | -43 | -43 | 83 | -85 | 2 | 45 |
| -85 | -48 | 6 | -35 | -36 | 83 | -85 | 2 | -43 |
| -85 | -5 | 4 | -37 | -40 | -5 | -85 | 1 | 83 |
| -85 | -54 | 15 | -17 | -18 | 83 | -101 | 4 | -38 |
| -85 | -56 | 2 | -70 | -70 | -56 | -85 | 1 | -25 |
| -85 | -48 | 6 | -48 | -36 | 83 | -85 | 2 | -43 |
| -85 | -5 | 4 | -56 | -40 | -5 | -85 | 1 | 83 |
| -85 | -54 | 15 | -25 | -18 | 83 | -101 | 4 | -38 |
| -85 | -56 | 2 | -85 | -70 | -56 | -85 | 1 | -25 |
| -85 | -72 | 9 | -43 | -43 | 83 | -85 | 3 | -12 |
| -85 | -85 | 1 | -85 | -85 | -85 | -85 | 1 | -56 |
| -85 | 13 | 11 | -17 | -18 | 83 | -85 | 3 | 14 |
| -85 | 13 | 11 | -25 | -18 | 83 | -85 | 3 | 14 |
| -85 | 13 | 11 | -25 | -25 | 83 | -85 | 3 | 13 |
| -85 | 14 | 12 | -12 | -12 | 83 | -85 | 3 | 17 |
| -85 | 17 | 13 | -11 | -8 | 83 | -85 | 4 | -101 |
| -85 | 45 | 8 | -34 | -34 | 83 | -85 | 3 | -72 |
| -85 | 65 | 17 | -17 | -18 | 83 | -101 | 5 | -101 |
| -85 | 17 | 13 | -12 | -8 | 83 | -85 | 4 | -101 |
| -85 | 45 | 8 | -43 | -34 | 83 | -85 | 3 | -72 |
| -85 | 65 | 17 | -25 | -18 | 83 | -101 | 5 | -101 |
| -85 | 83 | 5 | -25 | -25 | 83 | -85 | 2 | -48 |
+-------------+----------+-----------------+---------------+--------+-----+------+----+------+
"
Expand Down
18 changes: 18 additions & 0 deletions datafusion/functions-aggregate-common/src/tdigest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,10 @@ impl TDigest {
return 0.0;
}

// No compression happened since each centroid = one data point, so use exact percentile instead of interpolation
if self.count as usize == self.centroids.len() {
return self.exact_quantile(q);
}
let rank = q * self.count;

let mut pos: usize;
Expand Down Expand Up @@ -509,6 +513,20 @@ impl TDigest {
Self::clamp(value, min, max)
}

fn exact_quantile(&self, q: f64) -> f64 {
if q <= 0.0 {
return self.min();
}
if q >= 1.0 {
return self.max();
}

let n = self.centroids.len();
let idx = (q * n as f64).ceil() as usize;
let idx = idx.saturating_sub(1).min(n - 1);
self.centroids[idx].mean()
}

/// This method decomposes the [`TDigest`] and its [`Centroid`] instances
/// into a series of primitive scalar values.
///
Expand Down
76 changes: 38 additions & 38 deletions datafusion/sqllogictest/test_files/aggregate.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1351,7 +1351,7 @@ DROP TABLE median_window_test;
query RT
select approx_median(arrow_cast(col_f32, 'Float16')), arrow_typeof(approx_median(arrow_cast(col_f32, 'Float16'))) from median_table;
----
2.75 Float16
2.1992188 Float16

# This shouldn't be NaN, see:
# https://github.com/apache/datafusion/issues/18945
Expand All @@ -1369,7 +1369,7 @@ select
arrow_typeof(approx_percentile_cont(0.5) within group (order by arrow_cast(col_f32, 'Float16')))
from median_table;
----
2.75 Float16
2.1992188 Float16

query ?T
select approx_median(NULL), arrow_typeof(approx_median(NULL)) from median_table;
Expand All @@ -1388,7 +1388,7 @@ select median(c), arrow_typeof(median(c)) from t;
query RT
select approx_median(c), arrow_typeof(approx_median(c)) from t;
----
0.00035 Float64
0.0003 Float64

statement ok
drop table t;
Expand Down Expand Up @@ -2319,33 +2319,33 @@ b NULL NULL 7732.315789473684
query TI
SELECT c1, approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
----
a 73
a 65
b 68
c 122
d 124
e 115
c 118
d 125
e 112


# csv_query_approx_percentile_cont_with_weight (should be the same as above)
query TI
SELECT c1, approx_percentile_cont(c3, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
----
a 73
a 65
b 68
c 122
d 124
e 115
c 118
d 125
e 112


# using approx_percentile_cont on 2 columns with same signature
query TII
SELECT c1, approx_percentile_cont(c2, 0.95) AS c2, approx_percentile_cont(c3, 0.95) AS c3 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
----
a 5 73
a 5 65
b 5 68
c 5 122
d 5 124
e 5 115
c 5 118
d 5 125
e 5 112

# error is unique to this UDAF
query TRR
Expand All @@ -2363,50 +2363,50 @@ query TI
SELECT c1, approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c3 DESC) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
----
a -101
b -114
c -109
d -98
e -93
b -117
c -107
d -99
e -86

# csv_query_approx_percentile_cont_with_weight (2)
query TI
SELECT c1, approx_percentile_cont_with_weight(1, 0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
----
a 73
a 65
b 68
c 122
d 124
e 115
c 118
d 125
e 112

# csv_query_approx_percentile_cont_with_weight alternate syntax
query TI
SELECT c1, approx_percentile_cont_with_weight(c3, 1, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
----
a 73
a 65
b 68
c 122
d 124
e 115
c 118
d 125
e 112


query TI
SELECT c1, approx_percentile_cont_with_weight(1, 0.95) WITHIN GROUP (ORDER BY c3 DESC) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
----
a -101
b -114
c -109
d -98
e -93
b -117
c -107
d -99
e -86

# csv_query_approx_percentile_cont_with_histogram_bins
query TI
SELECT c1, approx_percentile_cont(0.95, 200) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
----
a 73
a 65
b 68
c 122
d 124
e 115
c 118
d 125
e 112

query TI
SELECT c1, approx_percentile_cont_with_weight(c2, 0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
Expand Down Expand Up @@ -8804,9 +8804,9 @@ FROM (SELECT * FROM stream_test ORDER BY g LIMIT 10000)
GROUP BY g
ORDER BY g;
----
1 2 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5
2 2 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5
3 2 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5
1 2 1 1.5 1.5 1 1 1.5 1 1
2 2 1 1.5 1.5 1 1 1.5 1 1
3 2 1 1.5 1.5 1 1 1.5 1 1

# Config reset

Expand Down
20 changes: 10 additions & 10 deletions datafusion/sqllogictest/test_files/aggregate_skip_partial.slt
Original file line number Diff line number Diff line change
Expand Up @@ -321,11 +321,11 @@ SELECT c2, median(c5), median(c11) FROM aggregate_test_100 GROUP BY c2 ORDER BY
query IIR
SELECT c2, approx_median(c5), approx_median(c11) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2;
----
1 191655437 0.59926736
2 -587831330 0.43230486
1 -335410409 0.5780736
2 -587831330 0.4279275
3 240273900 0.40199697
4 762932956 0.48515016
5 593204320 0.5156586
5 586844478 0.44318348

# Test approx_distinct for varchar / int
query III
Expand Down Expand Up @@ -405,8 +405,8 @@ SELECT c2, approx_median(c3), approx_median(c11) FROM aggregate_test_100_null GR
1 12 0.6067944
2 1 0.46076488
3 14 0.40154034
4 -7 0.48515016
5 -39 0.5536642
4 -38 0.48515016
5 -40 0.5536642

# Test approx_distinct with nullable fields
query II
Expand Down Expand Up @@ -526,7 +526,7 @@ FROM aggregate_test_100 GROUP BY c2 ORDER BY c2;
1 57 -56
2 52 -60
3 71 -76
4 65 -64
4 65 -79
5 64 -59

# Test count with nullable fields and filter
Expand Down Expand Up @@ -669,22 +669,22 @@ SELECT c2,
FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2;
----
1 -5 0.6623719
2 12 0.52930677
2 1 0.52930677
3 13 0.32792538
4 -38 0.49774808
5 -21 0.47652745
5 -31 0.44318348

# Test approx_median with nullable fields and nullable filter
query II
SELECT c2,
approx_median(c3) FILTER (WHERE c11 > 0.5)
FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2;
----
1 35
1 29
2 -29
3 22
4 -90
5 -32
5 -40

statement ok
DROP TABLE aggregate_test_100_null;
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/metadata.slt
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ select count(distinct name) from table_with_metadata;
query I
select approx_median(distinct id) from table_with_metadata;
----
2
1

# Regression test: prevent field metadata loss per https://github.com/apache/datafusion/issues/12687
statement ok
Expand Down
Loading