diff --git a/datafusion-partitioned/README.md b/datafusion-partitioned/README.md index 503fa565d..a4052a523 100644 --- a/datafusion-partitioned/README.md +++ b/datafusion-partitioned/README.md @@ -1,38 +1,46 @@ # DataFusion -DataFusion is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. For more information, please check +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. For more information, please check + +[Apache DataFusion]: https://arrow.apache.org/datafusion/ +[Apache Arrow]: https://arrow.apache.org/ We use parquet file here and create an external table for it; and then execute the queries. -## Generate benchmark results +## Cookbook: Generate benchmark results The benchmark should be completed in under an hour. On-demand pricing is $0.6 per hour while spot pricing is only $0.2 to $0.3 per hour (us-east-2). -1. manually start a AWS EC2 instance - - `c6a.4xlarge` - - Ubuntu 22.04 or later - - Root 500GB gp2 SSD - - no EBS optimized - - no instance store -1. wait for status check passed, then ssh to EC2 `ssh ubuntu@{ip}` -1. `git clone https://github.com/ClickHouse/ClickBench` -1. `cd ClickBench/datafusion` -1. `vi benchmark.sh` and modify following line to target Datafusion version +1. manually start a AWS EC2 instance, the following environments are included in this dir: + + | Instance Type | OS | Disk | Arch | + | :-----------: | :---------------------: | :----------------: | :---: | + | `c6a.xlarge` | `Ubuntu 24.04` or later | Root 500GB gp2 SSD | AMD64 | + | `c6a.2xlarge` | | | AMD64 | + | `c6a.4xlarge` | | | AMD64 | + | `c8g.4xlarge` | | | ARM64 | + + All with no EBS optimized, no instance store. For `c6a.xlarge` instance, its memory is not capable to compile datafusion. 
It's recommended to enable a 4GB swap with ```sudo fallocate -l 4G /swapfile && sudo chmod 600 /swapfile && sudo mkswap /swapfile && sudo swapon /swapfile```. + +2. wait for status check passed, then ssh to EC2 `ssh ubuntu@{ip}` +3. `git clone https://github.com/ClickHouse/ClickBench` +4. `cd ClickBench/datafusion-partitioned` +5. `vi benchmark.sh` and modify following line to target Datafusion version ```bash git checkout 46.0.0 ``` -1. `bash benchmark.sh` +6. `bash benchmark.sh` +7. Update corresponding `.json` file under `results`, or run `./save-result.sh` with instance type like `./save-result.sh c6a.4xlarge` ### Know Issues 1. importing parquet by `datafusion-cli` doesn't support schema, need to add some casting in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`) 2. importing parquet by `datafusion-cli` make column name column name case-sensitive, i change all column name in queries.sql to double quoted literal (e.g. `EventTime` -> `"EventTime"`) -3. `comparing binary with utf-8` and `group by binary` don't work in mac, if you run these queries in mac, you'll get some errors for queries contain binary format apache/arrow-datafusion#3050 ## Generate full human readable results (for debugging) 1. install datafusion-cli -2. download the parquet ```wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/hits.parquet``` -3. execute it ```datafusion-cli -f create_single.sql queries.sql``` or ```bash run2.sh``` +2. download the parquet ```seq 0 99 | xargs -P100 -I{} bash -c 'wget --directory-prefix partitioned --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet'``` +3. 
execute it ```datafusion-cli -f create_single.sql queries.sql``` or ```PATH="$(pwd)/datafusion/target/release:$PATH" ./run.sh``` diff --git a/datafusion-partitioned/benchmark.sh b/datafusion-partitioned/benchmark.sh index 1c10a401d..8042c7119 100755 --- a/datafusion-partitioned/benchmark.sh +++ b/datafusion-partitioned/benchmark.sh @@ -11,10 +11,12 @@ sudo apt-get update -y sudo apt-get install -y gcc echo "Install DataFusion main branch" -git clone https://github.com/apache/arrow-datafusion.git -cd arrow-datafusion/ -git checkout 47.0.0 +git clone https://github.com/apache/datafusion.git +cd datafusion/ +git checkout 52.0.0 +sudo fallocate -l 4G /swapfile && sudo chmod 600 /swapfile && sudo mkswap /swapfile && sudo swapon /swapfile CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --package datafusion-cli --bin datafusion-cli +sudo swapoff /swapfile export PATH="`pwd`/target/release:$PATH" cd .. diff --git a/datafusion-partitioned/results/c6a.2xlarge.json b/datafusion-partitioned/results/c6a.2xlarge.json index 087386f3d..7cc3fe790 100644 --- a/datafusion-partitioned/results/c6a.2xlarge.json +++ b/datafusion-partitioned/results/c6a.2xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, partitioned)", - "date": "2025-07-10", + "date": "2026-01-15", "machine": "c6a.2xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,48 +10,48 @@ "load_time": 0, "data_size": 14737666736, "result": [ - [0.068, 0.022, 0.021], - [0.167, 0.06, 0.059], - [0.362, 0.144, 0.147], - [0.523, 0.109, 0.113], - [1.644, 1.224, 1.334], - [1.719, 1.167, 1.174], - [0.13, 0.037, 0.038], - [0.181, 0.07, 0.065], - [1.803, 1.414, 1.398], - [2.079, 1.591, 1.617], - [0.875, 0.396, 0.381], - [1.016, 0.452, 0.44], - [1.702, 1.216, 1.197], - [3.255, 1.883, 1.93], - [1.629, 1.124, 1.237], - [1.816, 1.529, 1.51], - [3.179, 2.585, 2.593], - [2.891, 2.197, 2.287], - [6.073, 4.78, 4.877], - [0.597, 0.1, 0.101], - [9.674, 1.35, 1.344], - [11.432, 1.673, 1.652], - 
[22.163, 3.015, 3.05], - [55.44, 46.286, 43.371], - [2.831, 0.611, 0.604], - [1.025, 0.535, 0.558], - [2.845, 0.724, 0.724], - [9.733, 2.09, 2.088], - [19.263, 18.559, 18.21], - [0.953, 0.806, 0.774], - [2.548, 1.265, 1.166], - [6.191, 1.162, 1.161], - [5.003, 4.177, 4.193], - [10.349, 4.795, 4.817], - [10.307, 4.831, 4.884], - [2.14, 1.835, 1.843], - [0.352, 0.121, 0.111], - [0.217, 0.056, 0.058], - [0.328, 0.11, 0.109], - [0.47, 0.156, 0.157], - [0.201, 0.05, 0.046], - [0.186, 0.046, 0.046], - [0.174, 0.041, 0.044] -] + [0.052, 0.002, 0.002], + [0.117, 0.040, 0.038], + [0.950, 0.116, 0.111], + [2.713, 0.100, 0.108], + [2.921, 1.162, 1.009], + [3.116, 1.176, 1.047], + [0.055, 0.002, 0.002], + [0.126, 0.041, 0.043], + [3.124, 1.198, 1.194], + [4.286, 1.531, 1.493], + [2.358, 0.276, 0.275], + [2.714, 0.312, 0.290], + [3.249, 1.089, 0.965], + [6.469, 1.600, 1.630], + [3.244, 1.031, 1.036], + [2.522, 1.228, 1.260], + [6.138, 2.155, 2.165], + [6.118, 2.022, 2.108], + [11.294, 4.265, 4.152], + [1.706, 0.091, 0.091], + [20.960, 1.253, 1.267], + [23.958, 1.558, 1.453], + [45.677, 2.494, 2.559], + [108.672, 95.195, 91.845], + [1.474, 0.157, 0.159], + [3.367, 0.327, 0.323], + [1.546, 0.156, 0.155], + [21.312, 1.754, 1.709], + [19.173, 15.870, 15.832], + [0.859, 0.756, 0.750], + [7.448, 0.959, 1.028], + [15.002, 1.040, 1.054], + [11.322, 3.872, 3.830], + [20.749, 4.133, 4.390], + [20.763, 4.043, 4.438], + [1.892, 1.689, 1.658], + [0.170, 0.049, 0.055], + [0.126, 0.037, 0.033], + [0.179, 0.058, 0.058], + [0.464, 0.076, 0.074], + [0.122, 0.020, 0.024], + [0.133, 0.017, 0.021], + [0.094, 0.020, 0.016] + ] } diff --git a/datafusion-partitioned/results/c6a.4xlarge.json b/datafusion-partitioned/results/c6a.4xlarge.json index e6f6f87db..7f29c3b48 100644 --- a/datafusion-partitioned/results/c6a.4xlarge.json +++ b/datafusion-partitioned/results/c6a.4xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, partitioned)", - "date": "2025-07-10", + "date": "2026-01-15", "machine": 
"c6a.4xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,48 +10,48 @@ "load_time": 0, "data_size": 14737666736, "result": [ - [0.058, 0.017, 0.015], - [0.116, 0.035, 0.037], - [0.2, 0.084, 0.088], - [0.43, 0.081, 0.084], - [1.086, 0.78, 0.799], - [0.977, 0.751, 0.756], - [0.086, 0.026, 0.026], - [0.125, 0.04, 0.037], - [1.011, 0.882, 0.862], - [1.349, 0.971, 0.983], - [0.565, 0.231, 0.24], - [0.677, 0.264, 0.265], - [1.062, 0.816, 0.82], - [2.769, 1.346, 1.201], - [1.135, 0.792, 0.78], - [1.021, 0.926, 0.916], - [2.638, 1.639, 1.63], - [2.585, 1.555, 1.592], - [5.159, 3.238, 3.24], - [0.26, 0.077, 0.077], - [10.045, 1.067, 1.082], - [11.424, 1.291, 1.269], - [22.117, 2.487, 2.511], - [55.492, 9.765, 9.851], - [2.825, 0.432, 0.423], - [0.853, 0.328, 0.33], - [2.837, 0.508, 0.504], - [9.744, 1.469, 1.478], - [9.444, 9.445, 9.475], - [0.515, 0.405, 0.415], - [2.433, 0.729, 0.735], - [6.158, 0.884, 0.891], - [4.608, 3.342, 3.281], - [10.221, 3.481, 3.455], - [10.145, 3.486, 3.46], - [1.261, 1.188, 1.168], - [0.309, 0.114, 0.114], - [0.175, 0.05, 0.048], - [0.313, 0.099, 0.117], - [0.451, 0.166, 0.192], - [0.183, 0.04, 0.043], - [0.171, 0.04, 0.041], - [0.143, 0.035, 0.037] -] + [0.042, 0.002, 0.002], + [0.082, 0.024, 0.023], + [0.177, 0.068, 0.064], + [0.615, 0.076, 0.073], + [1.198, 0.703, 0.718], + [1.059, 0.727, 0.723], + [0.054, 0.002, 0.002], + [0.100, 0.025, 0.026], + [0.996, 0.824, 0.840], + [1.713, 0.942, 0.981], + [0.632, 0.193, 0.192], + [0.849, 0.228, 0.220], + [1.156, 0.736, 0.745], + [2.658, 1.245, 1.244], + [1.188, 0.753, 0.749], + [0.977, 0.810, 0.818], + [2.701, 1.527, 1.521], + [2.655, 1.522, 1.538], + [5.484, 3.126, 3.143], + [0.275, 0.070, 0.065], + [10.288, 0.958, 0.937], + [11.562, 1.139, 1.109], + [22.298, 2.243, 2.250], + [52.816, 8.052, 8.039], + [0.247, 0.115, 0.129], + [1.284, 0.206, 0.208], + [0.481, 0.121, 0.126], + [10.408, 1.285, 1.342], + [9.295, 8.614, 8.565], + [0.487, 0.401, 0.401], + [3.186, 0.721, 0.691], + [6.936, 0.867, 
0.894], + [5.055, 3.304, 3.237], + [10.231, 3.302, 3.297], + [10.289, 3.304, 3.270], + [1.182, 1.097, 1.115], + [0.158, 0.058, 0.054], + [0.112, 0.033, 0.035], + [0.161, 0.057, 0.054], + [0.224, 0.088, 0.086], + [0.093, 0.021, 0.024], + [0.092, 0.018, 0.018], + [0.090, 0.016, 0.016] + ] } diff --git a/datafusion-partitioned/results/c6a.xlarge.json b/datafusion-partitioned/results/c6a.xlarge.json index b5fdbd81d..1358f4c75 100644 --- a/datafusion-partitioned/results/c6a.xlarge.json +++ b/datafusion-partitioned/results/c6a.xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, partitioned)", - "date": "2025-07-11", + "date": "2026-01-15", "machine": "c6a.xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,48 +10,48 @@ "load_time": 0, "data_size": 14737666736, "result": [ - [0.075, 0.035, 0.034], - [0.209, 0.105, 0.107], - [0.558, 0.278, 0.281], - [0.681, 0.201, 0.209], - [3.153, 2.413, 2.399], - [2.628, 2.299, 2.034], - [0.155, 0.064, 0.065], - [0.244, 0.143, 0.137], - [3.546, 2.918, 2.963], - [4.135, 3.296, 3.367], - [1.376, 0.779, 0.817], - [1.548, 1.001, 0.951], - [2.942, 2.662, 2.272], - [4.581, 3.397, 3.699], - [2.802, 2.287, 2.28], - [3.964, 3.285, 3.753], - [5.96, 5.313, 5.198], - [4.913, 4.098, 4.001], + [0.050, 0.002, 0.002], + [0.155, 0.070, 0.069], + [0.916, 0.211, 0.210], + [2.559, 0.176, 0.177], + [3.135, 1.786, 1.855], + [3.332, 1.705, 1.709], + [0.053, 0.002, 0.002], + [0.165, 0.073, 0.073], + [3.476, 2.107, 2.106], + [4.591, 2.450, 2.461], + [2.405, 0.485, 0.461], + [2.598, 0.534, 0.576], + [3.340, 1.444, 1.455], + [6.839, 2.004, 2.061], + [3.427, 1.412, 1.403], + [2.831, 1.924, 1.911], + [6.857, 3.741, 3.456], + [6.659, 3.394, 3.398], [null, null, null], - [0.697, 0.169, 0.17], - [9.898, 2.361, 2.249], - [11.36, 3.659, 3.492], - [22.105, 17.643, 16.388], - [56.066, 49.612, 48.044], - [2.824, 1.274, 1.265], - [1.471, 1.07, 1.149], - [2.855, 1.477, 1.477], - [9.621, 4.491, 4.587], - [42.151, 40.396, 40.48], - [1.704, 1.498, 1.511], - [3.412, 
2.41, 2.46], - [6.256, 2.544, 2.367], + [1.927, 0.151, 0.150], + [20.884, 2.116, 2.132], + [23.982, 2.600, 2.573], + [45.662, 34.659, 32.172], + [111.062, 100.587, 94.511], + [1.224, 0.198, 0.193], + [3.302, 0.520, 0.519], + [1.246, 0.218, 0.213], + [21.202, 2.943, 2.926], + [30.980, 29.455, 29.697], + [1.574, 1.418, 1.408], + [7.496, 1.788, 1.786], + [15.047, 1.533, 1.524], [null, null, null], - [null, null, 22.127], - [21.955, null, null], - [4.232, 4.072, 3.842], - [0.329, 0.121, 0.134], - [0.201, 0.073, 0.076], - [0.321, 0.129, 0.128], - [0.479, 0.214, 0.185], - [0.183, 0.064, 0.065], - [0.18, 0.07, 0.067], - [0.159, 0.061, 0.059] -] + [null, null, null], + [null, null, null], + [2.984, 2.515, 2.492], + [0.169, 0.051, 0.063], + [0.116, 0.034, 0.034], + [0.170, 0.055, 0.054], + [0.591, 0.077, 0.102], + [0.132, 0.027, 0.022], + [0.144, 0.025, 0.025], + [0.097, 0.018, 0.018] + ] } diff --git a/datafusion-partitioned/results/c8g.4xlarge.json b/datafusion-partitioned/results/c8g.4xlarge.json index 7a1b85655..e71c15353 100644 --- a/datafusion-partitioned/results/c8g.4xlarge.json +++ b/datafusion-partitioned/results/c8g.4xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, partitioned)", - "date": "2025-07-12", + "date": "2026-01-15", "machine": "c8g.4xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,48 +10,48 @@ "load_time": 0, "data_size": 14737666736, "result": [ - [0.055, 0.011, 0.011], - [0.105, 0.02, 0.02], - [0.199, 0.051, 0.051], - [0.432, 0.052, 0.053], - [1.007, 0.434, 0.385], - [0.921, 0.421, 0.421], - [0.083, 0.019, 0.019], - [0.115, 0.021, 0.023], - [0.763, 0.406, 0.398], - [1.276, 0.622, 0.59], - [0.548, 0.156, 0.147], - [0.867, 0.166, 0.177], - [1.161, 0.473, 0.461], - [2.485, 0.625, 0.561], - [1.018, 0.435, 0.452], - [0.604, 0.441, 0.418], - [2.378, 0.734, 0.74], - [2.348, 0.67, 0.661], - [4.44, 1.37, 1.367], - [0.287, 0.043, 0.046], - [10.103, 0.613, 0.621], - [11.384, 0.693, 0.689], - [22.081, 1.158, 1.173], - [55.487, 3.748, 3.699], - 
[2.837, 0.248, 0.27], - [0.846, 0.212, 0.217], - [2.917, 0.276, 0.276], - [9.713, 0.81, 0.824], - [9.263, 7.386, 7.351], - [0.474, 0.363, 0.377], - [2.404, 0.377, 0.383], - [6.132, 0.381, 0.392], - [4.446, 1.121, 1.123], - [9.862, 1.529, 1.503], - [9.915, 1.818, 1.754], - [0.754, 0.559, 0.556], - [0.278, 0.097, 0.098], - [0.155, 0.04, 0.041], - [0.27, 0.098, 0.096], - [0.41, 0.15, 0.157], - [0.164, 0.031, 0.034], - [0.152, 0.03, 0.03], - [0.132, 0.026, 0.026] -] + [0.038, 0.002, 0.002], + [0.066, 0.015, 0.018], + [0.151, 0.046, 0.047], + [0.824, 0.047, 0.042], + [1.210, 0.321, 0.315], + [0.994, 0.453, 0.461], + [0.044, 0.002, 0.002], + [0.072, 0.017, 0.018], + [0.839, 0.356, 0.448], + [1.648, 0.709, 0.688], + [0.619, 0.115, 0.119], + [1.110, 0.130, 0.131], + [1.365, 0.438, 0.450], + [2.495, 0.574, 0.581], + [1.077, 0.394, 0.396], + [0.639, 0.343, 0.356], + [2.458, 0.682, 0.646], + [2.440, 0.645, 0.644], + [4.804, 1.219, 1.233], + [0.277, 0.041, 0.040], + [10.332, 0.636, 0.638], + [11.491, 0.746, 0.687], + [22.285, 1.103, 1.076], + [53.877, 2.915, 2.906], + [0.239, 0.090, 0.097], + [1.400, 0.152, 0.147], + [0.575, 0.095, 0.095], + [10.501, 0.725, 0.743], + [8.956, 6.767, 6.827], + [0.431, 0.343, 0.346], + [3.119, 0.394, 0.388], + [6.932, 0.392, 0.413], + [4.933, 1.143, 0.994], + [9.930, 1.425, 1.553], + [9.941, 1.592, 1.396], + [0.669, 0.503, 0.588], + [0.143, 0.051, 0.051], + [0.099, 0.033, 0.032], + [0.143, 0.051, 0.052], + [0.218, 0.085, 0.087], + [0.079, 0.018, 0.016], + [0.077, 0.015, 0.015], + [0.077, 0.014, 0.014] + ] } diff --git a/datafusion-partitioned/save-result.sh b/datafusion-partitioned/save-result.sh new file mode 100755 index 000000000..9b38ce9f9 --- /dev/null +++ b/datafusion-partitioned/save-result.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# This scripts converts the raw results.csv data from `benchmark.sh` into a the +# final json format used by the benchmark dashboard. 
+# +# usage : ./save-result.sh +# +# example (save results/c6a.4xlarge.json) +# ./save-result.sh c6a.4xlarge + +MACHINE=$1 +OUTPUT_FILE="results/${MACHINE}.json" +SYSTEM_NAME="DataFusion (Parquet, single)" +DATE=$(date +%Y-%m-%d) + + +# Read the CSV and build the result array using awk +RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf " ["arr[i]"]"; if (i $OUTPUT_FILE +{ + "system": "$SYSTEM_NAME", + "date": "$DATE", + "machine": "$MACHINE", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "hardware": "cpu", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14779976446, + "result": [ + $RESULT_ARRAY + ] +} +EOF \ No newline at end of file diff --git a/datafusion/README.md b/datafusion/README.md index 503fa565d..f8f52ccd0 100644 --- a/datafusion/README.md +++ b/datafusion/README.md @@ -1,38 +1,46 @@ # DataFusion -DataFusion is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. For more information, please check +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. For more information, please check + +[Apache DataFusion]: https://arrow.apache.org/datafusion/ +[Apache Arrow]: https://arrow.apache.org/ We use parquet file here and create an external table for it; and then execute the queries. -## Generate benchmark results +## Cookbook: Generate benchmark results The benchmark should be completed in under an hour. On-demand pricing is $0.6 per hour while spot pricing is only $0.2 to $0.3 per hour (us-east-2). -1. manually start a AWS EC2 instance - - `c6a.4xlarge` - - Ubuntu 22.04 or later - - Root 500GB gp2 SSD - - no EBS optimized - - no instance store -1. wait for status check passed, then ssh to EC2 `ssh ubuntu@{ip}` -1. `git clone https://github.com/ClickHouse/ClickBench` -1. `cd ClickBench/datafusion` -1. 
`vi benchmark.sh` and modify following line to target Datafusion version +1. manually start a AWS EC2 instance, the following environments are included in this dir: + + | Instance Type | OS | Disk | Arch | + | :-----------: | :---------------------: | :----------------: | :---: | + | `c6a.xlarge` | `Ubuntu 24.04` or later | Root 500GB gp2 SSD | AMD64 | + | `c6a.2xlarge` | | | AMD64 | + | `c6a.4xlarge` | | | AMD64 | + | `c8g.4xlarge` | | | ARM64 | + + All with no EBS optimized, no instance store. For `c6a.xlarge` instance, its memory is not capable to compile datafusion. It's recommended to enable a 4GB swap with ```sudo fallocate -l 4G /swapfile && sudo chmod 600 /swapfile && sudo mkswap /swapfile && sudo swapon /swapfile```. + +2. wait for status check passed, then ssh to EC2 `ssh ubuntu@{ip}` +3. `git clone https://github.com/ClickHouse/ClickBench` +4. `cd ClickBench/datafusion` +5. `vi benchmark.sh` and modify following line to target Datafusion version ```bash git checkout 46.0.0 ``` -1. `bash benchmark.sh` +6. `bash benchmark.sh` +7. Update corresponding `.json` file under `results`, or run `./save-result.sh` with instance type like `./save-result.sh c6a.4xlarge` ### Know Issues 1. importing parquet by `datafusion-cli` doesn't support schema, need to add some casting in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`) 2. importing parquet by `datafusion-cli` make column name column name case-sensitive, i change all column name in queries.sql to double quoted literal (e.g. `EventTime` -> `"EventTime"`) -3. `comparing binary with utf-8` and `group by binary` don't work in mac, if you run these queries in mac, you'll get some errors for queries contain binary format apache/arrow-datafusion#3050 ## Generate full human readable results (for debugging) 1. install datafusion-cli 2. download the parquet ```wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/hits.parquet``` -3. 
execute it ```datafusion-cli -f create_single.sql queries.sql``` or ```bash run2.sh``` +3. execute it ```datafusion-cli -f create_single.sql queries.sql``` or ```PATH="$(pwd)/datafusion/target/release:$PATH" ./run.sh``` diff --git a/datafusion/benchmark.sh b/datafusion/benchmark.sh index c16368cd3..e5af2502f 100755 --- a/datafusion/benchmark.sh +++ b/datafusion/benchmark.sh @@ -11,10 +11,12 @@ sudo apt-get update -y sudo apt-get install -y gcc echo "Install DataFusion main branch" -git clone https://github.com/apache/arrow-datafusion.git -cd arrow-datafusion/ -git checkout 47.0.0 +git clone https://github.com/apache/datafusion.git +cd datafusion/ +git checkout 52.0.0 +sudo fallocate -l 4G /swapfile && sudo chmod 600 /swapfile && sudo mkswap /swapfile && sudo swapon /swapfile CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --package datafusion-cli --bin datafusion-cli +sudo swapoff /swapfile export PATH="`pwd`/target/release:$PATH" cd .. diff --git a/datafusion/results/c6a.2xlarge.json b/datafusion/results/c6a.2xlarge.json index 9d6b4f1bf..868ab65e7 100644 --- a/datafusion/results/c6a.2xlarge.json +++ b/datafusion/results/c6a.2xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, single)", - "date": "2025-07-10", + "date": "2026-01-15", "machine": "c6a.2xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,48 +10,48 @@ "load_time": 0, "data_size": 14779976446, "result": [ - [0.096, 0.048, 0.043], - [0.178, 0.089, 0.088], - [0.324, 0.172, 0.172], - [0.447, 0.143, 0.136], - [1.515, 1.336, 1.345], - [1.361, 1.151, 1.143], - [0.116, 0.057, 0.057], - [0.183, 0.099, 0.096], - [1.692, 1.451, 1.46], - [2.024, 1.665, 1.665], - [0.746, 0.43, 0.429], - [0.812, 0.495, 0.485], - [1.384, 1.185, 1.184], - [2.992, 1.937, 1.907], - [1.338, 1.144, 1.144], - [1.838, 1.6, 1.632], - [2.964, 2.522, 2.548], - [2.805, 2.233, 2.239], - [5.649, 4.744, 4.665], - [0.307, 0.126, 0.132], - [9.886, 1.33, 1.335], - [11.311, 1.772, 1.838], - [22.224, 
3.518, 3.578], - [55.96, 46.554, 44.205], - [2.743, 0.668, 0.696], - [0.865, 0.585, 0.558], - [2.746, 0.77, 0.745], - [9.641, 2.196, 2.201], - [20.636, 19.789, 20.073], - [0.919, 0.806, 0.765], - [2.445, 1.223, 1.24], - [5.895, 1.209, 1.227], - [4.797, 4.257, 4.311], - [10.619, 4.901, 4.848], - [10.629, 4.829, 4.871], - [2.026, 1.864, 1.826], - [0.308, 0.139, 0.139], - [0.219, 0.09, 0.082], - [0.315, 0.151, 0.143], - [0.422, 0.204, 0.206], - [0.186, 0.073, 0.089], - [0.188, 0.078, 0.074], - [0.181, 0.071, 0.072] -] + [0.050, 0.001, 0.001], + [0.149, 0.042, 0.041], + [0.269, 0.109, 0.110], + [2.133, 0.097, 0.095], + [2.676, 1.070, 1.062], + [2.654, 1.169, 1.165], + [0.057, 0.001, 0.001], + [0.128, 0.044, 0.043], + [2.534, 1.254, 1.249], + [3.522, 1.523, 1.541], + [1.896, 0.281, 0.261], + [2.310, 0.297, 0.304], + [2.892, 1.047, 1.044], + [5.834, 1.635, 1.658], + [2.803, 1.028, 1.010], + [2.094, 1.179, 1.165], + [5.568, 2.150, 2.158], + [5.536, 2.269, 2.132], + [10.088, 4.027, 3.934], + [1.363, 0.089, 0.088], + [20.710, 1.404, 1.413], + [23.415, 1.857, 1.786], + [45.164, 5.607, 5.581], + [112.225, 101.388, 95.682], + [6.284, 0.640, 0.660], + [2.539, 0.523, 0.514], + [6.259, 0.668, 0.672], + [20.368, 1.882, 1.852], + [20.198, 17.316, 17.465], + [0.847, 0.755, 0.775], + [6.528, 1.201, 1.193], + [13.633, 1.253, 1.172], + [10.277, 3.849, 3.866], + [20.523, 4.286, 4.279], + [20.545, 4.214, 4.205], + [1.729, 1.540, 1.556], + [0.265, 0.118, 0.110], + [0.190, 0.098, 0.095], + [0.289, 0.107, 0.107], + [0.461, 0.179, 0.189], + [0.159, 0.037, 0.038], + [0.143, 0.037, 0.036], + [0.141, 0.032, 0.030] + ] } diff --git a/datafusion/results/c6a.4xlarge.json b/datafusion/results/c6a.4xlarge.json index cab20d01f..45cc9abd0 100644 --- a/datafusion/results/c6a.4xlarge.json +++ b/datafusion/results/c6a.4xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, single)", - "date": "2025-07-10", + "date": "2026-01-15", "machine": "c6a.4xlarge", "cluster_size": 1, "proprietary": "no", @@ 
-10,48 +10,48 @@ "load_time": 0, "data_size": 14779976446, "result": [ - [0.098, 0.059, 0.062], - [0.149, 0.092, 0.091], - [0.224, 0.12, 0.126], - [0.409, 0.12, 0.117], - [1.089, 0.826, 0.857], - [0.947, 0.791, 0.776], - [0.114, 0.074, 0.061], - [0.173, 0.098, 0.096], - [1.072, 0.913, 0.897], - [1.306, 1.078, 1.033], - [0.54, 0.294, 0.29], - [0.643, 0.315, 0.305], - [0.963, 0.831, 0.84], - [2.786, 1.247, 1.399], - [1.047, 0.822, 0.809], - [1.081, 1.019, 0.978], - [2.588, 1.683, 1.68], - [2.585, 1.629, 1.625], - [5.078, 3.227, 3.265], - [0.282, 0.139, 0.13], - [9.925, 1.079, 1.078], - [11.375, 1.302, 1.324], - [22.24, 2.678, 2.725], - [55.848, 10.042, 10.348], - [2.701, 0.49, 0.485], - [0.859, 0.381, 0.393], - [2.701, 0.535, 0.552], - [9.697, 1.644, 1.66], - [10.333, 9.847, 9.703], - [0.537, 0.458, 0.427], - [2.354, 0.829, 0.816], - [5.894, 1.001, 0.991], - [4.426, 3.514, 3.5], - [10.153, 3.738, 3.713], - [10.134, 3.712, 3.729], - [1.347, 1.2, 1.248], - [0.362, 0.178, 0.18], - [0.23, 0.114, 0.133], - [0.357, 0.182, 0.199], - [0.511, 0.247, 0.246], - [0.212, 0.098, 0.101], - [0.215, 0.102, 0.113], - [0.203, 0.1, 0.093] -] + [0.062, 0.001, 0.001], + [0.122, 0.038, 0.039], + [0.188, 0.065, 0.063], + [0.393, 0.072, 0.069], + [1.101, 0.737, 0.713], + [0.974, 0.800, 0.787], + [0.068, 0.001, 0.001], + [0.134, 0.041, 0.040], + [1.026, 0.867, 0.857], + [1.353, 0.962, 0.995], + [0.481, 0.206, 0.206], + [0.703, 0.215, 0.217], + [1.031, 0.820, 0.815], + [2.488, 1.180, 1.183], + [1.023, 0.797, 0.795], + [0.984, 0.828, 0.827], + [2.592, 1.576, 1.591], + [2.551, 1.567, 1.575], + [5.086, 3.134, 3.124], + [0.251, 0.080, 0.079], + [10.079, 0.985, 0.996], + [11.328, 1.197, 1.234], + [22.336, 3.132, 3.103], + [55.832, 9.891, 9.749], + [2.685, 0.439, 0.442], + [0.818, 0.340, 0.351], + [2.703, 0.444, 0.443], + [9.786, 1.215, 1.241], + [9.912, 9.087, 9.131], + [0.500, 0.396, 0.394], + [2.858, 0.795, 0.810], + [6.345, 0.925, 0.927], + [4.690, 3.265, 3.560], + [10.113, 3.361, 3.382], + 
[10.116, 3.342, 3.345], + [1.289, 1.149, 1.078], + [0.316, 0.122, 0.122], + [0.218, 0.123, 0.124], + [0.305, 0.120, 0.120], + [0.480, 0.191, 0.192], + [0.165, 0.051, 0.050], + [0.152, 0.048, 0.048], + [0.149, 0.043, 0.043] + ] } diff --git a/datafusion/results/c6a.xlarge.json b/datafusion/results/c6a.xlarge.json index c48aaf779..3f4cf4a29 100644 --- a/datafusion/results/c6a.xlarge.json +++ b/datafusion/results/c6a.xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, single)", - "date": "2025-07-11", + "date": "2026-01-15", "machine": "c6a.xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,48 +10,48 @@ "load_time": 0, "data_size": 14779976446, "result": [ - [0.091, 0.048, 0.047], - [0.228, 0.119, 0.121], - [0.502, 0.279, 0.279], - [0.597, 0.2, 0.199], - [2.614, 2.278, 2.321], - [2.436, 2.056, 2.039], - [0.15, 0.073, 0.072], - [0.255, 0.151, 0.151], - [3.156, 2.83, 2.87], - [3.723, 3.186, 3.08], - [1.225, 0.803, 0.804], - [1.361, 0.886, 0.914], - [2.662, 2.331, 2.319], - [4.128, 3.6, 3.609], - [2.618, 2.237, 2.208], - [3.459, 3.165, 3.153], - [5.596, 5.059, 5.119], - [4.571, 3.898, 3.928], - [10.933, null, 19.109], - [0.55, 0.18, 0.178], - [9.608, 2.416, 2.429], - [11.302, 3.716, 3.757], - [22.33, 15.919, 13.787], - [55.818, 47.55, 46.996], - [2.703, 1.31, 1.318], - [1.527, 1.115, 1.092], - [2.731, 1.51, 1.479], - [9.664, 4.516, 4.776], - [42.285, 41.141, 41.129], - [1.619, 1.472, 1.451], - [3.121, 2.416, 2.476], - [5.996, 2.345, 2.324], - [9.889, null, 20.018], + [0.061, 0.001, 0.001], + [0.177, 0.062, 0.064], + [0.416, 0.193, 0.194], + [2.082, 0.165, 0.165], + [2.824, 1.699, 1.691], + [2.937, 2.110, 2.101], + [0.050, 0.001, 0.001], + [0.154, 0.066, 0.067], + [2.857, 2.065, 2.047], + [3.543, 2.345, 2.343], + [1.938, 0.436, 0.432], + [2.109, 0.493, 0.492], + [2.954, 1.851, 1.844], + [6.119, 2.711, 2.659], + [3.002, 1.782, 1.787], + [2.437, 1.885, 1.873], + [6.339, 3.711, 3.693], + [6.328, 3.671, 3.670], [null, null, null], + [1.421, 0.147, 0.147], + 
[20.657, 2.639, 2.654], + [23.471, 3.422, 3.431], + [45.366, 35.012, 31.708], + [112.328, 102.982, 96.806], + [6.317, 1.188, 1.194], + [2.552, 0.954, 0.951], + [6.303, 1.200, 1.204], + [20.396, 3.341, 3.356], + [32.125, 30.914, 30.854], + [1.564, 1.403, 1.401], + [6.693, 1.972, 1.970], + [13.640, 1.992, 1.960], [null, null, null], - [3.787, 3.49, 3.558], - [0.34, 0.148, 0.16], - [0.223, 0.097, 0.099], - [0.336, 0.141, 0.149], - [0.501, 0.219, 0.219], - [0.201, 0.074, 0.08], - [0.185, 0.079, 0.09], - [0.172, 0.073, 0.079] -] + [null, null, null], + [null, null, null], + [2.686, 2.433, 2.431], + [0.295, 0.104, 0.105], + [0.209, 0.096, 0.092], + [0.294, 0.104, 0.109], + [0.454, 0.177, 0.175], + [0.149, 0.039, 0.036], + [0.142, 0.039, 0.039], + [0.132, 0.033, 0.033] + ] } diff --git a/datafusion/results/c8g.4xlarge.json b/datafusion/results/c8g.4xlarge.json index 814cb7f77..04be35493 100644 --- a/datafusion/results/c8g.4xlarge.json +++ b/datafusion/results/c8g.4xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, single)", - "date": "2025-07-12", + "date": "2026-01-15", "machine": "c8g.4xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,48 +10,48 @@ "load_time": 0, "data_size": 14779976446, "result": [ - [0.08, 0.044, 0.042], - [0.113, 0.045, 0.052], - [0.167, 0.081, 0.071], - [0.382, 0.064, 0.068], - [1.014, 0.393, 0.413], - [0.838, 0.422, 0.42], - [0.087, 0.05, 0.05], - [0.115, 0.055, 0.055], - [0.751, 0.451, 0.427], - [1.209, 0.645, 0.665], - [0.488, 0.177, 0.171], - [0.858, 0.188, 0.188], - [1.096, 0.47, 0.455], - [2.412, 0.632, 0.656], - [0.968, 0.437, 0.433], - [0.633, 0.44, 0.453], - [2.255, 0.787, 0.802], - [2.221, 0.705, 0.712], - [4.306, 1.397, 1.411], - [0.247, 0.07, 0.068], - [10.071, 0.67, 0.687], - [11.304, 0.767, 0.794], - [22.184, 1.451, 1.388], - [55.753, 3.883, 4.14], - [2.671, 0.276, 0.268], - [0.817, 0.241, 0.242], - [2.775, 0.298, 0.303], - [9.652, 0.924, 0.871], - [10.242, 7.78, 7.539], - [0.477, 0.374, 0.381], - [2.269, 0.404, 
0.43], - [5.833, 0.423, 0.397], - [4.217, 1.133, 1.132], - [9.806, 1.576, 1.613], - [9.73, 1.55, 1.618], - [0.729, 0.62, 0.579], - [0.285, 0.129, 0.129], - [0.177, 0.07, 0.072], - [0.282, 0.141, 0.131], - [0.411, 0.194, 0.198], - [0.159, 0.062, 0.057], - [0.148, 0.055, 0.055], - [0.144, 0.051, 0.06] -] + [0.050, 0.001, 0.001], + [0.083, 0.017, 0.019], + [0.150, 0.046, 0.045], + [0.400, 0.043, 0.038], + [1.093, 0.296, 0.300], + [0.830, 0.417, 0.406], + [0.041, 0.001, 0.001], + [0.070, 0.019, 0.019], + [0.700, 0.401, 0.403], + [1.276, 0.678, 0.693], + [0.450, 0.118, 0.121], + [0.980, 0.135, 0.133], + [1.246, 0.399, 0.420], + [2.249, 0.473, 0.532], + [0.909, 0.392, 0.375], + [0.569, 0.360, 0.334], + [2.188, 0.654, 0.630], + [2.183, 0.617, 0.619], + [4.319, 1.204, 1.262], + [0.194, 0.042, 0.046], + [10.194, 0.644, 0.646], + [11.257, 0.706, 0.716], + [22.174, 1.279, 1.257], + [55.737, 3.583, 3.627], + [2.671, 0.210, 0.213], + [0.959, 0.172, 0.156], + [3.033, 0.219, 0.218], + [9.849, 0.751, 0.745], + [10.612, 7.221, 7.211], + [0.443, 0.342, 0.342], + [2.784, 0.384, 0.367], + [6.308, 0.379, 0.395], + [4.509, 1.099, 1.028], + [9.716, 1.482, 1.447], + [9.757, 1.453, 1.454], + [0.688, 0.584, 0.554], + [0.230, 0.095, 0.096], + [0.122, 0.046, 0.046], + [0.255, 0.097, 0.095], + [0.405, 0.159, 0.159], + [0.122, 0.030, 0.030], + [0.109, 0.031, 0.029], + [0.102, 0.025, 0.026] + ] } diff --git a/datafusion/save-result.sh b/datafusion/save-result.sh new file mode 100755 index 000000000..9b38ce9f9 --- /dev/null +++ b/datafusion/save-result.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# This scripts converts the raw results.csv data from `benchmark.sh` into a the +# final json format used by the benchmark dashboard. 
+# +# usage : ./save-result.sh +# +# example (save results/c6a.4xlarge.json) +# ./save-result.sh c6a.4xlarge + +MACHINE=$1 +OUTPUT_FILE="results/${MACHINE}.json" +SYSTEM_NAME="DataFusion (Parquet, single)" +DATE=$(date +%Y-%m-%d) + + +# Read the CSV and build the result array using awk +RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf " ["arr[i]"]"; if (i $OUTPUT_FILE +{ + "system": "$SYSTEM_NAME", + "date": "$DATE", + "machine": "$MACHINE", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "hardware": "cpu", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14779976446, + "result": [ + $RESULT_ARRAY + ] +} +EOF \ No newline at end of file