@@ -81,9 +81,12 @@ clickbench_1: ClickBench queries against a single parquet file
81
81
clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
82
82
clickbench_extended: ClickBench "inspired" queries against a single parquet (DataFusion specific)
83
83
external_aggr: External aggregation benchmark
84
- h2o_small: h2oai benchmark with small dataset (1e7 rows), default file format is csv
85
- h2o_medium: h2oai benchmark with medium dataset (1e8 rows), default file format is csv
86
- h2o_big: h2oai benchmark with large dataset (1e9 rows), default file format is csv
84
+ h2o_small: h2oai benchmark with small dataset (1e7 rows) for groupby, default file format is csv
85
+ h2o_medium: h2oai benchmark with medium dataset (1e8 rows) for groupby, default file format is csv
86
+ h2o_big: h2oai benchmark with large dataset (1e9 rows) for groupby, default file format is csv
87
+ h2o_small_join: h2oai benchmark with small dataset (1e7 rows) for join, default file format is csv
88
+ h2o_medium_join: h2oai benchmark with medium dataset (1e8 rows) for join, default file format is csv
89
+ h2o_big_join: h2oai benchmark with large dataset (1e9 rows) for join, default file format is csv
87
90
imdb: Join Order Benchmark (JOB) using the IMDB dataset converted to parquet
88
91
89
92
**********
@@ -150,6 +153,9 @@ main() {
150
153
data_h2o " SMALL"
151
154
data_h2o " MEDIUM"
152
155
data_h2o " BIG"
156
+ data_h2o_join " SMALL"
157
+ data_h2o_join " MEDIUM"
158
+ data_h2o_join " BIG"
153
159
data_clickbench_1
154
160
data_clickbench_partitioned
155
161
data_imdb
@@ -189,6 +195,15 @@ main() {
189
195
h2o_big)
190
196
data_h2o " BIG" " CSV"
191
197
;;
198
+ h2o_small_join)
199
+ data_h2o_join " SMALL" " CSV"
200
+ ;;
201
+ h2o_medium_join)
202
+ data_h2o_join " MEDIUM" " CSV"
203
+ ;;
204
+ h2o_big_join)
205
+ data_h2o_join " BIG" " CSV"
206
+ ;;
192
207
external_aggr)
193
208
# same data as for tpch
194
209
data_tpch " 1"
@@ -242,6 +257,9 @@ main() {
242
257
run_h2o " SMALL" " PARQUET" " groupby"
243
258
run_h2o " MEDIUM" " PARQUET" " groupby"
244
259
run_h2o " BIG" " PARQUET" " groupby"
260
+ run_h2o_join " SMALL" " PARQUET" " join"
261
+ run_h2o_join " MEDIUM" " PARQUET" " join"
262
+ run_h2o_join " BIG" " PARQUET" " join"
245
263
run_imdb
246
264
run_external_aggr
247
265
;;
@@ -287,6 +305,15 @@ main() {
287
305
h2o_big)
288
306
run_h2o " BIG" " CSV" " groupby"
289
307
;;
308
+ h2o_small_join)
309
+ run_h2o_join " SMALL" " CSV" " join"
310
+ ;;
311
+ h2o_medium_join)
312
+ run_h2o_join " MEDIUM" " CSV" " join"
313
+ ;;
314
+ h2o_big_join)
315
+ run_h2o_join " BIG" " CSV" " join"
316
+ ;;
290
317
external_aggr)
291
318
run_external_aggr
292
319
;;
@@ -687,7 +714,82 @@ data_h2o() {
687
714
deactivate
688
715
}
689
716
690
- # # todo now only support groupby, after https://github.com/mrpowers-io/falsa/issues/21 done, we can add support for join
717
+ data_h2o_join () {
718
+ # Default values for size and data format
719
+ SIZE=${1:- " SMALL" }
720
+ DATA_FORMAT=${2:- " CSV" }
721
+
722
+ # Function to compare Python versions
723
+ version_ge () {
724
+ [ " $( printf ' %s\n' " $1 " " $2 " | sort -V | head -n1) " = " $2 " ]
725
+ }
726
+
727
+ export PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1
728
+
729
+ # Find the highest available Python version (3.10 or higher)
730
+ REQUIRED_VERSION=" 3.10"
731
+ PYTHON_CMD=$( command -v python3 || true)
732
+
733
+ if [ -n " $PYTHON_CMD " ]; then
734
+ PYTHON_VERSION=$( $PYTHON_CMD -c " import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" )
735
+ if version_ge " $PYTHON_VERSION " " $REQUIRED_VERSION " ; then
736
+ echo " Found Python version $PYTHON_VERSION , which is suitable."
737
+ else
738
+ echo " Python version $PYTHON_VERSION found, but version $REQUIRED_VERSION or higher is required."
739
+ PYTHON_CMD=" "
740
+ fi
741
+ fi
742
+
743
+ # Search for suitable Python versions if the default is unsuitable
744
+ if [ -z " $PYTHON_CMD " ]; then
745
+ # Loop through all available Python3 commands on the system
746
+ for CMD in $( compgen -c | grep -E ' ^python3(\.[0-9]+)?$' ) ; do
747
+ if command -v " $CMD " & > /dev/null; then
748
+ PYTHON_VERSION=$( $CMD -c " import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" )
749
+ if version_ge " $PYTHON_VERSION " " $REQUIRED_VERSION " ; then
750
+ PYTHON_CMD=" $CMD "
751
+ echo " Found suitable Python version: $PYTHON_VERSION ($CMD )"
752
+ break
753
+ fi
754
+ fi
755
+ done
756
+ fi
757
+
758
+ # If no suitable Python version found, exit with an error
759
+ if [ -z " $PYTHON_CMD " ]; then
760
+ echo " Python 3.10 or higher is required. Please install it."
761
+ return 1
762
+ fi
763
+
764
+ echo " Using Python command: $PYTHON_CMD "
765
+
766
+ # Install falsa and other dependencies
767
+ echo " Installing falsa..."
768
+
769
+ # Set virtual environment directory
770
+ VIRTUAL_ENV=" ${PWD} /venv"
771
+
772
+ # Create a virtual environment using the detected Python command
773
+ $PYTHON_CMD -m venv " $VIRTUAL_ENV "
774
+
775
+ # Activate the virtual environment and install dependencies
776
+ source " $VIRTUAL_ENV /bin/activate"
777
+
778
+ # Ensure 'falsa' is installed (avoid unnecessary reinstall)
779
+ pip install --quiet --upgrade falsa
780
+
781
+ # Create directory if it doesn't exist
782
+ H2O_DIR=" ${DATA_DIR} /h2o"
783
+ mkdir -p " ${H2O_DIR} "
784
+
785
+ # Generate h2o test data
786
+ echo " Generating h2o test data in ${H2O_DIR} with size=${SIZE} and format=${DATA_FORMAT} "
787
+ falsa join --path-prefix=" ${H2O_DIR} " --size " ${SIZE} " --data-format " ${DATA_FORMAT} "
788
+
789
+ # Deactivate virtual environment after completion
790
+ deactivate
791
+ }
792
+
691
793
run_h2o () {
692
794
# Default values for size and data format
693
795
SIZE=${1:- " SMALL" }
@@ -700,7 +802,7 @@ run_h2o() {
700
802
RESULTS_FILE=" ${RESULTS_DIR} /h2o.json"
701
803
702
804
echo " RESULTS_FILE: ${RESULTS_FILE} "
703
- echo " Running h2o benchmark..."
805
+ echo " Running h2o groupby benchmark..."
704
806
705
807
# Set the file name based on the size
706
808
case " $SIZE " in
@@ -730,6 +832,56 @@ run_h2o() {
730
832
-o " ${RESULTS_FILE} "
731
833
}
732
834
835
+ run_h2o_join () {
836
+ # Default values for size and data format
837
+ SIZE=${1:- " SMALL" }
838
+ DATA_FORMAT=${2:- " CSV" }
839
+ DATA_FORMAT=$( echo " $DATA_FORMAT " | tr ' [:upper:]' ' [:lower:]' )
840
+ RUN_Type=${3:- " join" }
841
+
842
+ # Data directory and results file path
843
+ H2O_DIR=" ${DATA_DIR} /h2o"
844
+ RESULTS_FILE=" ${RESULTS_DIR} /h2o_join.json"
845
+
846
+ echo " RESULTS_FILE: ${RESULTS_FILE} "
847
+ echo " Running h2o join benchmark..."
848
+
849
+ # Set the file name based on the size
850
+ case " $SIZE " in
851
+ " SMALL" )
852
+ X_TABLE_FILE_NAME=" J1_1e7_NA_0.${DATA_FORMAT} "
853
+ SMALL_TABLE_FILE_NAME=" J1_1e7_1e1_0.${DATA_FORMAT} "
854
+ MEDIUM_TABLE_FILE_NAME=" J1_1e7_1e4_0.${DATA_FORMAT} "
855
+ LARGE_TABLE_FILE_NAME=" J1_1e7_1e7_NA.${DATA_FORMAT} "
856
+ ;;
857
+ " MEDIUM" )
858
+ X_TABLE_FILE_NAME=" J1_1e8_NA_0.${DATA_FORMAT} "
859
+ SMALL_TABLE_FILE_NAME=" J1_1e8_1e2_0.${DATA_FORMAT} "
860
+ MEDIUM_TABLE_FILE_NAME=" J1_1e8_1e5_0.${DATA_FORMAT} "
861
+ LARGE_TABLE_FILE_NAME=" J1_1e8_1e8_NA.${DATA_FORMAT} "
862
+ ;;
863
+ " BIG" )
864
+ X_TABLE_FILE_NAME=" J1_1e9_NA_0.${DATA_FORMAT} "
865
+ SMALL_TABLE_FILE_NAME=" J1_1e9_1e3_0.${DATA_FORMAT} "
866
+ MEDIUM_TABLE_FILE_NAME=" J1_1e9_1e6_0.${DATA_FORMAT} "
867
+ LARGE_TABLE_FILE_NAME=" J1_1e9_1e9_NA.${DATA_FORMAT} "
868
+ ;;
869
+ * )
870
+ echo " Invalid size. Valid options are SMALL, MEDIUM, or BIG."
871
+ return 1
872
+ ;;
873
+ esac
874
+
875
+ # Set the query file name based on the RUN_Type
876
+ QUERY_FILE=" ${SCRIPT_DIR} /queries/h2o/${RUN_Type} .sql"
877
+
878
+ $CARGO_COMMAND --bin dfbench -- h2o \
879
+ --iterations 3 \
880
+ --join-paths " ${H2O_DIR} /${X_TABLE_FILE_NAME} ,${H2O_DIR} /${SMALL_TABLE_FILE_NAME} ,${H2O_DIR} /${MEDIUM_TABLE_FILE_NAME} ,${H2O_DIR} /${LARGE_TABLE_FILE_NAME} " \
881
+ --queries-path " ${QUERY_FILE} " \
882
+ -o " ${RESULTS_FILE} "
883
+ }
884
+
733
885
# Runs the external aggregation benchmark
734
886
run_external_aggr () {
735
887
# Use TPC-H SF1 dataset
0 commit comments