@@ -30,15 +30,19 @@ use arrow::{
 };
 use arrow_array::Float32Array;
 use arrow_schema::ArrowError;
+use object_store::local::LocalFileSystem;
+use std::fs;
 use std::sync::Arc;
+use tempfile::TempDir;
+use url::Url;
 
-use datafusion::dataframe::DataFrame;
+use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
 use datafusion::datasource::MemTable;
 use datafusion::error::Result;
 use datafusion::execution::context::{SessionContext, SessionState};
 use datafusion::prelude::JoinType;
 use datafusion::prelude::{CsvReadOptions, ParquetReadOptions};
-use datafusion::test_util::parquet_test_data;
+use datafusion::test_util::{parquet_test_data, populate_csv_partitions};
 use datafusion::{assert_batches_eq, assert_batches_sorted_eq};
 use datafusion_common::{assert_contains, DataFusionError, ScalarValue, UnnestOptions};
 use datafusion_execution::config::SessionConfig;
@@ -1896,3 +1900,155 @@ async fn test_dataframe_placeholder_missing_param_values() -> Result<()> {
 
     Ok(())
 }
+
+#[tokio::test]
+async fn write_partitioned_parquet_results() -> Result<()> {
+    // create a temp output directory and a session context
+    let tmp_dir = TempDir::new()?;
+
+    let ctx = SessionContext::new();
+
+    // Create an in memory table with columns c1 and c2, both strings
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("c1", DataType::Utf8, false),
+        Field::new("c2", DataType::Utf8, false),
+    ]));
+
+    let record_batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(StringArray::from(vec!["abc", "def"])),
+            Arc::new(StringArray::from(vec!["123", "456"])),
+        ],
+    )?;
+
+    let mem_table = Arc::new(MemTable::try_new(schema, vec![vec![record_batch]])?);
+
+    // Register the table in the context
+    ctx.register_table("test", mem_table)?;
+
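+    // register a local file system object store rooted at the temp directory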
+    let local = Arc::new(LocalFileSystem::new_with_prefix(&tmp_dir)?);
+    let local_url = Url::parse("file://local").unwrap();
+    ctx.runtime_env().register_object_store(&local_url, local);
+
+    // execute a simple query and write the results to parquet
+    let out_dir = tmp_dir.as_ref().to_str().unwrap().to_string() + "/out/";
+    let out_dir_url = format!("file://{out_dir}");
+
+    // Write the results to parquet with partitioning
+    let df = ctx.sql("SELECT c1, c2 FROM test").await?;
+    let df_write_options =
+        DataFrameWriteOptions::new().with_partition_by(vec![String::from("c2")]);
+
+    df.write_parquet(&out_dir_url, df_write_options, None)
+        .await?;
+
+    // Explicitly read the parquet file at c2=123 to verify the physical files are partitioned
+    let partitioned_file = format!("{out_dir}/c2=123", out_dir = out_dir);
+    let filtered_df = ctx
+        .read_parquet(&partitioned_file, ParquetReadOptions::default())
+        .await?;
+
+    // Check that the c2 column is gone and that c1 is abc.
+    let results = filtered_df.collect().await?;
+    let expected = ["+-----+", "| c1  |", "+-----+", "| abc |", "+-----+"];
+
+    assert_batches_eq!(expected, &results);
+
+    // Read the entire set of parquet files
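+    // table_partition_cols lets the reader recover c2 from the Hive-style c2=<value> directory names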
+    let df = ctx
+        .read_parquet(
+            &out_dir_url,
+            ParquetReadOptions::default()
+                .table_partition_cols(vec![(String::from("c2"), DataType::Utf8)]),
+        )
+        .await?;
+
+    // Check that the df has the entire set of data
+    let results = df.collect().await?;
+    let expected = [
+        "+-----+-----+",
+        "| c1  | c2  |",
+        "+-----+-----+",
+        "| abc | 123 |",
+        "| def | 456 |",
+        "+-----+-----+",
+    ];
+
+    assert_batches_eq!(expected, &results);
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn write_parquet_results() -> Result<()> {
+    // create partitioned input file and context
+    let tmp_dir = TempDir::new()?;
+    // let mut ctx = create_ctx(&tmp_dir, 4).await?;
+    let ctx =
+        SessionContext::new_with_config(SessionConfig::new().with_target_partitions(8));
+    let schema = populate_csv_partitions(&tmp_dir, 4, ".csv")?;
+    // register csv file with the execution context
+    ctx.register_csv(
+        "test",
+        tmp_dir.path().to_str().unwrap(),
+        CsvReadOptions::new().schema(&schema),
+    )
+    .await?;
+
+    // register a local file system object store for the temp directory
+    let local = Arc::new(LocalFileSystem::new_with_prefix(&tmp_dir)?);
+    let local_url = Url::parse("file://local").unwrap();
+    ctx.runtime_env().register_object_store(&local_url, local);
+
+    // execute a simple query and write the results to parquet
+    let out_dir = tmp_dir.as_ref().to_str().unwrap().to_string() + "/out/";
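+    // "file://local/out/" resolves through the object store registered above (rooted at the temp dir),
+    // so it points at the same directory as out_dir on disk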
+    let out_dir_url = "file://local/out/";
+    let df = ctx.sql("SELECT c1, c2 FROM test").await?;
+    df.write_parquet(out_dir_url, DataFrameWriteOptions::new(), None)
+        .await?;
+    // write_parquet(&mut ctx, "SELECT c1, c2 FROM test", &out_dir, None).await?;
+
+    // create a new context and verify that the results were saved to a partitioned parquet file
+    let ctx = SessionContext::new();
+
+    // get write_id
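+    // written files are named <write_id>_<partition>.parquet, so the prefix of the first file name is the write_id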
+    let mut paths = fs::read_dir(&out_dir).unwrap();
+    let path = paths.next();
+    let name = path
+        .unwrap()?
+        .path()
+        .file_name()
+        .expect("Should be a file name")
+        .to_str()
+        .expect("Should be a str")
+        .to_owned();
+    let (parsed_id, _) = name.split_once('_').expect("File should contain _ !");
+    let write_id = parsed_id.to_owned();
+
+    // register each partition as well as the top level dir
+    ctx.register_parquet(
+        "part0",
+        &format!("{out_dir}/{write_id}_0.parquet"),
+        ParquetReadOptions::default(),
+    )
+    .await?;
+
+    ctx.register_parquet("allparts", &out_dir, ParquetReadOptions::default())
+        .await?;
+
+    let part0 = ctx.sql("SELECT c1, c2 FROM part0").await?.collect().await?;
+    let allparts = ctx
+        .sql("SELECT c1, c2 FROM allparts")
+        .await?
+        .collect()
+        .await?;
+
+    let allparts_count: usize = allparts.iter().map(|batch| batch.num_rows()).sum();
+
+    assert_eq!(part0[0].schema(), allparts[0].schema());
+
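+    // reading the whole output dir should return all 40 input rows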
+    assert_eq!(allparts_count, 40);
+
+    Ok(())
+}