
Commit a26f583

tests: add tests for writing hive-partitioned parquet (#9316)
* tests: adds tests associated with #9237
* style: clippy
1 parent b8c6e0b commit a26f583
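
The change exercises hive-style partitioned parquet writes through the DataFrame API. As a minimal sketch of the pattern under test (assuming a SessionContext ctx with a table test(c1, c2) registered and an object store that resolves out_dir_url, both of which the tests below set up):

    // Hive-partition the output by column "c2": one directory per distinct value.
    let df = ctx.sql("SELECT c1, c2 FROM test").await?;
    let opts = DataFrameWriteOptions::new().with_partition_by(vec![String::from("c2")]);
    df.write_parquet(&out_dir_url, opts, None).await?;

    // Reading the root back requires declaring the partition column and its type,
    // because its values live in directory names rather than in the parquet files.
    let df = ctx
        .read_parquet(
            &out_dir_url,
            ParquetReadOptions::default()
                .table_partition_cols(vec![(String::from("c2"), DataType::Utf8)]),
        )
        .await?;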

File tree

2 files changed: +158, -76 lines
  • datafusion/core/src/datasource/physical_plan/parquet/mod.rs
  • datafusion/core/tests/dataframe/mod.rs


datafusion/core/src/datasource/physical_plan/parquet/mod.rs

Lines changed: 0 additions & 74 deletions
@@ -2066,80 +2066,6 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
-    async fn write_parquet_results() -> Result<()> {
-        // create partitioned input file and context
-        let tmp_dir = TempDir::new()?;
-        // let mut ctx = create_ctx(&tmp_dir, 4).await?;
-        let ctx = SessionContext::new_with_config(
-            SessionConfig::new().with_target_partitions(8),
-        );
-        let schema = populate_csv_partitions(&tmp_dir, 4, ".csv")?;
-        // register csv file with the execution context
-        ctx.register_csv(
-            "test",
-            tmp_dir.path().to_str().unwrap(),
-            CsvReadOptions::new().schema(&schema),
-        )
-        .await?;
-
-        // register a local file system object store for /tmp directory
-        let local = Arc::new(LocalFileSystem::new_with_prefix(&tmp_dir)?);
-        let local_url = Url::parse("file://local").unwrap();
-        ctx.runtime_env().register_object_store(&local_url, local);
-
-        // execute a simple query and write the results to parquet
-        let out_dir = tmp_dir.as_ref().to_str().unwrap().to_string() + "/out/";
-        let out_dir_url = "file://local/out/";
-        let df = ctx.sql("SELECT c1, c2 FROM test").await?;
-        df.write_parquet(out_dir_url, DataFrameWriteOptions::new(), None)
-            .await?;
-        // write_parquet(&mut ctx, "SELECT c1, c2 FROM test", &out_dir, None).await?;
-
-        // create a new context and verify that the results were saved to a partitioned parquet file
-        let ctx = SessionContext::new();
-
-        // get write_id
-        let mut paths = fs::read_dir(&out_dir).unwrap();
-        let path = paths.next();
-        let name = path
-            .unwrap()?
-            .path()
-            .file_name()
-            .expect("Should be a file name")
-            .to_str()
-            .expect("Should be a str")
-            .to_owned();
-        let (parsed_id, _) = name.split_once('_').expect("File should contain _ !");
-        let write_id = parsed_id.to_owned();
-
-        // register each partition as well as the top level dir
-        ctx.register_parquet(
-            "part0",
-            &format!("{out_dir}/{write_id}_0.parquet"),
-            ParquetReadOptions::default(),
-        )
-        .await?;
-
-        ctx.register_parquet("allparts", &out_dir, ParquetReadOptions::default())
-            .await?;
-
-        let part0 = ctx.sql("SELECT c1, c2 FROM part0").await?.collect().await?;
-        let allparts = ctx
-            .sql("SELECT c1, c2 FROM allparts")
-            .await?
-            .collect()
-            .await?;
-
-        let allparts_count: usize = allparts.iter().map(|batch| batch.num_rows()).sum();
-
-        assert_eq!(part0[0].schema(), allparts[0].schema());
-
-        assert_eq!(allparts_count, 40);
-
-        Ok(())
-    }
-
     fn logical2physical(expr: &Expr, schema: &Schema) -> Arc<dyn PhysicalExpr> {
         let df_schema = schema.clone().to_dfschema().unwrap();
         let execution_props = ExecutionProps::new();

datafusion/core/tests/dataframe/mod.rs

Lines changed: 158 additions & 2 deletions
@@ -30,15 +30,19 @@ use arrow::{
 };
 use arrow_array::Float32Array;
 use arrow_schema::ArrowError;
+use object_store::local::LocalFileSystem;
+use std::fs;
 use std::sync::Arc;
+use tempfile::TempDir;
+use url::Url;
 
-use datafusion::dataframe::DataFrame;
+use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
 use datafusion::datasource::MemTable;
 use datafusion::error::Result;
 use datafusion::execution::context::{SessionContext, SessionState};
 use datafusion::prelude::JoinType;
 use datafusion::prelude::{CsvReadOptions, ParquetReadOptions};
-use datafusion::test_util::parquet_test_data;
+use datafusion::test_util::{parquet_test_data, populate_csv_partitions};
 use datafusion::{assert_batches_eq, assert_batches_sorted_eq};
 use datafusion_common::{assert_contains, DataFusionError, ScalarValue, UnnestOptions};
 use datafusion_execution::config::SessionConfig;
@@ -1896,3 +1900,155 @@ async fn test_dataframe_placeholder_missing_param_values() -> Result<()> {
 
     Ok(())
 }
+
+#[tokio::test]
+async fn write_partitioned_parquet_results() -> Result<()> {
+    // create partitioned input file and context
+    let tmp_dir = TempDir::new()?;
+
+    let ctx = SessionContext::new();
+
+    // Create an in memory table with schema C1 and C2, both strings
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("c1", DataType::Utf8, false),
+        Field::new("c2", DataType::Utf8, false),
+    ]));
+
+    let record_batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(StringArray::from(vec!["abc", "def"])),
+            Arc::new(StringArray::from(vec!["123", "456"])),
+        ],
+    )?;
+
+    let mem_table = Arc::new(MemTable::try_new(schema, vec![vec![record_batch]])?);
+
+    // Register the table in the context
+    ctx.register_table("test", mem_table)?;
+
+    let local = Arc::new(LocalFileSystem::new_with_prefix(&tmp_dir)?);
+    let local_url = Url::parse("file://local").unwrap();
+    ctx.runtime_env().register_object_store(&local_url, local);
+
+    // execute a simple query and write the results to parquet
+    let out_dir = tmp_dir.as_ref().to_str().unwrap().to_string() + "/out/";
+    let out_dir_url = format!("file://{out_dir}");
+
+    // Write the results to parquet with partitioning
+    let df = ctx.sql("SELECT c1, c2 FROM test").await?;
+    let df_write_options =
+        DataFrameWriteOptions::new().with_partition_by(vec![String::from("c2")]);
+
+    df.write_parquet(&out_dir_url, df_write_options, None)
+        .await?;
+
+    // Explicitly read the parquet file at c2=123 to verify the physical files are partitioned
+    let partitioned_file = format!("{out_dir}/c2=123", out_dir = out_dir);
+    let filted_df = ctx
+        .read_parquet(&partitioned_file, ParquetReadOptions::default())
+        .await?;
+
+    // Check that the c2 column is gone and that c1 is abc.
+    let results = filted_df.collect().await?;
+    let expected = ["+-----+", "| c1  |", "+-----+", "| abc |", "+-----+"];
+
+    assert_batches_eq!(expected, &results);
+
+    // Read the entire set of parquet files
+    let df = ctx
+        .read_parquet(
+            &out_dir_url,
+            ParquetReadOptions::default()
+                .table_partition_cols(vec![(String::from("c2"), DataType::Utf8)]),
+        )
+        .await?;
+
+    // Check that the df has the entire set of data
+    let results = df.collect().await?;
+    let expected = [
+        "+-----+-----+",
+        "| c1  | c2  |",
+        "+-----+-----+",
+        "| abc | 123 |",
+        "| def | 456 |",
+        "+-----+-----+",
+    ];
+
+    assert_batches_eq!(expected, &results);
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn write_parquet_results() -> Result<()> {
+    // create partitioned input file and context
+    let tmp_dir = TempDir::new()?;
+    // let mut ctx = create_ctx(&tmp_dir, 4).await?;
+    let ctx =
+        SessionContext::new_with_config(SessionConfig::new().with_target_partitions(8));
+    let schema = populate_csv_partitions(&tmp_dir, 4, ".csv")?;
+    // register csv file with the execution context
+    ctx.register_csv(
+        "test",
+        tmp_dir.path().to_str().unwrap(),
+        CsvReadOptions::new().schema(&schema),
+    )
+    .await?;
+
+    // register a local file system object store for /tmp directory
+    let local = Arc::new(LocalFileSystem::new_with_prefix(&tmp_dir)?);
+    let local_url = Url::parse("file://local").unwrap();
+    ctx.runtime_env().register_object_store(&local_url, local);
+
+    // execute a simple query and write the results to parquet
+    let out_dir = tmp_dir.as_ref().to_str().unwrap().to_string() + "/out/";
+    let out_dir_url = "file://local/out/";
+    let df = ctx.sql("SELECT c1, c2 FROM test").await?;
+    df.write_parquet(out_dir_url, DataFrameWriteOptions::new(), None)
+        .await?;
+    // write_parquet(&mut ctx, "SELECT c1, c2 FROM test", &out_dir, None).await?;
+
+    // create a new context and verify that the results were saved to a partitioned parquet file
+    let ctx = SessionContext::new();
+
+    // get write_id
+    let mut paths = fs::read_dir(&out_dir).unwrap();
+    let path = paths.next();
+    let name = path
+        .unwrap()?
+        .path()
+        .file_name()
+        .expect("Should be a file name")
+        .to_str()
+        .expect("Should be a str")
+        .to_owned();
+    let (parsed_id, _) = name.split_once('_').expect("File should contain _ !");
+    let write_id = parsed_id.to_owned();
+
+    // register each partition as well as the top level dir
+    ctx.register_parquet(
+        "part0",
+        &format!("{out_dir}/{write_id}_0.parquet"),
+        ParquetReadOptions::default(),
+    )
+    .await?;
+
+    ctx.register_parquet("allparts", &out_dir, ParquetReadOptions::default())
+        .await?;
+
+    let part0 = ctx.sql("SELECT c1, c2 FROM part0").await?.collect().await?;
+    let allparts = ctx
+        .sql("SELECT c1, c2 FROM allparts")
+        .await?
+        .collect()
+        .await?;
+
+    let allparts_count: usize = allparts.iter().map(|batch| batch.num_rows()).sum();
+
+    assert_eq!(part0[0].schema(), allparts[0].schema());
+
+    assert_eq!(allparts_count, 40);
+
+    Ok(())
+}
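
Usage note, not part of the commit: after the partitioned write above, the output directory holds one subdirectory per distinct value of the partition column (c2=123 and c2=456 here), and the files inside omit that column; this is why the new test reads {out_dir}/c2=123 and expects only c1. A hypothetical extra check along those lines:

    // Hypothetical follow-up assertions (not in the commit): verify the hive-style layout.
    assert!(std::path::Path::new(&format!("{out_dir}/c2=123")).is_dir());
    assert!(std::path::Path::new(&format!("{out_dir}/c2=456")).is_dir());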
