
Arrow2 merge avro #17


Merged: 40 commits, Jan 13, 2022

Commits
6f4ad77
Planner code cleanup (#1450)
alamb Dec 15, 2021
1448d97
Fix bug in projection: "column types must match schema types, expecte…
alamb Dec 15, 2021
0052667
Support identifiers with `.` in them (#1449)
alamb Dec 15, 2021
6478a33
Fixes for working with functions in dataframes, additional documentat…
tobyhede Dec 15, 2021
9d31866
support sum/avg agg for decimal, change sum(float32) --> float64 (#1408)
liukun4515 Dec 17, 2021
8193e03
Minimize features (#1399)
carols10cents Dec 18, 2021
0b8bffd
Update roadmap with features completed (#1464)
alamb Dec 18, 2021
35d65fc
fix calculate in many_to_many_hash_partition test. (#1463)
Ted-Jiang Dec 18, 2021
07b2985
Avoid send empty batches for Hash partitioning. (#1459)
Ted-Jiang Dec 19, 2021
b5082e0
minor support mod operation for expr (#1467)
liukun4515 Dec 20, 2021
5ef42eb
Left join could use bitmap for left join instead of Vec<bool> (#1291)
boazberman Dec 21, 2021
5668be7
Add Timezone to Scalar::Time* types, and better timezone awareness …
maxburke Dec 21, 2021
ecfc7d8
Pass local address host so we do not get mismatch between IPv4 and IP…
thinkharderdev Dec 22, 2021
4012713
Fix SortExec discards field metadata on the output schema (#1477)
alamb Dec 23, 2021
68db579
Minor: Rename `predicate_builder` --> `pruning_predicate` for consist…
alamb Dec 23, 2021
233ed7d
Fix duplicated 'cargo run --example parquet_sql' (#1482)
sergey-melnychuk Dec 24, 2021
a551505
add dependbot (#1489)
xudong963 Dec 28, 2021
8d20f14
Workaround build failure: Pin quote to 1.0.10 (#1499)
alamb Dec 29, 2021
91ee5a4
Refactor testing modules (#1491)
hntd187 Dec 29, 2021
7374b18
add indexed fields support to python api (#1502)
nl5887 Dec 31, 2021
72410f6
add rfc for datafusion (#1490)
xudong963 Dec 31, 2021
07f5b3d
Add example on how to query multiple parquet files (#1497)
nitisht Dec 31, 2021
7607ace
Fix ORDER BY on aggregate (#1506)
viirya Jan 1, 2022
bac97fa
remove python (#1518)
jimexist Jan 4, 2022
2fae23f
Fix single_distinct_to_groupby for arbitrary expressions (#1519)
james727 Jan 5, 2022
ecb09d9
Remove one copy of datatype serialization code (#1524)
alamb Jan 6, 2022
847e78a
Fix bugs with nullability during rewrites: Combine `simplify` and `Si…
alamb Jan 8, 2022
8949bc3
Correct typos in README (#1528)
brnnnfx Jan 9, 2022
d6d90e9
Add load test command in tpch.rs. (#1530)
Ted-Jiang Jan 9, 2022
90de12a
Add stddev operator (#1525)
realno Jan 10, 2022
2008b1d
Update docs to note support for VARIANCE and STDDEV (#1543)
alamb Jan 10, 2022
44db376
Merge remote-tracking branch 'origin/master' into arrow2_merge
Igosuki Jan 11, 2022
ca9b485
merge latest datafusion
Igosuki Jan 11, 2022
b9125bc
start migrating avro to arrow2
Igosuki Jan 11, 2022
99fdac3
lints
Igosuki Jan 11, 2022
1b916aa
merge latest datafusion
Igosuki Jan 12, 2022
d611d4d
Fix hash utils
Igosuki Jan 12, 2022
171332f
missing import in hash_utils test with no_collision
Igosuki Jan 12, 2022
4344454
address clippies in root workspace
Igosuki Jan 12, 2022
257a7c5
fix tests #1
Igosuki Jan 12, 2022
11 changes: 11 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,11 @@
version: 2
updates:
- package-ecosystem: cargo
directory: "/"
schedule:
interval: weekly
day: sunday
time: "7:00"
open-pull-requests-limit: 10
target-branch: master
labels: [auto-dependencies]
131 changes: 0 additions & 131 deletions .github/workflows/python_build.yml

This file was deleted.

62 changes: 0 additions & 62 deletions .github/workflows/python_test.yaml

This file was deleted.

4 changes: 3 additions & 1 deletion .github/workflows/rust.yml
@@ -116,7 +116,8 @@ jobs:
cargo test --no-default-features
cargo run --example csv_sql
cargo run --example parquet_sql
-# cargo run --example avro_sql --features=datafusion/avro
+#nopass
+cargo run --example avro_sql --features=datafusion/avro
env:
CARGO_HOME: "/github/home/.cargo"
CARGO_TARGET_DIR: "/github/home/target"
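
For context, the step re-enabled above runs DataFusion's `avro_sql` example with the `datafusion/avro` feature. A minimal sketch of what such an example looks like, assuming the upstream DataFusion API of this period; the Avro path and table name are illustrative, not taken from this PR:

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    // Create an execution context (the DataFusion 6.x-era entry point).
    let mut ctx = ExecutionContext::new();

    // Register an Avro file as a named table so SQL can query it.
    ctx.register_avro(
        "alltypes_plain",
        "testing/data/avro/alltypes_plain.avro",
        AvroReadOptions::default(),
    )
    .await?;

    // Run a query against the registered table and print the result.
    let df = ctx
        .sql("SELECT int_col, double_col FROM alltypes_plain LIMIT 5")
        .await?;
    df.show().await?;
    Ok(())
}
```
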
@@ -127,6 +128,7 @@
export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data
cd ballista/rust
# snmalloc requires cmake so build without default features
+#nopass
cargo test --no-default-features --features sled
env:
CARGO_HOME: "/github/home/.cargo"
6 changes: 3 additions & 3 deletions Cargo.toml
@@ -35,6 +35,6 @@ lto = true
codegen-units = 1

[patch.crates-io]
-#arrow2 = { git = "https://github.com/jorgecarleitao/arrow2.git", rev = "f2c7503bc171a4c75c0af9905823c8795bd17f9b" }
-arrow2 = { git = "https://github.com/blaze-init/arrow2.git", branch = "shuffle_ipc" }
-parquet2 = { git = "https://github.com/blaze-init/parquet2.git", branch = "meta_new" }
+arrow2 = { git = "https://github.com/jorgecarleitao/arrow2.git", rev = "ef7937dfe56033c2cc491482c67587b52cd91554" }
+#arrow2 = { git = "https://github.com/blaze-init/arrow2.git", branch = "shuffle_ipc" }
+#parquet2 = { git = "https://github.com/blaze-init/parquet2.git", branch = "meta_new" }
8 changes: 4 additions & 4 deletions README.md
@@ -254,7 +254,7 @@ DataFusion is designed to be extensible at all points. To that end, you can prov

## Rust Version Compatibility

-This crate is tested with the latest stable version of Rust. We do not currrently test against other, older versions of the Rust compiler.
+This crate is tested with the latest stable version of Rust. We do not currently test against other, older versions of the Rust compiler.

# Supported SQL

@@ -264,9 +264,9 @@ This library currently supports many SQL constructs, including
- `SELECT ... FROM ...` together with any expression
- `ALIAS` to name an expression
- `CAST` to change types, including e.g. `Timestamp(Nanosecond, None)`
-- most mathematical unary and binary expressions such as `+`, `/`, `sqrt`, `tan`, `>=`.
+- Many mathematical unary and binary expressions such as `+`, `/`, `sqrt`, `tan`, `>=`.
- `WHERE` to filter
-- `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG`
+- `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG`, `VAR`, `STDDEV` (sample and population)
- `ORDER BY` together with an expression and optional `ASC` or `DESC` and also optional `NULLS FIRST` or `NULLS LAST`
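
A hedged illustration of several of the constructs listed above through the SQL interface, assuming the DataFusion API of this era; the `metrics` table and its `tag`/`v` columns are made up:

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

// Illustrative only: assumes a table named "metrics" with columns
// `tag` and `v` has already been registered on the context.
async fn aggregate_example(ctx: &mut ExecutionContext) -> Result<()> {
    let df = ctx
        .sql(
            "SELECT tag, COUNT(v), AVG(v), VAR(v), STDDEV(v) \
             FROM metrics \
             GROUP BY tag \
             ORDER BY tag ASC NULLS LAST",
        )
        .await?;
    df.show().await?;
    Ok(())
}
```
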

## Supported Functions
@@ -366,7 +366,7 @@ Please see [Roadmap](docs/source/specification/roadmap.md) for information of wh
There is no formal document describing DataFusion's architecture yet, but the following presentations offer a good overview of its different components and how they interact together.

- (March 2021): The DataFusion architecture is described in _Query Engine Design and the Rust-Based DataFusion in Apache Arrow_: [recording](https://www.youtube.com/watch?v=K6eCAVEk4kU) (DataFusion content starts [~ 15 minutes in](https://www.youtube.com/watch?v=K6eCAVEk4kU&t=875s)) and [slides](https://www.slideshare.net/influxdata/influxdb-iox-tech-talks-query-engine-design-and-the-rustbased-datafusion-in-apache-arrow-244161934)
-- (Feburary 2021): How DataFusion is used within the Ballista Project is described in \*Ballista: Distributed Compute with Rust and Apache Arrow: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ)
+- (February 2021): How DataFusion is used within the Ballista Project is described in \*Ballista: Distributed Compute with Rust and Apache Arrow: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ)

# Developer's guide

4 changes: 2 additions & 2 deletions ballista/rust/core/Cargo.toml
@@ -30,7 +30,7 @@ build = "build.rs"
simd = ["datafusion/simd"]

[dependencies]
-ahash = "0.7"
+ahash = { version = "0.7", default-features = false }
async-trait = "0.1.36"
futures = "0.3"
hashbrown = "0.11"
@@ -41,7 +41,7 @@ sqlparser = "0.13"
tokio = "1.0"
tonic = "0.6"
uuid = { version = "0.8", features = ["v4"] }
-chrono = "0.4"
+chrono = { version = "0.4", default-features = false }

arrow-format = { version = "0.3", features = ["flight-data", "flight-service"] }
arrow = { package = "arrow2", version="0.8", features = ["io_ipc", "io_flight"] }
4 changes: 4 additions & 0 deletions ballista/rust/core/proto/ballista.proto
@@ -169,6 +169,10 @@ enum AggregateFunction {
COUNT = 4;
APPROX_DISTINCT = 5;
ARRAY_AGG = 6;
+VARIANCE=7;
+VARIANCE_POP=8;
+STDDEV=9;
+STDDEV_POP=10;
}

message AggregateExprNode {
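The four new variants mirror DataFusion's sample and population variance/stddev aggregates. The sketch below is a hypothetical, self-contained illustration of the 1:1 mapping the plan serde code needs for them; the real conversion lives in ballista's generated protobuf types and serde modules, which are not part of this diff:

```rust
// Stand-in types for illustration only; the real ones are the
// prost-generated protobuf enum and DataFusion's logical
// AggregateFunction enum.
#[derive(Clone, Copy, Debug)]
enum ProtoAggregateFunction {
    Variance = 7,
    VariancePop = 8,
    Stddev = 9,
    StddevPop = 10,
}

#[derive(Clone, Copy, Debug, PartialEq)]
enum AggregateFunction {
    Variance,
    VariancePop,
    Stddev,
    StddevPop,
}

// Each proto variant maps 1:1 onto a logical aggregate; the reverse
// direction is the same match with the sides swapped.
fn from_proto(f: ProtoAggregateFunction) -> AggregateFunction {
    match f {
        ProtoAggregateFunction::Variance => AggregateFunction::Variance,
        ProtoAggregateFunction::VariancePop => AggregateFunction::VariancePop,
        ProtoAggregateFunction::Stddev => AggregateFunction::Stddev,
        ProtoAggregateFunction::StddevPop => AggregateFunction::StddevPop,
    }
}
```
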
19 changes: 15 additions & 4 deletions ballista/rust/core/src/client.rs
@@ -17,6 +17,8 @@

//! Client API for sending requests to executors.

+use arrow::io::flight::deserialize_schemas;
+use arrow::io::ipc::IpcSchema;
use std::sync::{Arc, Mutex};
use std::{collections::HashMap, pin::Pin};
use std::{
@@ -121,10 +123,12 @@ impl BallistaClient {
{
Some(flight_data) => {
// convert FlightData to a stream
-let schema = Arc::new(Schema::try_from(&flight_data)?);
+let (schema, ipc_schema) =
+    deserialize_schemas(flight_data.data_body.as_slice()).unwrap();
+let schema = Arc::new(schema);

// all the remaining stream messages should be dictionary and record batches
-Ok(Box::pin(FlightDataStream::new(stream, schema)))
+Ok(Box::pin(FlightDataStream::new(stream, schema, ipc_schema)))
}
None => Err(ballista_error(
"Did not receive schema batch from flight server",
@@ -136,13 +140,19 @@ impl BallistaClient {
struct FlightDataStream {
stream: Mutex<Streaming<FlightData>>,
schema: SchemaRef,
+ipc_schema: IpcSchema,
}

impl FlightDataStream {
-pub fn new(stream: Streaming<FlightData>, schema: SchemaRef) -> Self {
+pub fn new(
+    stream: Streaming<FlightData>,
+    schema: SchemaRef,
+    ipc_schema: IpcSchema,
+) -> Self {
Self {
stream: Mutex::new(stream),
schema,
+ipc_schema,
}
}
}
@@ -161,10 +171,11 @@ impl Stream for FlightDataStream {
.map_err(|e| ArrowError::from_external_error(Box::new(e)))
.and_then(|flight_data_chunk| {
let hm = HashMap::new();

arrow::io::flight::deserialize_batch(
&flight_data_chunk,
self.schema.clone(),
-true,
+&self.ipc_schema,
&hm,
)
});
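
The change above threads arrow2's `IpcSchema` through the stream alongside the Arrow schema, because `deserialize_batch` now takes it in place of the old boolean flag. A hedged sketch of the resulting decode path, assuming the arrow2 0.8 flight API that the calls in this diff use; the helper function and message names are illustrative:

```rust
use std::collections::HashMap;
use std::sync::Arc;

use arrow::error::ArrowError;
use arrow::io::flight::{deserialize_batch, deserialize_schemas};
use arrow_format::flight::data::FlightData;

// Decode the schema message first, then use the recovered IpcSchema to
// decode a following record-batch message.
fn decode(schema_msg: &FlightData, batch_msg: &FlightData) -> Result<(), ArrowError> {
    let (schema, ipc_schema) = deserialize_schemas(schema_msg.data_body.as_slice())?;
    let schema = Arc::new(schema);

    // Dictionary batches are not handled in this sketch.
    let dictionaries = HashMap::new();
    let batch = deserialize_batch(batch_msg, schema, &ipc_schema, &dictionaries)?;
    println!("decoded {} rows", batch.num_rows());
    Ok(())
}
```
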
9 changes: 7 additions & 2 deletions ballista/rust/core/src/execution_plans/shuffle_writer.rs
@@ -458,12 +458,17 @@ impl ShuffleWriter {
num_rows: 0,
num_bytes: 0,
path: path.to_owned(),
-writer: FileWriter::try_new(buffer_writer, schema, WriteOptions::default())?,
+writer: FileWriter::try_new(
+    buffer_writer,
+    schema,
+    None,
+    WriteOptions::default(),
+)?,
})
}

fn write(&mut self, batch: &RecordBatch) -> Result<()> {
-self.writer.write(batch)?;
+self.writer.write(batch, None)?;
self.num_batches += 1;
self.num_rows += batch.num_rows() as u64;
let num_bytes: usize = batch
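
For reference, the arrow2 IPC `FileWriter` that this file now targets takes an extra optional IPC-fields argument on both construction and per-batch writes, and passing `None` is expected to derive defaults from the schema. A minimal hedged sketch matching the calls above (assumed arrow2 0.8 API; the schema and file path are illustrative):

```rust
use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::{ArrowError, Result};
use arrow::io::ipc::write::{FileWriter, WriteOptions};

fn write_empty_ipc_file(path: &str) -> Result<()> {
    let schema = Schema::new(vec![Field::new("a", DataType::Int64, false)]);
    let file = std::fs::File::create(path)
        .map_err(|e| ArrowError::from_external_error(Box::new(e)))?;

    // The third argument is the optional ipc_fields; None derives defaults
    // from the schema.
    let mut writer = FileWriter::try_new(file, &schema, None, WriteOptions::default())?;

    // Per-batch writes take the same optional ipc_fields:
    // writer.write(&batch, None)?;
    writer.finish()?;
    Ok(())
}
```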