diff --git a/Cargo.lock b/Cargo.lock index bcc3fe133d87..7a504eb2e1ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -169,9 +169,9 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arrow-array" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81c16ec702d3898c2f5cfdc148443c6cd7dbe5bac28399859eb0a3d38f072827" +checksum = "16f4a9468c882dc66862cef4e1fd8423d47e67972377d85d80e022786427768c" dependencies = [ "ahash", "arrow-buffer", @@ -196,9 +196,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a769666ffac256dd301006faca1ca553d0ae7cffcf4cd07095f73f95eb226514" +checksum = "dd9d6f18c65ef7a2573ab498c374d8ae364b4a4edf67105357491c031f716ca5" dependencies = [ "arrow-buffer", "arrow-schema", @@ -208,9 +208,9 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dab1c12b40e29d9f3b699e0203c2a73ba558444c05e388a4377208f8f9c97eee" +checksum = "9e972cd1ff4a4ccd22f86d3e53e835c2ed92e0eea6a3e8eadb72b4f1ac802cf8" [[package]] name = "arrow2" @@ -251,7 +251,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -262,7 +262,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -310,9 +310,9 @@ dependencies = [ [[package]] name = "aws-config" -version = "1.5.4" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf6cfe2881cb1fcbba9ae946fb9a6480d3b7a714ca84c74925014a89ef3387a" +checksum = "4e95816a168520d72c0e7680c405a5a8c1fb6a035b4bc4b9d7b0de8e1a941697" dependencies = [ "aws-credential-types", "aws-runtime", @@ -330,7 +330,6 @@ dependencies = [ "fastrand", "hex", "http 0.2.12", - "hyper 0.14.30", "ring", "time", "tokio", @@ -353,9 +352,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.3.1" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87c5f920ffd1e0526ec9e70e50bf444db50b204395a0fa7016bbf9e31ea1698f" +checksum = "f42c2d4218de4dcd890a109461e2f799a1a2ba3bcd2cde9af88360f5df9266c6" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -369,6 +368,7 @@ dependencies = [ "fastrand", "http 0.2.12", "http-body 0.4.6", + "once_cell", "percent-encoding", "pin-project-lite", "tracing", @@ -412,9 +412,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.36.0" +version = "1.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6acca681c53374bf1d9af0e317a41d12a44902ca0f2d1e10e5cb5bb98ed74f35" +checksum = "1074e818fbe4f9169242d78448b15be8916a79daa38ea1231f2e2e10d993fcd2" dependencies = [ "aws-credential-types", "aws-runtime", @@ -434,9 +434,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.37.0" +version = "1.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b79c6bdfe612503a526059c05c9ccccbf6bd9530b003673cb863e547fd7c0c9a" +checksum = "29755c51e33fa3f678598f64324a169cf4b7d3c4865d2709d4308f53366a92a4" dependencies = [ "aws-credential-types", "aws-runtime", @@ -456,9 +456,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" 
-version = "1.36.0" +version = "1.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32e6ecdb2bd756f3b2383e6f0588dc10a4e65f5d551e70a56e0bfe0c884673ce" +checksum = "6e52dc3fd7dfa6c01a69cf3903e00aa467261639138a05b06cd92314d2c8fb07" dependencies = [ "aws-credential-types", "aws-runtime", @@ -618,9 +618,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.7.1" +version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30819352ed0a04ecf6a2f3477e344d2d1ba33d43e0f09ad9047c12e0d923616f" +checksum = "e086682a53d3aa241192aa110fa8dfce98f2f5ac2ead0de84d41582c7e8fdb96" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -801,17 +801,6 @@ dependencies = [ "alloc-stdlib", ] -[[package]] -name = "bstr" -version = "1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" -dependencies = [ - "memchr", - "regex-automata", - "serde", -] - [[package]] name = "built" version = "0.7.4" @@ -831,9 +820,9 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "bytemuck" -version = "1.16.1" +version = "1.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e" +checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83" dependencies = [ "bytemuck_derive", ] @@ -846,14 +835,20 @@ checksum = "1ee891b04274a59bd38b412188e24b849617b2e45a0fd8d057deb63e7403761b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" -version = "1.7.0" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fca2be1d5c43812bae364ee3f30b3afcb7877cf59f4aeb94c66f313a41d2fac9" +checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50" [[package]] name = "bytes-utils" @@ -894,9 +889,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.1.6" +version = "1.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aba8f4e9906c7ce3c73463f62a7f0c65183ada1a2d47e397cc8810827f9694f" +checksum = "e9e8aabfac534be767c909e0690571677d49f41bd8465ae876fe043d52ba5292" dependencies = [ "jobserver", "libc", @@ -972,18 +967,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.11" +version = "4.5.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35723e6a11662c2afb578bcf0b88bf6ea8e21282a953428f240574fcc3a2b5b3" +checksum = "11d8838454fda655dafd3accb2b6e2bea645b9e4078abe84a22ceb947235c5cc" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.11" +version = "4.5.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49eb96cbfa7cfa35017b7cd548c75b14c3118c98b423041d70562665e07fb0fa" +checksum = "216aec2b177652e3846684cbfe25c9964d18ec45234f0f5da5157b207ed1aab6" dependencies = [ "anstyle", "clap_lex", @@ -1069,9 +1064,9 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.6" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "core2" @@ -1084,9 +1079,9 @@ dependencies = [ [[package]] name = "cpufeatures" -version = "0.2.12" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +checksum = "51e852e6dc9a5bed1fae92dd2375037bf2b768725bf3be87811edee3249d09ad" dependencies = [ "libc", ] @@ -1361,7 +1356,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1432,9 +1427,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.30" +version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" +checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920" dependencies = [ "crc32fast", "libz-ng-sys", @@ -1537,7 +1532,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1912,9 +1907,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ab92f4f49ee4fb4f997c784b7a2e0fa70050211e0b6a287f898c3c9785ca956" +checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9" dependencies = [ "bytes", "futures-channel", @@ -2064,9 +2059,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.69" +version = "0.3.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a" dependencies = [ "wasm-bindgen", ] @@ -2252,9 +2247,9 @@ dependencies = [ [[package]] name = "libz-sys" -version = "1.1.18" +version = "1.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c15da26e5af7e25c90b37a2d75cdbf940cf4a55316de9d84c679c9b8bfabf82e" +checksum = "fdc53a7799a7496ebc9fd29f31f7df80e83c9bda5299768af5f9e59eeea74647" dependencies = [ "cc", "libc", @@ -2286,9 +2281,9 @@ checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "lru" -version = "0.12.3" +version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc" +checksum = "37ee39891760e7d94734f6f63fedc29a2e4a152f836120753a72503f09fcf904" dependencies = [ "hashbrown", ] @@ -2324,9 +2319,9 @@ dependencies = [ [[package]] name = "matrixmultiply" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7574c1cf36da4798ab73da5b215bbf444f50718207754cb522201d78d1cd0ff2" +checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a" dependencies = [ "autocfg", "rawpointer", @@ -2392,9 +2387,9 @@ dependencies = [ [[package]] name = "mio" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4569e456d394deccd22ce1c1913e6ea0e54519f577285001215d33557431afe4" +checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" dependencies = [ "hermit-abi", "libc", @@ -2546,7 +2541,7 @@ dependencies = [ "num-integer", "num-traits", 
"pyo3", - "rustc-hash", + "rustc-hash 1.1.0", ] [[package]] @@ -2650,9 +2645,9 @@ dependencies = [ [[package]] name = "object" -version = "0.36.2" +version = "0.36.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f203fa8daa7bb185f760ae12bd8e097f63d17041dcdcaf675ac54cdf863170e" +checksum = "27b64972346851a39438c60b341ebc01bba47464ae329e55cf343eb93964efd9" dependencies = [ "memchr", ] @@ -2678,7 +2673,7 @@ dependencies = [ "rand", "reqwest", "ring", - "rustls-pemfile 2.1.2", + "rustls-pemfile 2.1.3", "serde", "serde_json", "snafu", @@ -2826,7 +2821,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3449,9 +3444,12 @@ checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" [[package]] name = "ppv-lite86" -version = "0.2.17" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] [[package]] name = "proc-macro2" @@ -3595,7 +3593,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3608,7 +3606,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3640,16 +3638,17 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4ceeeeabace7857413798eb1ffa1e9c905a9946a57d81fb69b4b71c4d8eb3ad" +checksum = "b22d8e7369034b9a7132bc2008cac12f2013c8132b45e0554e6e20e2617f2156" dependencies = [ "bytes", "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash", + "rustc-hash 2.0.0", "rustls 0.23.12", + "socket2", "thiserror", "tokio", "tracing", @@ -3657,14 +3656,14 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.3" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddf517c03a109db8100448a4be38d498df8a210a99fe0e1b9eaf39e78c640efe" +checksum = "ba92fb39ec7ad06ca2582c0ca834dfeadcaf06ddfc8e635c80aa7e1c05315fdd" dependencies = [ "bytes", "rand", "ring", - "rustc-hash", + "rustc-hash 2.0.0", "rustls 0.23.12", "slab", "thiserror", @@ -3681,6 +3680,7 @@ dependencies = [ "libc", "once_cell", "socket2", + "tracing", "windows-sys 0.52.0", ] @@ -3804,7 +3804,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3833,14 +3833,14 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] name = "regex" -version = "1.10.5" +version = "1.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" +checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" dependencies = [ "aho-corasick", "memchr", @@ -3905,7 +3905,7 @@ dependencies = [ "quinn", "rustls 0.23.12", "rustls-native-certs 0.7.1", - "rustls-pemfile 2.1.2", + "rustls-pemfile 2.1.3", "rustls-pki-types", "serde", "serde_json", @@ -3967,6 +3967,12 @@ version = "1.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc-hash" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" + [[package]] name = "rustc_version" version = "0.4.0" @@ -4034,7 +4040,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a88d6d420651b496bdd98684116959239430022a115c1240e6c3993be0b15fba" dependencies = [ "openssl-probe", - "rustls-pemfile 2.1.2", + "rustls-pemfile 2.1.3", "rustls-pki-types", "schannel", "security-framework", @@ -4051,9 +4057,9 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "2.1.2" +version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29993a25686778eb88d4189742cd713c9bce943bc54251a33509dc63cbacf73d" +checksum = "196fe16b00e106300d3e45ecfcb764fa292a535d7326a29a5875c579c7417425" dependencies = [ "base64 0.22.1", "rustls-pki-types", @@ -4061,9 +4067,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.7.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "976295e77ce332211c0d24d92c0e83e50f5c5f046d11082cea19f3df13a3562d" +checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" [[package]] name = "rustls-webpki" @@ -4226,32 +4232,33 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.204" +version = "1.0.207" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc76f558e0cbb2a839d37354c575f1dc3fdc6546b5be373ba43d95f231bf7c12" +checksum = "5665e14a49a4ea1b91029ba7d3bca9f299e1f7cfa194388ccc20f14743e784f2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.204" +version = "1.0.207" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222" +checksum = "6aea2634c86b0e8ef2cfdc0c340baede54ec27b1e46febd7f80dffb2aa44a00e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] name = "serde_json" -version = "1.0.120" +version = "1.0.124" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5" +checksum = "66ad62847a56b3dba58cc891acd13884b9c61138d330c0d7b6181713d4fce38d" dependencies = [ "indexmap", "itoa", + "memchr", "ryu", "serde", ] @@ -4509,7 +4516,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -4522,7 +4529,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -4544,9 +4551,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.72" +version = "2.0.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc4b9b9bf2add8093d3f2c0204471e951b2285580335de42f9d2534f3ae7a8af" +checksum = "1fceb41e3d546d0bd83421d3409b1460cc7444cd389341a4c880fe7a042cb3d7" dependencies = [ "proc-macro2", "quote", @@ -4561,11 +4568,10 @@ checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" [[package]] name = "sysinfo" -version = "0.31.0" +version = "0.31.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29a6b037e3af4ae9a9d6214198e4df53091363b2c96c88fc416a6c1bd92a2799" +checksum = 
"d4115055da5f572fff541dd0c4e61b0262977f453cc9fe04be83aba25a89bdab" dependencies = [ - "bstr", "core-foundation-sys", "libc", "memchr", @@ -4581,20 +4587,21 @@ checksum = "c1bbb9f3c5c463a01705937a24fdabc5047929ac764b2d5b9cf681c1f5041ed5" [[package]] name = "target-lexicon" -version = "0.12.15" +version = "0.12.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4873307b7c257eddcb50c9bedf158eb669578359fb28428bef438fec8e6ba7c2" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.10.1" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" dependencies = [ "cfg-if", "fastrand", + "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -4614,7 +4621,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -4683,9 +4690,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.39.1" +version = "1.39.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d040ac2b29ab03b09d4129c2f5bbd012a3ac2f79d38ff506a4bf8dd34b0eac8a" +checksum = "daa4fb1bc778bd6f04cbfc4bb2d06a7396a8f299dc33ea1900cedaa316f467b1" dependencies = [ "backtrace", "bytes", @@ -4706,7 +4713,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -4758,9 +4765,9 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.7" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8fb9f64314842840f1d940ac544da178732128f1c78c21772e876579e0da1db" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" dependencies = [ "serde", ] @@ -4824,7 +4831,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -4869,7 +4876,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -5022,34 +5029,35 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" dependencies = [ "cfg-if", + "once_cell", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.42" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" +checksum = "61e9300f63a621e96ed275155c108eb6f843b6a26d053f122ab69724559dc8ed" dependencies = [ "cfg-if", "js-sys", @@ -5059,9 +5067,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -5069,22 +5077,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" +checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" [[package]] name = "wasm-streams" @@ -5101,9 +5109,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.69" +version = "0.3.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +checksum = "26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0" dependencies = [ "js-sys", "wasm-bindgen", @@ -5127,11 +5135,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -5179,7 +5187,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -5190,7 +5198,7 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -5220,6 +5228,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-targets" version = "0.48.5" @@ -5385,9 +5402,9 @@ checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" [[package]] name = "xxhash-rust" -version = "0.8.11" +version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63658493314859b4dfdf3fb8c1defd61587839def09582db50b8a4e93afca6bb" +checksum = "6a5cbf750400958819fb6178eaa83bee5cd9c29a26a40cc241df8c70fdd46984" [[package]] name = "zerocopy" @@ -5395,6 +5412,7 @@ version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ + "byteorder", "zerocopy-derive", ] @@ -5406,7 +5424,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -5426,18 +5444,18 @@ dependencies = [ [[package]] name = "zstd-safe" -version = "7.2.0" +version = "7.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa556e971e7b568dc775c136fc9de8c779b1c2fc3a63defaafadffdbd3181afa" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" dependencies = [ "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.12+zstd.1.5.6" +version = "2.0.13+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a4e40c320c3cb459d9a9ff6de98cff88f4751ee9275d140e2be94a2b74e4c13" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" dependencies = [ "cc", "pkg-config", diff --git a/crates/polars-arrow/src/array/binview/mod.rs b/crates/polars-arrow/src/array/binview/mod.rs index 85595ba4a7a8..f91e5f06e8a9 100644 --- a/crates/polars-arrow/src/array/binview/mod.rs +++ b/crates/polars-arrow/src/array/binview/mod.rs @@ -407,7 +407,7 @@ impl BinaryViewArrayGeneric { let buffers = self.buffers.as_ref(); for view in self.views.as_ref() { - unsafe { mutable.push_view_copied_unchecked(*view, buffers) } + unsafe { mutable.push_view_unchecked(*view, buffers) } } mutable.freeze().with_validity(self.validity) } diff --git a/crates/polars-arrow/src/array/binview/mutable.rs b/crates/polars-arrow/src/array/binview/mutable.rs index a4a9a2da6a52..3258f18052e3 100644 --- a/crates/polars-arrow/src/array/binview/mutable.rs +++ b/crates/polars-arrow/src/array/binview/mutable.rs @@ -144,7 +144,7 @@ impl MutableBinaryViewArray { /// - caller must allocate enough capacity /// - caller must ensure the view and buffers match. /// - The array must not have validity. - pub(crate) unsafe fn push_view_copied_unchecked(&mut self, v: View, buffers: &[Buffer]) { + pub(crate) unsafe fn push_view_unchecked(&mut self, v: View, buffers: &[Buffer]) { let len = v.length; self.total_bytes_len += len as usize; if len <= 12 { @@ -165,7 +165,7 @@ impl MutableBinaryViewArray { /// - caller must ensure the view and buffers match. /// - The array must not have validity. /// - caller must not mix use this function with other push functions. - pub unsafe fn push_view_unchecked(&mut self, mut v: View, buffers: &[Buffer]) { + pub unsafe fn push_view_unchecked_dedupe(&mut self, mut v: View, buffers: &[Buffer]) { let len = v.length; self.total_bytes_len += len as usize; if len <= 12 { @@ -438,14 +438,17 @@ impl MutableBinaryViewArray { /// # Safety /// Same as `push_view_unchecked()`. 
#[inline] - pub unsafe fn extend_non_null_views_trusted_len_unchecked( + pub unsafe fn extend_non_null_views_unchecked_dedupe( &mut self, iterator: I, buffers: &[Buffer], ) where - I: TrustedLen, + I: Iterator, { - self.extend_non_null_views_unchecked(iterator, buffers); + self.reserve(iterator.size_hint().0); + for v in iterator { + self.push_view_unchecked_dedupe(v, buffers); + } } #[inline] diff --git a/crates/polars-arrow/src/array/growable/binview.rs b/crates/polars-arrow/src/array/growable/binview.rs index 9e4c871d596f..ccbefa6d9e4e 100644 --- a/crates/polars-arrow/src/array/growable/binview.rs +++ b/crates/polars-arrow/src/array/growable/binview.rs @@ -1,6 +1,8 @@ use std::ops::Deref; use std::sync::Arc; +use polars_utils::aliases::{InitHashMaps, PlHashSet}; + use super::Growable; use crate::array::binview::{BinaryViewArrayGeneric, ViewType}; use crate::array::growable::utils::{extend_validity, extend_validity_copies, prepare_validity}; @@ -18,6 +20,7 @@ pub struct GrowableBinaryViewArray<'a, T: ViewType + ?Sized> { inner: MutableBinaryViewArray, same_buffers: Option<&'a Arc<[Buffer]>>, total_same_buffers_len: usize, // Only valid if same_buffers is Some. + has_duplicate_buffers: bool, } impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { @@ -51,6 +54,14 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { .then(|| arrays[0].total_buffer_len()) .unwrap_or_default(); + let mut duplicates = PlHashSet::new(); + let mut has_duplicate_buffers = false; + for arr in arrays.iter() { + if !duplicates.insert(arr.data_buffers().as_ptr()) { + has_duplicate_buffers = true; + break; + } + } Self { arrays, data_type, @@ -58,6 +69,7 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> { inner: MutableBinaryViewArray::::with_capacity(capacity), same_buffers, total_same_buffers_len, + has_duplicate_buffers, } } @@ -97,9 +109,12 @@ impl<'a, T: ViewType + ?Sized> Growable<'a> for GrowableBinaryViewArray<'a, T> { .views .extend(views_iter.inspect(|v| total_len += v.length as usize)); self.inner.total_bytes_len += total_len; + } else if self.has_duplicate_buffers { + self.inner + .extend_non_null_views_unchecked_dedupe(views_iter, local_buffers.deref()); } else { self.inner - .extend_non_null_views_trusted_len_unchecked(views_iter, local_buffers.deref()); + .extend_non_null_views_unchecked(views_iter, local_buffers.deref()); } } diff --git a/crates/polars-compute/src/if_then_else/mod.rs b/crates/polars-compute/src/if_then_else/mod.rs index c6c752483330..8265422fb9de 100644 --- a/crates/polars-compute/src/if_then_else/mod.rs +++ b/crates/polars-compute/src/if_then_else/mod.rs @@ -100,7 +100,7 @@ impl IfThenElseKernel for PrimitiveArray { } } -fn if_then_else_validity( +pub fn if_then_else_validity( mask: &Bitmap, if_true: Option<&Bitmap>, if_false: Option<&Bitmap>, diff --git a/crates/polars-compute/src/if_then_else/view.rs b/crates/polars-compute/src/if_then_else/view.rs index 25ff67f401aa..9324fbb8d7a7 100644 --- a/crates/polars-compute/src/if_then_else/view.rs +++ b/crates/polars-compute/src/if_then_else/view.rs @@ -6,6 +6,7 @@ use arrow::array::{Array, BinaryViewArray, MutablePlBinary, Utf8ViewArray, View} use arrow::bitmap::Bitmap; use arrow::buffer::Buffer; use arrow::datatypes::ArrowDataType; +use polars_utils::aliases::{InitHashMaps, PlHashSet}; use super::IfThenElseKernel; use crate::if_then_else::scalar::if_then_else_broadcast_both_scalar_64; @@ -28,12 +29,25 @@ fn make_buffer_and_views( (views, buf) } +fn has_duplicate_buffers(bufs: &[Buffer]) -> bool { + 
let mut has_duplicate_buffers = false; + let mut bufset = PlHashSet::new(); + for buf in bufs { + if !bufset.insert(buf.as_ptr()) { + has_duplicate_buffers = true; + break; + } + } + has_duplicate_buffers +} + impl IfThenElseKernel for BinaryViewArray { type Scalar<'a> = &'a [u8]; fn if_then_else(mask: &Bitmap, if_true: &Self, if_false: &Self) -> Self { let combined_buffers: Arc<_>; let false_buffer_idx_offset: u32; + let mut has_duplicate_bufs = false; if Arc::ptr_eq(if_true.data_buffers(), if_false.data_buffers()) { // Share exact same buffers, no need to combine. combined_buffers = if_true.data_buffers().clone(); @@ -42,7 +56,9 @@ impl IfThenElseKernel for BinaryViewArray { // Put false buffers after true buffers. let true_buffers = if_true.data_buffers().iter().cloned(); let false_buffers = if_false.data_buffers().iter().cloned(); + combined_buffers = true_buffers.chain(false_buffers).collect(); + has_duplicate_bufs = has_duplicate_buffers(&combined_buffers); false_buffer_idx_offset = if_true.data_buffers().len() as u32; } @@ -57,12 +73,19 @@ impl IfThenElseKernel for BinaryViewArray { let validity = super::if_then_else_validity(mask, if_true.validity(), if_false.validity()); let mut builder = MutablePlBinary::with_capacity(views.len()); - unsafe { - builder.extend_non_null_views_trusted_len_unchecked( - views.into_iter(), - combined_buffers.deref(), - ) - }; + + if has_duplicate_bufs { + unsafe { + builder.extend_non_null_views_unchecked_dedupe( + views.into_iter(), + combined_buffers.deref(), + ) + }; + } else { + unsafe { + builder.extend_non_null_views_unchecked(views.into_iter(), combined_buffers.deref()) + }; + } builder .freeze_with_dtype(if_true.data_type().clone()) .with_validity(validity) @@ -90,12 +113,17 @@ impl IfThenElseKernel for BinaryViewArray { let validity = super::if_then_else_validity(mask, None, if_false.validity()); let mut builder = MutablePlBinary::with_capacity(views.len()); + unsafe { - builder.extend_non_null_views_trusted_len_unchecked( - views.into_iter(), - combined_buffers.deref(), - ) - }; + if has_duplicate_buffers(&combined_buffers) { + builder.extend_non_null_views_unchecked_dedupe( + views.into_iter(), + combined_buffers.deref(), + ) + } else { + builder.extend_non_null_views_unchecked(views.into_iter(), combined_buffers.deref()) + } + } builder .freeze_with_dtype(if_false.data_type().clone()) .with_validity(validity) @@ -125,10 +153,14 @@ impl IfThenElseKernel for BinaryViewArray { let mut builder = MutablePlBinary::with_capacity(views.len()); unsafe { - builder.extend_non_null_views_trusted_len_unchecked( - views.into_iter(), - combined_buffers.deref(), - ) + if has_duplicate_buffers(&combined_buffers) { + builder.extend_non_null_views_unchecked_dedupe( + views.into_iter(), + combined_buffers.deref(), + ) + } else { + builder.extend_non_null_views_unchecked(views.into_iter(), combined_buffers.deref()) + } }; builder .freeze_with_dtype(if_true.data_type().clone()) @@ -152,7 +184,11 @@ impl IfThenElseKernel for BinaryViewArray { let mut builder = MutablePlBinary::with_capacity(views.len()); unsafe { - builder.extend_non_null_views_trusted_len_unchecked(views.into_iter(), buffers.deref()) + if has_duplicate_buffers(&buffers) { + builder.extend_non_null_views_unchecked_dedupe(views.into_iter(), buffers.deref()) + } else { + builder.extend_non_null_views_unchecked(views.into_iter(), buffers.deref()) + } }; builder.freeze_with_dtype(dtype) } diff --git a/crates/polars-core/src/chunked_array/ops/full.rs b/crates/polars-core/src/chunked_array/ops/full.rs 
index fce46375666c..790e7a23e6ed 100644 --- a/crates/polars-core/src/chunked_array/ops/full.rs +++ b/crates/polars-core/src/chunked_array/ops/full.rs @@ -1,4 +1,4 @@ -use arrow::bitmap::MutableBitmap; +use arrow::bitmap::{Bitmap, MutableBitmap}; use crate::chunked_array::builder::get_list_builder; use crate::prelude::*; @@ -189,7 +189,9 @@ impl ListChunked { impl ChunkFullNull for StructChunked { fn full_null(name: &str, length: usize) -> StructChunked { let s = vec![Series::new_null("", length)]; - StructChunked::from_series(name, &s).unwrap() + StructChunked::from_series(name, &s) + .unwrap() + .with_outer_validity(Some(Bitmap::new_zeroed(length))) } } diff --git a/crates/polars-core/src/chunked_array/ops/zip.rs b/crates/polars-core/src/chunked_array/ops/zip.rs index 8319c81d9c3c..cf85266581e7 100644 --- a/crates/polars-core/src/chunked_array/ops/zip.rs +++ b/crates/polars-core/src/chunked_array/ops/zip.rs @@ -1,6 +1,6 @@ use arrow::bitmap::Bitmap; use arrow::compute::utils::{combine_validities_and, combine_validities_and_not}; -use polars_compute::if_then_else::IfThenElseKernel; +use polars_compute::if_then_else::{if_then_else_validity, IfThenElseKernel}; #[cfg(feature = "object")] use crate::chunked_array::object::ObjectArray; @@ -62,7 +62,7 @@ fn combine_validities_chunked< impl ChunkZip for ChunkedArray where - T: PolarsDataType, + T: PolarsDataType, T::Array: for<'a> IfThenElseKernel = T::Physical<'a>>, ChunkedArray: ChunkExpandAtIndex, { @@ -206,3 +206,104 @@ impl IfThenElseKernel for ObjectArray { .collect_arr() } } + +#[cfg(feature = "dtype-struct")] +impl ChunkZip for StructChunked { + fn zip_with( + &self, + mask: &BooleanChunked, + other: &ChunkedArray, + ) -> PolarsResult> { + let (l, r, mask) = align_chunks_ternary(self, other, mask); + + // Prepare the boolean arrays such that Null maps to false. + // This prevents every field doing that. + // # SAFETY + // We don't modify the length and update the null count. + let mut mask = mask.into_owned(); + unsafe { + for arr in mask.downcast_iter_mut() { + let bm = bool_null_to_false(arr); + *arr = BooleanArray::from_data_default(bm, None); + } + mask.set_null_count(0); + } + + // Zip all the fields. + let fields = l + .fields_as_series() + .iter() + .zip(r.fields_as_series()) + .map(|(lhs, rhs)| lhs.zip_with_same_type(&mask, &rhs)) + .collect::>>()?; + + let mut out = StructChunked::from_series(self.name(), &fields)?; + + // Zip the validities. + if (l.null_count + r.null_count) > 0 { + let validities = l + .chunks() + .iter() + .zip(r.chunks()) + .map(|(l, r)| (l.validity(), r.validity())); + + fn broadcast(v: Option<&Bitmap>, arr: &ArrayRef) -> Bitmap { + if v.unwrap().get(0).unwrap() { + Bitmap::new_with_value(true, arr.len()) + } else { + Bitmap::new_zeroed(arr.len()) + } + } + + // # SAFETY + // We don't modify the length and update the null count. + unsafe { + for ((arr, (lv, rv)), mask) in out + .chunks_mut() + .iter_mut() + .zip(validities) + .zip(mask.downcast_iter()) + { + // TODO! we can optimize this and use a kernel that is able to broadcast wo/ allocating. 
+ let (lv, rv) = match (lv.map(|b| b.len()), rv.map(|b| b.len())) { + (Some(1), Some(1)) if arr.len() != 1 => { + let lv = broadcast(lv, arr); + let rv = broadcast(rv, arr); + (Some(lv), Some(rv)) + }, + (Some(a), Some(b)) if a == b => (lv.cloned(), rv.cloned()), + (Some(1), _) => { + let lv = broadcast(lv, arr); + (Some(lv), rv.cloned()) + }, + (_, Some(1)) => { + let rv = broadcast(rv, arr); + (lv.cloned(), Some(rv)) + }, + (None, Some(_)) | (Some(_), None) | (None, None) => { + (lv.cloned(), rv.cloned()) + }, + (Some(a), Some(b)) => { + polars_bail!(InvalidOperation: "got different sizes in 'zip' operation, got length: {a} and {b}") + }, + }; + + // broadcast mask + let validity = if mask.len() != arr.len() && mask.len() == 1 { + if mask.get(0).unwrap() { + lv + } else { + rv + } + } else { + if_then_else_validity(mask.values(), lv.as_ref(), rv.as_ref()) + }; + + *arr = arr.with_validity(validity); + } + } + out.compute_len(); + } + Ok(out) + } +} diff --git a/crates/polars-core/src/series/implementations/struct__.rs b/crates/polars-core/src/series/implementations/struct__.rs index a6c775a4245d..9c565bf43b49 100644 --- a/crates/polars-core/src/series/implementations/struct__.rs +++ b/crates/polars-core/src/series/implementations/struct__.rs @@ -56,15 +56,9 @@ impl PrivateSeries for SeriesWrap { #[cfg(feature = "zip_with")] fn zip_with_same_type(&self, mask: &BooleanChunked, other: &Series) -> PolarsResult { - let other = other.struct_()?; - let fields = self - .0 - .fields_as_series() - .iter() - .zip(other.fields_as_series()) - .map(|(lhs, rhs)| lhs.zip_with_same_type(mask, &rhs)) - .collect::>>()?; - StructChunked::from_series(self.0.name(), &fields).map(|ca| ca.into_series()) + self.0 + .zip_with(mask, other.struct_()?) + .map(|ca| ca.into_series()) } #[cfg(feature = "algorithm_group_by")] diff --git a/crates/polars-core/src/series/ops/null.rs b/crates/polars-core/src/series/ops/null.rs index d13ce699cbad..0f46af8065bb 100644 --- a/crates/polars-core/src/series/ops/null.rs +++ b/crates/polars-core/src/series/ops/null.rs @@ -1,3 +1,5 @@ +use arrow::bitmap::Bitmap; + #[cfg(feature = "object")] use crate::chunked_array::object::registry::get_object_builder; use crate::prelude::*; @@ -53,9 +55,14 @@ impl Series { .iter() .map(|fld| Series::full_null(fld.name(), size, fld.data_type())) .collect::>(); - StructChunked::from_series(name, &fields) - .unwrap() - .into_series() + let ca = StructChunked::from_series(name, &fields).unwrap(); + + if !fields.is_empty() { + ca.with_outer_validity(Some(Bitmap::new_zeroed(size))) + .into_series() + } else { + ca.into_series() + } }, DataType::Null => Series::new_null(name, size), DataType::Unknown(kind) => { diff --git a/crates/polars-io/src/parquet/read/mmap.rs b/crates/polars-io/src/parquet/read/mmap.rs index 69ba42ac4c29..c42f8159b057 100644 --- a/crates/polars-io/src/parquet/read/mmap.rs +++ b/crates/polars-io/src/parquet/read/mmap.rs @@ -73,13 +73,7 @@ pub(super) fn to_deserializer<'a>( // Advise fetching the data for the column chunk chunk.prefetch(); - let pages = PageReader::new( - MemReader::new(chunk), - column_meta, - std::sync::Arc::new(|_, _| true), - vec![], - usize::MAX, - ); + let pages = PageReader::new(MemReader::new(chunk), column_meta, vec![], usize::MAX); ( BasicDecompressor::new(pages, vec![]), &column_meta.descriptor().descriptor.primitive_type, diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs index 4c17b7bd2982..2d56f09d0af1 100644 
--- a/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs @@ -1,4 +1,5 @@ use std::default::Default; +use std::mem::MaybeUninit; use std::sync::atomic::{AtomicBool, Ordering}; use arrow::array::specification::try_check_utf8; @@ -11,11 +12,15 @@ use super::super::utils; use super::super::utils::extend_from_decoder; use super::decoders::*; use super::utils::*; +use crate::parquet::encoding::delta_bitpacked::DeltaGatherer; use crate::parquet::encoding::hybrid_rle::gatherer::HybridRleGatherer; use crate::parquet::encoding::hybrid_rle::HybridRleDecoder; +use crate::parquet::encoding::{delta_byte_array, delta_length_byte_array}; use crate::parquet::error::{ParquetError, ParquetResult}; use crate::parquet::page::{DataPage, DictPage}; -use crate::read::deserialize::utils::{Decoder, GatheredHybridRle, StateTranslation}; +use crate::read::deserialize::utils::{ + BatchableCollector, Decoder, GatheredHybridRle, StateTranslation, +}; use crate::read::PrimitiveLogicalType; impl utils::ExactSize for (Binary, MutableBitmap) { @@ -24,6 +29,138 @@ impl utils::ExactSize for (Binary, MutableBitmap) { } } +pub(crate) struct DeltaCollector<'a, 'b, O: Offset> { + pub(crate) decoder: &'b mut delta_length_byte_array::Decoder<'a>, + pub(crate) _pd: std::marker::PhantomData, +} + +pub(crate) struct DeltaBytesCollector<'a, 'b, O: Offset> { + pub(crate) decoder: &'b mut delta_byte_array::Decoder<'a>, + pub(crate) _pd: std::marker::PhantomData, +} + +impl<'a, 'b, O: Offset> DeltaBytesCollector<'a, 'b, O> { + pub fn gather_n_into(&mut self, target: &mut Binary, n: usize) -> ParquetResult<()> { + struct MaybeUninitCollector(usize); + + impl DeltaGatherer for MaybeUninitCollector { + type Target = [MaybeUninit; BATCH_SIZE]; + + fn target_len(&self, _target: &Self::Target) -> usize { + self.0 + } + + fn target_reserve(&self, _target: &mut Self::Target, _n: usize) {} + + fn gather_one(&mut self, target: &mut Self::Target, v: i64) -> ParquetResult<()> { + target[self.0] = MaybeUninit::new(v as usize); + self.0 += 1; + Ok(()) + } + } + + let decoder_len = self.decoder.len(); + let mut n = usize::min(n, decoder_len); + + if n == 0 { + return Ok(()); + } + + target.offsets.reserve(n); + let num_reserve_bytes = if target.offsets.len_proxy() == 0 { + self.decoder.values.len() - self.decoder.offset + } else { + // Make an estimate of how many bytes we will need + target.values.len() / target.offsets.len_proxy() * n + }; + target.values.reserve(num_reserve_bytes); + + const BATCH_SIZE: usize = 4096; + + let mut prefix_lengths = [const { MaybeUninit::::uninit() }; BATCH_SIZE]; + let mut suffix_lengths = [const { MaybeUninit::::uninit() }; BATCH_SIZE]; + + while n > 0 { + let num_elems = usize::min(n, BATCH_SIZE); + n -= num_elems; + + self.decoder.prefix_lengths.gather_n_into( + &mut prefix_lengths, + num_elems, + &mut MaybeUninitCollector(0), + )?; + self.decoder.suffix_lengths.gather_n_into( + &mut suffix_lengths, + num_elems, + &mut MaybeUninitCollector(0), + )?; + + for i in 0..num_elems { + let prefix_length = unsafe { prefix_lengths[i].assume_init() }; + let suffix_length = unsafe { suffix_lengths[i].assume_init() }; + + target + .values + .extend_from_slice(&self.decoder.last[..prefix_length]); + target.values.extend_from_slice( + &self.decoder.values[self.decoder.offset..self.decoder.offset + suffix_length], + ); + + self.decoder.last.clear(); + self.decoder.last.extend_from_slice( + &target.values[target.values.len() - prefix_length - 
suffix_length..], + ); + + self.decoder.offset += suffix_length; + } + } + + Ok(()) + } +} + +impl<'a, 'b, O: Offset> BatchableCollector<(), Binary> for DeltaCollector<'a, 'b, O> { + fn reserve(target: &mut Binary, n: usize) { + target.offsets.reserve(n); + } + + fn push_n(&mut self, target: &mut Binary, n: usize) -> ParquetResult<()> { + let start = target.offsets.last().to_usize(); + let mut gatherer = OffsetGatherer::default(); + self.decoder + .lengths + .gather_n_into(&mut target.offsets, n, &mut gatherer)?; + let end = target.offsets.last().to_usize(); + + target.values.extend_from_slice( + &self.decoder.values[self.decoder.offset..self.decoder.offset + end - start], + ); + self.decoder.offset += end - start; + + Ok(()) + } + + fn push_n_nulls(&mut self, target: &mut Binary, n: usize) -> ParquetResult<()> { + target.extend_constant(n); + Ok(()) + } +} + +impl<'a, 'b, O: Offset> BatchableCollector<(), Binary> for DeltaBytesCollector<'a, 'b, O> { + fn reserve(target: &mut Binary, n: usize) { + target.offsets.reserve(n); + } + + fn push_n(&mut self, target: &mut Binary, n: usize) -> ParquetResult<()> { + self.gather_n_into(target, n) + } + + fn push_n_nulls(&mut self, target: &mut Binary, n: usize) -> ParquetResult<()> { + target.extend_constant(n); + Ok(()) + } +} + impl<'a, O: Offset> StateTranslation<'a, BinaryDecoder> for BinaryStateTranslation<'a> { type PlainDecoder = BinaryIter<'a>; @@ -73,53 +210,42 @@ impl<'a, O: Offset> StateTranslation<'a, BinaryDecoder> for BinaryStateTransl page.dict, additional, )?, - T::Delta(page) => { + T::Delta(ref mut page) => { let (values, validity) = decoded; + let mut collector = DeltaCollector { + decoder: page, + _pd: std::marker::PhantomData, + }; + match page_validity { - None => values - .extend_lengths(page.lengths.by_ref().take(additional), &mut page.values), - Some(page_validity) => { - let Binary { - offsets, - values: values_, - } = values; - - let last_offset = *offsets.last(); - extend_from_decoder( - validity, - page_validity, - Some(additional), - offsets, - page.lengths.by_ref(), - )?; - - let length = *offsets.last() - last_offset; - - let (consumed, remaining) = page.values.split_at(length.to_usize()); - page.values = remaining; - values_.extend_from_slice(consumed); - }, + None => collector.push_n(values, additional)?, + Some(page_validity) => extend_from_decoder( + validity, + page_validity, + Some(additional), + values, + collector, + )?, } }, - T::DeltaBytes(page_values) => { + T::DeltaBytes(ref mut page_values) => { + let mut collector = DeltaBytesCollector { + decoder: page_values, + _pd: std::marker::PhantomData, + }; + let (values, validity) = decoded; match page_validity { - None => { - for x in page_values.take(additional) { - values.push(x) - } - }, - Some(page_validity) => { - extend_from_decoder( - validity, - page_validity, - Some(additional), - values, - page_values, - )?; - }, + None => collector.push_n(values, additional)?, + Some(page_validity) => extend_from_decoder( + validity, + page_validity, + Some(additional), + values, + collector, + )?, } }, } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binary/decoders.rs b/crates/polars-parquet/src/arrow/read/deserialize/binary/decoders.rs index 27ee2a9a251c..53c25d8050b9 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binary/decoders.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binary/decoders.rs @@ -1,122 +1,20 @@ use arrow::array::specification::try_check_utf8; use arrow::array::{BinaryArray, MutableBinaryValuesArray}; +use 
arrow::offset::Offsets; +use arrow::types::Offset; use polars_error::PolarsResult; use super::super::utils; use super::utils::*; -use crate::parquet::encoding::{delta_bitpacked, delta_length_byte_array, hybrid_rle, Encoding}; -use crate::parquet::error::ParquetResult; +use crate::parquet::encoding::{ + delta_bitpacked, delta_byte_array, delta_length_byte_array, hybrid_rle, Encoding, +}; +use crate::parquet::error::{ParquetError, ParquetResult}; use crate::parquet::page::{split_buffer, DataPage}; use crate::read::deserialize::utils::PageValidity; pub(crate) type BinaryDict = BinaryArray; -#[derive(Debug)] -pub(crate) struct Delta<'a> { - pub lengths: std::vec::IntoIter, - pub values: &'a [u8], -} - -impl<'a> Delta<'a> { - pub fn try_new(page: &'a DataPage) -> PolarsResult { - let values = split_buffer(page)?.values; - - let mut lengths_iter = delta_length_byte_array::Decoder::try_new(values)?; - - #[allow(clippy::needless_collect)] // we need to consume it to get the values - let lengths = lengths_iter - .by_ref() - .map(|x| x.map(|x| x as usize)) - .collect::>>()?; - - let values = lengths_iter.into_values(); - Ok(Self { - lengths: lengths.into_iter(), - values, - }) - } - - pub fn len(&self) -> usize { - self.lengths.size_hint().0 - } -} - -impl<'a> Iterator for Delta<'a> { - type Item = &'a [u8]; - - #[inline] - fn next(&mut self) -> Option { - let length = self.lengths.next()?; - let (item, remaining) = self.values.split_at(length); - self.values = remaining; - Some(item) - } - - fn size_hint(&self) -> (usize, Option) { - self.lengths.size_hint() - } -} - -#[derive(Debug)] -pub(crate) struct DeltaBytes<'a> { - prefix: std::vec::IntoIter, - suffix: std::vec::IntoIter, - data: &'a [u8], - data_offset: usize, - last_value: Vec, -} - -impl<'a> DeltaBytes<'a> { - pub fn try_new(page: &'a DataPage) -> PolarsResult { - let values = split_buffer(page)?.values; - let mut decoder = delta_bitpacked::Decoder::try_new(values)?; - let prefix = (&mut decoder) - .take(page.num_values()) - .map(|r| r.map(|v| v as i32).unwrap()) - .collect::>(); - - let mut data_offset = decoder.consumed_bytes(); - let mut decoder = delta_bitpacked::Decoder::try_new(&values[decoder.consumed_bytes()..])?; - let suffix = (&mut decoder) - .map(|r| r.map(|v| v as i32).unwrap()) - .collect::>(); - data_offset += decoder.consumed_bytes(); - - Ok(Self { - prefix: prefix.into_iter(), - suffix: suffix.into_iter(), - data: values, - data_offset, - last_value: vec![], - }) - } -} - -impl<'a> Iterator for DeltaBytes<'a> { - type Item = &'a [u8]; - - #[inline] - fn next(&mut self) -> Option { - let prefix_len = self.prefix.next()? as usize; - let suffix_len = self.suffix.next()? as usize; - - self.last_value.truncate(prefix_len); - self.last_value - .extend_from_slice(&self.data[self.data_offset..self.data_offset + suffix_len]); - self.data_offset += suffix_len; - - // SAFETY: the consumer will only keep one value around per iteration. - // We need a different API for this to work with safe code. 
- let extend_lifetime = - unsafe { std::mem::transmute::<&[u8], &'a [u8]>(self.last_value.as_slice()) }; - Some(extend_lifetime) - } - - fn size_hint(&self) -> (usize, Option) { - self.prefix.size_hint() - } -} - #[derive(Debug)] pub(crate) struct ValuesDictionary<'a> { pub values: hybrid_rle::HybridRleDecoder<'a>, @@ -136,12 +34,13 @@ impl<'a> ValuesDictionary<'a> { } } +#[allow(clippy::large_enum_variant)] #[derive(Debug)] pub(crate) enum BinaryStateTranslation<'a> { Plain(BinaryIter<'a>), Dictionary(ValuesDictionary<'a>), - Delta(Delta<'a>), - DeltaBytes(DeltaBytes<'a>), + Delta(delta_length_byte_array::Decoder<'a>), + DeltaBytes(delta_byte_array::Decoder<'a>), } impl<'a> BinaryStateTranslation<'a> { @@ -167,11 +66,17 @@ impl<'a> BinaryStateTranslation<'a> { Ok(BinaryStateTranslation::Plain(values)) }, (Encoding::DeltaLengthByteArray, _) => { - Ok(BinaryStateTranslation::Delta(Delta::try_new(page)?)) + let values = split_buffer(page)?.values; + Ok(BinaryStateTranslation::Delta( + delta_length_byte_array::Decoder::try_new(values)?, + )) + }, + (Encoding::DeltaByteArray, _) => { + let values = split_buffer(page)?.values; + Ok(BinaryStateTranslation::DeltaBytes( + delta_byte_array::Decoder::try_new(values)?, + )) }, - (Encoding::DeltaByteArray, _) => Ok(BinaryStateTranslation::DeltaBytes( - DeltaBytes::try_new(page)?, - )), _ => Err(utils::not_implemented(page)), } } @@ -180,7 +85,7 @@ impl<'a> BinaryStateTranslation<'a> { Self::Plain(v) => v.len_when_not_nullable(), Self::Dictionary(v) => v.len(), Self::Delta(v) => v.len(), - Self::DeltaBytes(v) => v.size_hint().0, + Self::DeltaBytes(v) => v.len(), } } @@ -192,8 +97,8 @@ impl<'a> BinaryStateTranslation<'a> { match self { Self::Plain(t) => _ = t.by_ref().nth(n - 1), Self::Dictionary(t) => t.values.skip_in_place(n)?, - Self::Delta(t) => _ = t.by_ref().nth(n - 1), - Self::DeltaBytes(t) => _ = t.by_ref().nth(n - 1), + Self::Delta(t) => t.skip_in_place(n)?, + Self::DeltaBytes(t) => t.skip_in_place(n)?, } Ok(()) @@ -211,3 +116,35 @@ pub(crate) fn deserialize_plain(values: &[u8], num_values: usize) -> BinaryDict dict_values.into() } + +#[derive(Default)] +pub(crate) struct OffsetGatherer { + _pd: std::marker::PhantomData, +} + +impl delta_bitpacked::DeltaGatherer for OffsetGatherer { + type Target = Offsets; + + fn target_len(&self, target: &Self::Target) -> usize { + target.len() + } + + fn target_reserve(&self, target: &mut Self::Target, n: usize) { + target.reserve(n); + } + + fn gather_one(&mut self, target: &mut Self::Target, v: i64) -> ParquetResult<()> { + target.try_push(v.try_into().unwrap()).unwrap(); + Ok(()) + } + fn gather_slice(&mut self, target: &mut Self::Target, slice: &[i64]) -> ParquetResult<()> { + target + .try_extend_from_lengths(slice.iter().copied().map(|i| i.try_into().unwrap())) + .map_err(|_| ParquetError::oos("Invalid length in delta encoding")) + } + fn gather_chunk(&mut self, target: &mut Self::Target, chunk: &[i64; 64]) -> ParquetResult<()> { + target + .try_extend_from_lengths(chunk.iter().copied().map(|i| i.try_into().unwrap())) + .map_err(|_| ParquetError::oos("Invalid length in delta encoding")) + } +} diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binary/utils.rs b/crates/polars-parquet/src/arrow/read/deserialize/binary/utils.rs index 57e263d93587..65ea8aa54cc6 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binary/utils.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binary/utils.rs @@ -40,17 +40,6 @@ impl Binary { pub fn len(&self) -> usize { self.offsets.len_proxy() } 
- - #[inline] - pub fn extend_lengths>(&mut self, lengths: I, values: &mut &[u8]) { - let current_offset = *self.offsets.last(); - self.offsets.try_extend_from_lengths(lengths).unwrap(); - let new_offset = *self.offsets.last(); - let length = new_offset.to_usize() - current_offset.to_usize(); - let (consumed, remaining) = values.split_at(length); - *values = remaining; - self.values.extend_from_slice(consumed); - } } impl<'a, O: Offset> Pushable<&'a [u8]> for Binary { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binview.rs b/crates/polars-parquet/src/arrow/read/deserialize/binview.rs index c9d2f6486017..bf6f4bf97f1d 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binview.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binview.rs @@ -1,3 +1,4 @@ +use std::mem::MaybeUninit; use std::sync::atomic::{AtomicBool, Ordering}; use arrow::array::{ @@ -8,8 +9,10 @@ use arrow::bitmap::MutableBitmap; use arrow::datatypes::{ArrowDataType, PhysicalType}; use super::binary::decoders::*; -use super::utils::freeze_validity; +use super::utils::{freeze_validity, BatchableCollector}; +use crate::parquet::encoding::delta_bitpacked::DeltaGatherer; use crate::parquet::encoding::hybrid_rle::{self, DictionaryTranslator}; +use crate::parquet::encoding::{delta_byte_array, delta_length_byte_array}; use crate::parquet::error::{ParquetError, ParquetResult}; use crate::parquet::page::{DataPage, DictPage}; use crate::read::deserialize::binary::utils::BinaryIter; @@ -82,39 +85,39 @@ impl<'a> StateTranslation<'a, BinViewDecoder> for BinaryStateTranslation<'a> { // Already done in decode_plain_encoded validate_utf8 = false; }, - Self::Delta(page_values) => { + Self::Delta(ref mut page_values) => { let (values, validity) = decoded; + + let mut collector = DeltaCollector { + decoder: page_values, + }; + match page_validity { - None => { - for value in page_values.by_ref().take(additional) { - values.push_value_ignore_validity(value) - } - }, - Some(page_validity) => { - extend_from_decoder( - validity, - page_validity, - Some(additional), - values, - page_values, - )?; - }, + None => collector.push_n(values, additional)?, + Some(page_validity) => extend_from_decoder( + validity, + page_validity, + Some(additional), + values, + collector, + )?, } }, - Self::DeltaBytes(page_values) => { + Self::DeltaBytes(ref mut page_values) => { let (values, validity) = decoded; + + let mut collector = DeltaBytesCollector { + decoder: page_values, + }; + match page_validity { - None => { - for x in page_values.take(additional) { - values.push_value_ignore_validity(x) - } - }, + None => collector.push_n(values, additional)?, Some(page_validity) => extend_from_decoder( validity, page_validity, Some(additional), values, - page_values, + collector, )?, } }, @@ -143,6 +146,153 @@ impl utils::ExactSize for DecodedStateTuple { } } +pub(crate) struct DeltaCollector<'a, 'b> { + pub(crate) decoder: &'b mut delta_length_byte_array::Decoder<'a>, +} + +pub(crate) struct DeltaBytesCollector<'a, 'b> { + pub(crate) decoder: &'b mut delta_byte_array::Decoder<'a>, +} + +pub(crate) struct ViewGatherer<'a, 'b> { + values: &'a [u8], + offset: &'b mut usize, +} + +impl<'a, 'b> DeltaGatherer for ViewGatherer<'a, 'b> { + type Target = MutableBinaryViewArray<[u8]>; + + fn target_len(&self, target: &Self::Target) -> usize { + target.len() + } + + fn target_reserve(&self, target: &mut Self::Target, n: usize) { + target.views_mut().reserve(n) + } + + fn gather_one(&mut self, target: &mut Self::Target, v: i64) -> ParquetResult<()> 
{ + let v = v as usize; + let s = &self.values[*self.offset..*self.offset + v]; + *self.offset += v; + target.push(Some(s)); + Ok(()) + } +} + +impl<'a, 'b> BatchableCollector<(), MutableBinaryViewArray<[u8]>> for DeltaCollector<'a, 'b> { + fn reserve(target: &mut MutableBinaryViewArray<[u8]>, n: usize) { + target.views_mut().reserve(n); + } + + fn push_n(&mut self, target: &mut MutableBinaryViewArray<[u8]>, n: usize) -> ParquetResult<()> { + let mut gatherer = ViewGatherer { + values: self.decoder.values, + offset: &mut self.decoder.offset, + }; + self.decoder + .lengths + .gather_n_into(target, n, &mut gatherer)?; + + Ok(()) + } + + fn push_n_nulls( + &mut self, + target: &mut MutableBinaryViewArray<[u8]>, + n: usize, + ) -> ParquetResult<()> { + target.extend_constant(n, >::None); + Ok(()) + } +} + +impl<'a, 'b> BatchableCollector<(), MutableBinaryViewArray<[u8]>> for DeltaBytesCollector<'a, 'b> { + fn reserve(target: &mut MutableBinaryViewArray<[u8]>, n: usize) { + target.views_mut().reserve(n); + } + + fn push_n(&mut self, target: &mut MutableBinaryViewArray<[u8]>, n: usize) -> ParquetResult<()> { + struct MaybeUninitCollector(usize); + + impl DeltaGatherer for MaybeUninitCollector { + type Target = [MaybeUninit; BATCH_SIZE]; + + fn target_len(&self, _target: &Self::Target) -> usize { + self.0 + } + + fn target_reserve(&self, _target: &mut Self::Target, _n: usize) {} + + fn gather_one(&mut self, target: &mut Self::Target, v: i64) -> ParquetResult<()> { + target[self.0] = MaybeUninit::new(v as usize); + self.0 += 1; + Ok(()) + } + } + + let decoder_len = self.decoder.len(); + let mut n = usize::min(n, decoder_len); + + if n == 0 { + return Ok(()); + } + + let mut buffer = Vec::new(); + target.views_mut().reserve(n); + + const BATCH_SIZE: usize = 4096; + + let mut prefix_lengths = [const { MaybeUninit::::uninit() }; BATCH_SIZE]; + let mut suffix_lengths = [const { MaybeUninit::::uninit() }; BATCH_SIZE]; + + while n > 0 { + let num_elems = usize::min(n, BATCH_SIZE); + n -= num_elems; + + self.decoder.prefix_lengths.gather_n_into( + &mut prefix_lengths, + num_elems, + &mut MaybeUninitCollector(0), + )?; + self.decoder.suffix_lengths.gather_n_into( + &mut suffix_lengths, + num_elems, + &mut MaybeUninitCollector(0), + )?; + + for i in 0..num_elems { + let prefix_length = unsafe { prefix_lengths[i].assume_init() }; + let suffix_length = unsafe { suffix_lengths[i].assume_init() }; + + buffer.clear(); + + buffer.extend_from_slice(&self.decoder.last[..prefix_length]); + buffer.extend_from_slice( + &self.decoder.values[self.decoder.offset..self.decoder.offset + suffix_length], + ); + + target.push_value(&buffer); + + self.decoder.last.clear(); + std::mem::swap(&mut self.decoder.last, &mut buffer); + + self.decoder.offset += suffix_length; + } + } + + Ok(()) + } + + fn push_n_nulls( + &mut self, + target: &mut MutableBinaryViewArray<[u8]>, + n: usize, + ) -> ParquetResult<()> { + target.extend_constant(n, >::None); + Ok(()) + } +} + impl utils::Decoder for BinViewDecoder { type Translation<'a> = BinaryStateTranslation<'a>; type Dict = BinaryDict; diff --git a/crates/polars-parquet/src/arrow/read/deserialize/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/mod.rs index 28c97f82ccf1..1e36127eef97 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/mod.rs @@ -29,14 +29,12 @@ use crate::parquet::schema::types::PrimitiveType; pub fn get_page_iterator( column_metadata: &ColumnChunkMetaData, reader: MemReader, - 
pages_filter: Option, buffer: Vec, max_header_size: usize, ) -> PolarsResult { Ok(_get_page_iterator( column_metadata, reader, - pages_filter, buffer, max_header_size, )?) diff --git a/crates/polars-parquet/src/arrow/read/deserialize/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/nested.rs index b82abec09996..e200d3c1a8da 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/nested.rs @@ -50,7 +50,7 @@ pub fn columns_to_iter_recursive( PageNestedDecoder::new( columns.pop().unwrap(), field.data_type().clone(), - primitive::PrimitiveDecoder::::cast_as(), + primitive::IntDecoder::::cast_as(), init, )? .collect_n(filter) @@ -62,7 +62,7 @@ pub fn columns_to_iter_recursive( PageNestedDecoder::new( columns.pop().unwrap(), field.data_type().clone(), - primitive::PrimitiveDecoder::::cast_as(), + primitive::IntDecoder::::cast_as(), init, )? .collect_n(filter) @@ -74,7 +74,7 @@ pub fn columns_to_iter_recursive( PageNestedDecoder::new( columns.pop().unwrap(), field.data_type().clone(), - primitive::PrimitiveDecoder::::unit(), + primitive::IntDecoder::::unit(), init, )? .collect_n(filter) @@ -86,7 +86,7 @@ pub fn columns_to_iter_recursive( PageNestedDecoder::new( columns.pop().unwrap(), field.data_type().clone(), - primitive::PrimitiveDecoder::::unit(), + primitive::IntDecoder::::unit(), init, )? .collect_n(filter) @@ -98,7 +98,7 @@ pub fn columns_to_iter_recursive( PageNestedDecoder::new( columns.pop().unwrap(), field.data_type().clone(), - primitive::PrimitiveDecoder::::cast_as(), + primitive::IntDecoder::::cast_as(), init, )? .collect_n(filter) @@ -110,7 +110,7 @@ pub fn columns_to_iter_recursive( PageNestedDecoder::new( columns.pop().unwrap(), field.data_type().clone(), - primitive::PrimitiveDecoder::::cast_as(), + primitive::IntDecoder::::cast_as(), init, )? .collect_n(filter) @@ -123,7 +123,7 @@ pub fn columns_to_iter_recursive( PhysicalType::Int32 => PageNestedDecoder::new( columns.pop().unwrap(), field.data_type().clone(), - primitive::PrimitiveDecoder::::cast_as(), + primitive::IntDecoder::::cast_as(), init, )? .collect_n(filter) @@ -132,7 +132,7 @@ pub fn columns_to_iter_recursive( PhysicalType::Int64 => PageNestedDecoder::new( columns.pop().unwrap(), field.data_type().clone(), - primitive::PrimitiveDecoder::::cast_as(), + primitive::IntDecoder::::cast_as(), init, )? .collect_n(filter) @@ -150,7 +150,7 @@ pub fn columns_to_iter_recursive( PageNestedDecoder::new( columns.pop().unwrap(), field.data_type().clone(), - primitive::PrimitiveDecoder::::cast_as(), + primitive::IntDecoder::::cast_as(), init, )? .collect_n(filter) @@ -244,7 +244,7 @@ pub fn columns_to_iter_recursive( PhysicalType::Int32 => PageNestedDecoder::new( columns.pop().unwrap(), field.data_type.clone(), - primitive::PrimitiveDecoder::::cast_into(), + primitive::IntDecoder::::cast_into(), init, )? .collect_n(filter) @@ -252,7 +252,7 @@ pub fn columns_to_iter_recursive( PhysicalType::Int64 => PageNestedDecoder::new( columns.pop().unwrap(), field.data_type.clone(), - primitive::PrimitiveDecoder::::cast_into(), + primitive::IntDecoder::::cast_into(), init, )? .collect_n(filter) @@ -302,7 +302,7 @@ pub fn columns_to_iter_recursive( PhysicalType::Int32 => PageNestedDecoder::new( columns.pop().unwrap(), field.data_type.clone(), - primitive::PrimitiveDecoder::closure(|x: i32| i256(I256::new(x as i128))), + primitive::IntDecoder::closure(|x: i32| i256(I256::new(x as i128))), init, )? 
.collect_n(filter) @@ -310,7 +310,7 @@ pub fn columns_to_iter_recursive( PhysicalType::Int64 => PageNestedDecoder::new( columns.pop().unwrap(), field.data_type.clone(), - primitive::PrimitiveDecoder::closure(|x: i64| i256(I256::new(x as i128))), + primitive::IntDecoder::closure(|x: i64| i256(I256::new(x as i128))), init, )? .collect_n(filter) @@ -481,68 +481,52 @@ fn dict_read( }; Ok(match values_data_type.to_logical_type() { - UInt8 => { - PageNestedDecoder::new( - iter, - data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::cast_as(), - ), - init, - )? - .collect_n(filter)? - }, + UInt8 => PageNestedDecoder::new( + iter, + data_type, + dictionary::DictionaryDecoder::new(primitive::IntDecoder::::cast_as()), + init, + )? + .collect_n(filter)?, UInt16 => PageNestedDecoder::new( iter, data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::cast_as(), - ), + dictionary::DictionaryDecoder::new(primitive::IntDecoder::::cast_as()), init, )? .collect_n(filter)?, UInt32 => PageNestedDecoder::new( iter, data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::cast_as(), - ), + dictionary::DictionaryDecoder::new(primitive::IntDecoder::::cast_as()), + init, + )? + .collect_n(filter)?, + Int8 => PageNestedDecoder::new( + iter, + data_type, + dictionary::DictionaryDecoder::new(primitive::IntDecoder::::cast_as()), init, )? .collect_n(filter)?, - Int8 => { - PageNestedDecoder::new( - iter, - data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::cast_as(), - ), - init, - )? - .collect_n(filter)? - }, Int16 => PageNestedDecoder::new( iter, data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::cast_as(), - ), + dictionary::DictionaryDecoder::new(primitive::IntDecoder::::cast_as()), init, )? .collect_n(filter)?, Int32 | Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => PageNestedDecoder::new( iter, data_type, - dictionary::DictionaryDecoder::new(primitive::PrimitiveDecoder::::unit()), + dictionary::DictionaryDecoder::new(primitive::IntDecoder::::unit()), init, )? .collect_n(filter)?, Int64 | Date64 | Time64(_) | Duration(_) => PageNestedDecoder::new( iter, data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::cast_as(), - ), + dictionary::DictionaryDecoder::new(primitive::IntDecoder::::cast_as()), init, )? 
.collect_n(filter)?, diff --git a/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs index 10aeb5b9f640..45518947e0a1 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs @@ -2,13 +2,13 @@ use arrow::array::{DictionaryArray, DictionaryKey, PrimitiveArray}; use arrow::bitmap::MutableBitmap; use arrow::datatypes::ArrowDataType; use arrow::types::NativeType; -use num_traits::AsPrimitive; use super::super::utils; use super::basic::{ AsDecoderFunction, ClosureDecoderFunction, DecoderFunction, IntoDecoderFunction, PlainDecoderFnCollector, PrimitiveDecoder, UnitDecoderFunction, ValuesDictionary, }; +use super::{DeltaCollector, DeltaTranslator}; use crate::parquet::encoding::hybrid_rle::{self, DictionaryTranslator}; use crate::parquet::encoding::{byte_stream_split, delta_bitpacked, Encoding}; use crate::parquet::error::ParquetResult; @@ -61,9 +61,9 @@ where }, (Encoding::DeltaBinaryPacked, _) => { let values = split_buffer(page)?.values; - Ok(Self::DeltaBinaryPacked(delta_bitpacked::Decoder::try_new( - values, - )?)) + Ok(Self::DeltaBinaryPacked( + delta_bitpacked::Decoder::try_new(values)?.0, + )) }, _ => Err(utils::not_implemented(page)), } @@ -74,7 +74,7 @@ where Self::Plain(v) => v.len(), Self::Dictionary(v) => v.len(), Self::ByteStreamSplit(v) => v.len(), - Self::DeltaBinaryPacked(v) => v.size_hint().0, + Self::DeltaBinaryPacked(v) => v.len(), } } @@ -87,7 +87,7 @@ where Self::Plain(v) => _ = v.nth(n - 1), Self::Dictionary(v) => v.values.skip_in_place(n)?, Self::ByteStreamSplit(v) => _ = v.iter_converted(|_| ()).nth(n - 1), - Self::DeltaBinaryPacked(v) => _ = v.nth(n - 1), + Self::DeltaBinaryPacked(v) => v.skip_in_place(n)?, } Ok(()) @@ -140,23 +140,22 @@ where Self::DeltaBinaryPacked(page_values) => { let (values, validity) = decoded; + let mut gatherer = DeltaTranslator { + dfn: decoder.0.decoder, + _pd: std::marker::PhantomData, + }; + match page_validity { - None => { - values.extend( - page_values - .by_ref() - .map(|x| decoder.0.decoder.decode(x.unwrap().as_())) - .take(additional), - ); - }, + None => page_values.gather_n_into(values, additional, &mut gatherer)?, Some(page_validity) => utils::extend_from_decoder( validity, page_validity, Some(additional), values, - &mut page_values - .by_ref() - .map(|x| decoder.0.decoder.decode(x.unwrap().as_())), + DeltaCollector { + decoder: page_values, + gatherer, + }, )?, } }, diff --git a/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs index c13dfa88bc3e..45c95a7d5ee1 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs @@ -1,5 +1,112 @@ +use arrow::types::NativeType; +use num_traits::AsPrimitive; + +use crate::parquet::types::NativeType as ParquetNativeType; + mod basic; mod integer; pub(crate) use basic::PrimitiveDecoder; pub(crate) use integer::IntDecoder; + +use self::basic::DecoderFunction; +use super::utils::BatchableCollector; +use super::ParquetResult; +use crate::parquet::encoding::delta_bitpacked::{self, DeltaGatherer}; + +struct DeltaTranslator +where + T: NativeType, + P: ParquetNativeType, + i64: AsPrimitive

<P>,
+    D: DecoderFunction<P, T>,
+{
+    dfn: D,
+    _pd: std::marker::PhantomData<(P, T)>,
+}
+
+struct DeltaCollector<'a, 'b, P, T, D>
+where
+    T: NativeType,
+    P: ParquetNativeType,
+    i64: AsPrimitive<P>,
+    D: DecoderFunction<P, T>,
+{
+    decoder: &'b mut delta_bitpacked::Decoder<'a>,
+    gatherer: DeltaTranslator<P, T, D>,
+}
+
+impl<P, T, D> DeltaGatherer for DeltaTranslator<P, T, D>
+where
+    T: NativeType,
+    P: ParquetNativeType,
+    i64: AsPrimitive<P>,
+    D: DecoderFunction<P, T>,
+{
+    type Target = Vec<T>;
+
+    fn target_len(&self, target: &Self::Target) -> usize {
+        target.len()
+    }
+
+    fn target_reserve(&self, target: &mut Self::Target, n: usize) {
+        target.reserve(n);
+    }
+
+    fn gather_one(&mut self, target: &mut Self::Target, v: i64) -> ParquetResult<()> {
+        target.push(self.dfn.decode(v.as_()));
+        Ok(())
+    }
+
+    fn gather_constant(
+        &mut self,
+        target: &mut Self::Target,
+        v: i64,
+        delta: i64,
+        num_repeats: usize,
+    ) -> ParquetResult<()> {
+        target.extend((0..num_repeats).map(|i| self.dfn.decode((v + (i as i64) * delta).as_())));
+        Ok(())
+    }
+
+    fn gather_slice(&mut self, target: &mut Self::Target, slice: &[i64]) -> ParquetResult<()> {
+        target.extend(slice.iter().copied().map(|v| self.dfn.decode(v.as_())));
+        Ok(())
+    }
+
+    fn gather_chunk(&mut self, target: &mut Self::Target, chunk: &[i64; 64]) -> ParquetResult<()> {
+        target.extend(chunk.iter().copied().map(|v| self.dfn.decode(v.as_())));
+        Ok(())
+    }
+}
+
+impl<'a, 'b, P, T, D> BatchableCollector<(), Vec<T>> for DeltaCollector<'a, 'b, P, T, D>
+where
+    T: NativeType,
+    P: ParquetNativeType,
+    i64: AsPrimitive<P>,
+    D: DecoderFunction<P, T>,
+{
+    fn reserve(target: &mut Vec<T>, n: usize) {
+        target.reserve(n);
+    }
+
+    fn push_n(&mut self, target: &mut Vec<T>, n: usize) -> ParquetResult<()> {
+        let start_length = target.len();
+        let start_num_elems = self.decoder.len();
+
+        self.decoder.gather_n_into(target, n, &mut self.gatherer)?;
+
+        let consumed_elements = usize::min(n, start_num_elems);
+
+        debug_assert_eq!(self.decoder.len(), start_num_elems - consumed_elements);
+        debug_assert_eq!(target.len(), start_length + consumed_elements);
+
+        Ok(())
+    }
+
+    fn push_n_nulls(&mut self, target: &mut Vec<T>, n: usize) -> ParquetResult<()> {
+        target.resize(target.len() + n, T::default());
+        Ok(())
+    }
+}
diff --git a/crates/polars-parquet/src/arrow/read/mod.rs b/crates/polars-parquet/src/arrow/read/mod.rs
index fff6987f4f1a..a0993d7c81b4 100644
--- a/crates/polars-parquet/src/arrow/read/mod.rs
+++ b/crates/polars-parquet/src/arrow/read/mod.rs
@@ -35,7 +35,7 @@ pub use crate::parquet::{
     read::{
         decompress, get_column_iterator, read_columns_indexes as _read_columns_indexes,
         read_metadata as _read_metadata, read_pages_locations, BasicDecompressor,
-        MutStreamingIterator, PageFilter, PageReader, ReadColumnIterator, State,
+        MutStreamingIterator, PageReader, ReadColumnIterator, State,
     },
     schema::types::{
         GroupLogicalType, ParquetType, PhysicalType, PrimitiveConvertedType, PrimitiveLogicalType,
diff --git a/crates/polars-parquet/src/arrow/read/row_group.rs b/crates/polars-parquet/src/arrow/read/row_group.rs
index 0156569b4cd9..990814c356c0 100644
--- a/crates/polars-parquet/src/arrow/read/row_group.rs
+++ b/crates/polars-parquet/src/arrow/read/row_group.rs
@@ -146,7 +146,6 @@ pub fn to_deserializer<'a>(
     let pages = PageReader::new(
         MemReader::from_vec(chunk),
         column_meta,
-        std::sync::Arc::new(|_, _| true),
         vec![],
         len * 2 + 1024,
     );
diff --git a/crates/polars-parquet/src/arrow/write/binary/basic.rs b/crates/polars-parquet/src/arrow/write/binary/basic.rs
index e675986d81cc..c977a4e4939c 100644
--- a/crates/polars-parquet/src/arrow/write/binary/basic.rs
+++ b/crates/polars-parquet/src/arrow/write/binary/basic.rs
@@ -126,14 +126,14 @@ pub(crate) fn encode_delta(
             let length = offsets.len() - 1 - validity.unset_bits();
             let lengths = utils::ExactSizedIter::new(lengths, length);
-            delta_bitpacked::encode(lengths, buffer);
+            delta_bitpacked::encode(lengths, buffer, 1);
         } else {
             let lengths = offsets.windows(2).map(|w| (w[1] - w[0]).to_usize() as i64);
-            delta_bitpacked::encode(lengths, buffer);
+            delta_bitpacked::encode(lengths, buffer, 1);
         }
     } else {
         let lengths = offsets.windows(2).map(|w| (w[1] - w[0]).to_usize() as i64);
-        delta_bitpacked::encode(lengths, buffer);
+        delta_bitpacked::encode(lengths, buffer, 1);
     }

     buffer.extend_from_slice(
diff --git a/crates/polars-parquet/src/arrow/write/binview/basic.rs b/crates/polars-parquet/src/arrow/write/binview/basic.rs
index 2516e03e667c..c7059b63c99e 100644
--- a/crates/polars-parquet/src/arrow/write/binview/basic.rs
+++ b/crates/polars-parquet/src/arrow/write/binview/basic.rs
@@ -23,8 +23,11 @@ pub(crate) fn encode_plain(array: &BinaryViewArray, buffer: &mut Vec<u8>) {
 }

 pub(crate) fn encode_delta(array: &BinaryViewArray, buffer: &mut Vec<u8>) {
-    let lengths = array.non_null_views_iter().map(|v| v.length as i64);
-    delta_bitpacked::encode(lengths, buffer);
+    let lengths = utils::ExactSizedIter::new(
+        array.non_null_views_iter().map(|v| v.length as i64),
+        array.len() - array.null_count(),
+    );
+    delta_bitpacked::encode(lengths, buffer, 1);

     for slice in array.non_null_values_iter() {
buffer.extend_from_slice(slice) diff --git a/crates/polars-parquet/src/arrow/write/primitive/basic.rs b/crates/polars-parquet/src/arrow/write/primitive/basic.rs index b914978ea8db..2c6c137ce220 100644 --- a/crates/polars-parquet/src/arrow/write/primitive/basic.rs +++ b/crates/polars-parquet/src/arrow/write/primitive/basic.rs @@ -89,7 +89,7 @@ where integer }); let iterator = ExactSizedIter::new(iterator, array.len() - array.null_count()); - encode(iterator, &mut buffer) + encode(iterator, &mut buffer, 1) } else { // append all values let iterator = array.values().iter().map(|x| { @@ -97,7 +97,7 @@ where let integer: i64 = parquet_native.as_(); integer }); - encode(iterator, &mut buffer) + encode(iterator, &mut buffer, 1) } buffer } diff --git a/crates/polars-parquet/src/arrow/write/utils.rs b/crates/polars-parquet/src/arrow/write/utils.rs index bbbe177af648..7f7796b0fff2 100644 --- a/crates/polars-parquet/src/arrow/write/utils.rs +++ b/crates/polars-parquet/src/arrow/write/utils.rs @@ -134,6 +134,8 @@ impl> Iterator for ExactSizedIter { } } +impl> std::iter::ExactSizeIterator for ExactSizedIter {} + /// Returns the number of bits needed to bitpack `max` #[inline] pub fn get_bit_width(max: u64) -> u32 { diff --git a/crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs b/crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs index ce7fa301a7b4..6e37507d137f 100644 --- a/crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs +++ b/crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs @@ -1,5 +1,5 @@ use super::{Packed, Unpackable, Unpacked}; -use crate::parquet::error::ParquetError; +use crate::parquet::error::{ParquetError, ParquetResult}; /// An [`Iterator`] of [`Unpackable`] unpacked from a bitpacked slice of bytes. /// # Implementation @@ -9,34 +9,18 @@ pub struct Decoder<'a, T: Unpackable> { packed: std::slice::Chunks<'a, u8>, num_bits: usize, /// number of items - length: usize, + pub(crate) length: usize, _pd: std::marker::PhantomData, } -#[derive(Debug)] -pub struct DecoderIter { - buffer: Vec, - idx: usize, -} - -impl Iterator for DecoderIter { - type Item = T; - - fn next(&mut self) -> Option { - if self.idx >= self.buffer.len() { - return None; +impl<'a, T: Unpackable> Default for Decoder<'a, T> { + fn default() -> Self { + Self { + packed: [].chunks(1), + num_bits: 0, + length: 0, + _pd: std::marker::PhantomData, } - - let value = self.buffer[self.idx]; - self.idx += 1; - - Some(value) - } - - fn size_hint(&self) -> (usize, Option) { - let len = self.buffer.len() - self.idx; - - (len, Some(len)) } } @@ -57,18 +41,43 @@ impl<'a, T: Unpackable> Decoder<'a, T> { Self::try_new(packed, num_bits, length).unwrap() } - pub fn collect_into_iter(self) -> DecoderIter { - let mut buffer = Vec::new(); - self.collect_into(&mut buffer); - DecoderIter { buffer, idx: 0 } + /// Returns a [`Decoder`] with `T` encoded in `packed` with `num_bits`. + /// + /// `num_bits` is allowed to be `0`. + pub fn new_allow_zero(packed: &'a [u8], num_bits: usize, length: usize) -> Self { + Self::try_new_allow_zero(packed, num_bits, length).unwrap() } - pub fn num_bits(&self) -> usize { - self.num_bits + /// Returns a [`Decoder`] with `T` encoded in `packed` with `num_bits`. + /// + /// `num_bits` is allowed to be `0`. 
+    pub fn try_new_allow_zero(
+        packed: &'a [u8],
+        num_bits: usize,
+        length: usize,
+    ) -> ParquetResult<Self> {
+        let block_size = std::mem::size_of::<T>() * num_bits;
+
+        if packed.len() * 8 < length * num_bits {
+            return Err(ParquetError::oos(format!(
+                "Unpacking {length} items with a number of bits {num_bits} requires at least {} bytes.",
+                length * num_bits / 8
+            )));
+        }
+
+        debug_assert!(num_bits != 0 || packed.is_empty());
+        let packed = packed.chunks(block_size.max(1));
+
+        Ok(Self {
+            length,
+            packed,
+            num_bits,
+            _pd: Default::default(),
+        })
+    }

     /// Returns a [`Decoder`] with `T` encoded in `packed` with `num_bits`.
-    pub fn try_new(packed: &'a [u8], num_bits: usize, length: usize) -> Result<Self, ParquetError> {
+    pub fn try_new(packed: &'a [u8], num_bits: usize, length: usize) -> ParquetResult<Self> {
         let block_size = std::mem::size_of::<T>() * num_bits;

         if num_bits == 0 {
@@ -91,11 +100,16 @@ impl<'a, T: Unpackable> Decoder<'a, T> {
             _pd: Default::default(),
         })
     }
+
+    pub fn num_bits(&self) -> usize {
+        self.num_bits
+    }
 }

 /// A iterator over the exact chunks in a [`Decoder`].
 ///
 /// The remainder can be accessed using `remainder` or `next_inexact`.
+#[derive(Debug)]
 pub struct ChunkedDecoder<'a, 'b, T: Unpackable> {
     pub(crate) decoder: &'b mut Decoder<'a, T>,
 }
diff --git a/crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs b/crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs
index ef6c5313ba26..94f310d28f14 100644
--- a/crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs
+++ b/crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs
@@ -57,7 +57,7 @@ mod encode;
 mod pack;
 mod unpack;

-pub use decode::{Decoder, DecoderIter};
+pub use decode::Decoder;
 pub use encode::{encode, encode_pack};

 /// A byte slice (e.g. `[u8; 8]`) denoting types that represent complete packs.
diff --git a/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/decoder.rs b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/decoder.rs
index ee21a5094718..fb8eb153cfb7 100644
--- a/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/decoder.rs
+++ b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/decoder.rs
@@ -1,246 +1,811 @@
+//! This module implements the `DELTA_BINARY_PACKED` encoding.
+//!
+//! For performance reasons this is done without iterators. Instead, we have `gather_n` functions
+//! and a `DeltaGatherer` trait. These allow efficient decoding and mapping of the decoded values.
+//!
+//! Full information on the delta encoding can be found in the Apache Parquet Format repository.
+//!
+//! Delta encoding compresses sequential integer values by encoding the first value and the
+//! differences between consecutive values. This variant encodes the data into `Block`s and
+//! `MiniBlock`s.
+//!
+//! - A `Block` contains a minimum delta, bitwidths and one or more miniblocks.
+//! - A `MiniBlock` contains many deltas that are encoded in [`bitpacked`] encoding.
+//!
+//! The decoder keeps track of the last value and calculates a new value with the following
+//! function.
+//!
+//! ```text
+//! NextValue(Delta) = {
+//!     Value = Decoder.LastValue + Delta + Block.MinDelta
+//!     Decoder.LastValue = Value
+//!     return Value
+//! }
+//! ```
+//!
+//! Note that all these additions need to be wrapping.
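+//!
+//! As a non-normative sketch (names here are illustrative only), that update rule corresponds
+//! to the following wrapping arithmetic:
+//!
+//! ```rust,ignore
+//! // One decoder step: apply a bitpacked miniblock delta to the running value.
+//! fn next_value(last_value: &mut i64, min_delta: i64, packed_delta: u64) -> i64 {
+//!     *last_value = last_value
+//!         .wrapping_add(packed_delta as i64)
+//!         .wrapping_add(min_delta);
+//!     *last_value
+//! }
+//! ```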
+ use super::super::{bitpacked, uleb128, zigzag_leb128}; -use crate::parquet::encoding::ceil8; +use crate::parquet::encoding::bitpacked::{Unpackable, Unpacked}; use crate::parquet::error::{ParquetError, ParquetResult}; -/// An [`Iterator`] of [`i64`] +const MAX_BITWIDTH: u8 = 64; + +/// Decoder of parquets' `DELTA_BINARY_PACKED`. +#[derive(Debug)] +pub struct Decoder<'a> { + num_miniblocks_per_block: usize, + values_per_block: usize, + + values_remaining: usize, + + last_value: i64, + + values: &'a [u8], + + block: Block<'a>, +} + #[derive(Debug)] struct Block<'a> { - // this is the minimum delta that must be added to every value. min_delta: i64, - _num_mini_blocks: usize, - /// Number of values that each mini block has. - values_per_mini_block: usize, - bitwidths: std::slice::Iter<'a, u8>, - values: &'a [u8], - remaining: usize, // number of elements - current_index: usize, // invariant: < values_per_mini_block - // None represents a relative delta of zero, in which case there is no miniblock. - current_miniblock: Option>, - // number of bytes consumed. - consumed_bytes: usize, + + /// Bytes that give the `num_bits` for the [`bitpacked::Decoder`]. + /// + /// Invariant: `bitwidth[i] <= MAX_BITWIDTH` for all `i` + bitwidths: &'a [u8], + values_remaining: usize, + miniblock: MiniBlock<'a>, +} + +#[derive(Debug)] +struct MiniBlock<'a> { + decoder: bitpacked::Decoder<'a, u64>, + buffered: ::Unpacked, + unpacked_start: usize, + unpacked_end: usize, } -impl<'a> Block<'a> { - pub fn try_new( - mut values: &'a [u8], - num_mini_blocks: usize, - values_per_mini_block: usize, - length: usize, - ) -> ParquetResult { - let length = std::cmp::min(length, num_mini_blocks * values_per_mini_block); - - let mut consumed_bytes = 0; - let (min_delta, consumed) = zigzag_leb128::decode(values); - consumed_bytes += consumed; - values = &values[consumed..]; - - if num_mini_blocks > values.len() { - return Err(ParquetError::oos( - "Block must contain at least num_mini_blocks bytes (the bitwidths)", - )); +struct SkipGatherer; +pub(crate) struct SumGatherer(pub(crate) usize); + +pub trait DeltaGatherer { + type Target: std::fmt::Debug; + + fn target_len(&self, target: &Self::Target) -> usize; + fn target_reserve(&self, target: &mut Self::Target, n: usize); + + /// Gather one element with value `v` into `target`. + fn gather_one(&mut self, target: &mut Self::Target, v: i64) -> ParquetResult<()>; + + /// Gather `num_repeats` elements into `target`. + /// + /// The first value is `v` and the `n`-th value is `v + (n-1)*delta`. + fn gather_constant( + &mut self, + target: &mut Self::Target, + v: i64, + delta: i64, + num_repeats: usize, + ) -> ParquetResult<()> { + for i in 0..num_repeats { + self.gather_one(target, v + (i as i64) * delta)?; + } + Ok(()) + } + /// Gather a `slice` of elements into `target`. + fn gather_slice(&mut self, target: &mut Self::Target, slice: &[i64]) -> ParquetResult<()> { + for &v in slice { + self.gather_one(target, v)?; } - let (bitwidths, remaining) = values.split_at(num_mini_blocks); - consumed_bytes += num_mini_blocks; - values = remaining; + Ok(()) + } + /// Gather a `chunk` of elements into `target`. 
+ fn gather_chunk(&mut self, target: &mut Self::Target, chunk: &[i64; 64]) -> ParquetResult<()> { + self.gather_slice(target, chunk) + } +} - let mut block = Block { - min_delta, - _num_mini_blocks: num_mini_blocks, - values_per_mini_block, - bitwidths: bitwidths.iter(), - remaining: length, - values, - current_index: 0, - current_miniblock: None, - consumed_bytes, - }; +impl DeltaGatherer for SkipGatherer { + type Target = usize; - // Set up first mini-block - block.advance_miniblock()?; + fn target_len(&self, target: &Self::Target) -> usize { + *target + } + fn target_reserve(&self, _target: &mut Self::Target, _n: usize) {} - Ok(block) + fn gather_one(&mut self, target: &mut Self::Target, _v: i64) -> ParquetResult<()> { + *target += 1; + Ok(()) + } + fn gather_constant( + &mut self, + target: &mut Self::Target, + _v: i64, + _delta: i64, + num_repeats: usize, + ) -> ParquetResult<()> { + *target += num_repeats; + Ok(()) } + fn gather_chunk(&mut self, target: &mut Self::Target, chunk: &[i64; 64]) -> ParquetResult<()> { + *target += chunk.len(); + Ok(()) + } + fn gather_slice(&mut self, target: &mut Self::Target, slice: &[i64]) -> ParquetResult<()> { + *target += slice.len(); + Ok(()) + } +} - fn advance_miniblock(&mut self) -> ParquetResult<()> { - // unwrap is ok: we sliced it by num_mini_blocks in try_new - let num_bits = self.bitwidths.next().copied().unwrap() as usize; +impl DeltaGatherer for SumGatherer { + type Target = usize; - self.current_miniblock = if num_bits > 0 { - let length = std::cmp::min(self.remaining, self.values_per_mini_block); + fn target_len(&self, _target: &Self::Target) -> usize { + self.0 + } + fn target_reserve(&self, _target: &mut Self::Target, _n: usize) {} - let miniblock_length = ceil8(self.values_per_mini_block * num_bits); - if miniblock_length > self.values.len() { - return Err(ParquetError::oos( - "block must contain at least miniblock_length bytes (the mini block)", - )); - } - let (miniblock, remainder) = self.values.split_at(miniblock_length); - - self.values = remainder; - self.consumed_bytes += miniblock_length; - - Some( - bitpacked::Decoder::try_new(miniblock, num_bits, length) - .unwrap() - .collect_into_iter(), - ) - } else { - None - }; - self.current_index = 0; + fn gather_one(&mut self, target: &mut Self::Target, v: i64) -> ParquetResult<()> { + if v < 0 { + return Err(ParquetError::oos(format!( + "Invalid delta encoding length {v}" + ))); + } + *target += v as usize; + self.0 += 1; Ok(()) } -} + fn gather_constant( + &mut self, + target: &mut Self::Target, + v: i64, + delta: i64, + num_repeats: usize, + ) -> ParquetResult<()> { + if v < 0 || (delta < 0 && num_repeats as i64 * delta + v < 0) { + return Err(ParquetError::oos("Invalid delta encoding length")); + } + + let base = v * num_repeats as i64; + let is_even = num_repeats & 1; + // SUM_i=0^n f * i = f * (n(n+1)/2) + let increment = (num_repeats >> is_even) * ((num_repeats + 1) >> (is_even ^ 1)); -impl<'a> Iterator for Block<'a> { - type Item = Result; + *target += base as usize + increment; - fn next(&mut self) -> Option { - if self.remaining == 0 { - return None; + Ok(()) + } + fn gather_slice(&mut self, target: &mut Self::Target, slice: &[i64]) -> ParquetResult<()> { + let min = slice.iter().copied().min().unwrap_or_default(); + if min < 0 { + return Err(ParquetError::oos(format!( + "Invalid delta encoding length {min}" + ))); } - let result = self.min_delta - + self - .current_miniblock - .as_mut() - .map(|x| x.next().unwrap_or_default()) - .unwrap_or(0) as i64; - self.current_index 
+= 1; - self.remaining -= 1; - - if self.remaining > 0 && self.current_index == self.values_per_mini_block { - if let Err(e) = self.advance_miniblock() { - return Some(Err(e)); - } + + *target += slice.iter().copied().map(|v| v as usize).sum::(); + self.0 += slice.len(); + Ok(()) + } + fn gather_chunk(&mut self, target: &mut Self::Target, chunk: &[i64; 64]) -> ParquetResult<()> { + let min = chunk.iter().copied().min().unwrap_or_default(); + if min < 0 { + return Err(ParquetError::oos(format!( + "Invalid delta encoding length {min}" + ))); } + *target += chunk.iter().copied().map(|v| v as usize).sum::(); + self.0 += chunk.len(); + Ok(()) + } +} - Some(Ok(result)) +/// Gather the rest of the [`bitpacked::Decoder`] into `target` +fn gather_bitpacked( + target: &mut G::Target, + min_delta: i64, + last_value: &mut i64, + mut decoder: bitpacked::Decoder, + gatherer: &mut G, +) -> ParquetResult<()> { + let mut chunked = decoder.chunked(); + for mut chunk in &mut chunked { + for value in &mut chunk { + *last_value = last_value + .wrapping_add(*value as i64) + .wrapping_add(min_delta); + *value = *last_value as u64; + } + + let chunk = bytemuck::cast_ref(&chunk); + gatherer.gather_chunk(target, chunk)?; + } + + if let Some((mut chunk, length)) = chunked.next_inexact() { + let slice = &mut chunk[..length]; + + for value in slice.iter_mut() { + *last_value = last_value + .wrapping_add(*value as i64) + .wrapping_add(min_delta); + *value = *last_value as u64; + } + + let slice = bytemuck::cast_slice(slice); + gatherer.gather_slice(target, slice)?; } + + Ok(()) } -/// Decoder of parquets' `DELTA_BINARY_PACKED`. Implements `Iterator`. -/// # Implementation -/// This struct does not allocate on the heap. -#[derive(Debug)] -pub struct Decoder<'a> { - num_mini_blocks: usize, - values_per_mini_block: usize, - values_remaining: usize, - next_value: i64, - values: &'a [u8], - current_block: Option>, - // the total number of bytes consumed up to a given point, excluding the bytes on the current_block - consumed_bytes: usize, +/// Gather an entire [`MiniBlock`] into `target` +fn gather_miniblock( + target: &mut G::Target, + min_delta: i64, + bitwidth: u8, + values: &[u8], + values_per_miniblock: usize, + last_value: &mut i64, + gatherer: &mut G, +) -> ParquetResult<()> { + let bitwidth = bitwidth as usize; + + debug_assert!(bitwidth <= 64); + debug_assert_eq!((bitwidth * values_per_miniblock).div_ceil(8), values.len()); + + let start_length = gatherer.target_len(target); + gather_bitpacked( + target, + min_delta, + last_value, + bitpacked::Decoder::new(values, bitwidth, values_per_miniblock), + gatherer, + )?; + let target_length = gatherer.target_len(target); + + debug_assert_eq!(target_length - start_length, values_per_miniblock); + + Ok(()) +} + +/// Gather an entire [`Block`] into `target` +fn gather_block<'a, G: DeltaGatherer>( + target: &mut G::Target, + num_miniblocks: usize, + values_per_miniblock: usize, + mut values: &'a [u8], + last_value: &mut i64, + gatherer: &mut G, +) -> ParquetResult<&'a [u8]> { + let (min_delta, consumed) = zigzag_leb128::decode(values); + values = &values[consumed..]; + let bitwidths; + (bitwidths, values) = values + .split_at_checked(num_miniblocks) + .ok_or(ParquetError::oos( + "Not enough bitwidths available in delta encoding", + ))?; + + gatherer.target_reserve(target, num_miniblocks * values_per_miniblock); + for &bitwidth in bitwidths { + let miniblock; + (miniblock, values) = values + .split_at_checked((bitwidth as usize * values_per_miniblock).div_ceil(8)) + 
.ok_or(ParquetError::oos(
+            "Not enough bytes for miniblock in delta encoding",
+        ))?;
+        gather_miniblock(
+            target,
+            min_delta,
+            bitwidth,
+            miniblock,
+            values_per_miniblock,
+            last_value,
+            gatherer,
+        )?;
+    }
+
+    Ok(values)
+}

 impl<'a> Decoder<'a> {
-    pub fn try_new(mut values: &'a [u8]) -> Result<Self, ParquetError> {
-        let mut consumed_bytes = 0;
-        let (block_size, consumed) = uleb128::decode(values);
-        consumed_bytes += consumed;
-        assert_eq!(block_size % 128, 0);
-        values = &values[consumed..];
-        let (num_mini_blocks, consumed) = uleb128::decode(values);
-        let num_mini_blocks = num_mini_blocks as usize;
-        consumed_bytes += consumed;
-        values = &values[consumed..];
+    pub fn try_new(mut values: &'a [u8]) -> ParquetResult<(Self, &'a [u8])> {
+        let header_err = || ParquetError::oos("Insufficient bytes for Delta encoding header");
+
+        // header:
+        // <block size in values> <number of miniblocks in a block> <total value count> <first value>
+
+        let (values_per_block, consumed) = uleb128::decode(values);
+        let values_per_block = values_per_block as usize;
+        values = values.get(consumed..).ok_or_else(header_err)?;
+
+        assert_eq!(values_per_block % 128, 0);
+
+        let (num_miniblocks_per_block, consumed) = uleb128::decode(values);
+        let num_miniblocks_per_block = num_miniblocks_per_block as usize;
+        values = values.get(consumed..).ok_or_else(header_err)?;
+
         let (total_count, consumed) = uleb128::decode(values);
         let total_count = total_count as usize;
-        consumed_bytes += consumed;
-        values = &values[consumed..];
+        values = values.get(consumed..).ok_or_else(header_err)?;
+
         let (first_value, consumed) = zigzag_leb128::decode(values);
-        consumed_bytes += consumed;
-        values = &values[consumed..];
-
-        let values_per_mini_block = block_size as usize / num_mini_blocks;
-        assert_eq!(values_per_mini_block % 8, 0);
-
-        // If we only have one value (first_value), there are no blocks.
-        let current_block = if total_count > 1 {
-            Some(Block::try_new(
-                values,
-                num_mini_blocks,
-                values_per_mini_block,
-                total_count - 1,
-            )?)
-        } else {
-            None
-        };
+        values = values.get(consumed..).ok_or_else(header_err)?;
+
+        assert_eq!(values_per_block % num_miniblocks_per_block, 0);
+        assert_eq!((values_per_block / num_miniblocks_per_block) % 32, 0);
+
+        let values_per_miniblock = values_per_block / num_miniblocks_per_block;
+        assert_eq!(values_per_miniblock % 8, 0);
+
+        // We skip over all the values to determine where the slice stops.
+        //
+        // This also has the added benefit of error checking in advance, thus we can unwrap in
+        // other places.
+
+        let mut rem = values;
+        if total_count > 1 {
+            let mut num_values_left = total_count - 1;
+            while num_values_left > 0 {
+                // If the remaining number of values no longer needs all the miniblocks, we have
+                // to ignore the later miniblocks and regard them as having bitwidth = 0.
+                //
+                // Quoted from the specification:
+                //
+                // > If, in the last block, less than <number of miniblocks in a block> miniblocks
+                // > are needed to store the values, the bytes storing the bit widths of the
+                // > unneeded miniblocks are still present, their value should be zero, but readers
+                // > must accept arbitrary values as well. There are no additional padding bytes for
+                // > the miniblock bodies though, as if their bit widths were 0 (regardless of the
+                // > actual byte values). The reader knows when to stop reading by keeping track of
+                // > the number of values read.
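+                // Worked example (illustrative numbers, not from the spec): with
+                // `values_per_block = 128` and `num_miniblocks_per_block = 4`, each
+                // miniblock holds 32 values. If 40 values are left, `40.div_ceil(32) = 2`,
+                // so only two miniblocks are decoded and the trailing two bitwidth bytes
+                // are skipped over.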
+ let num_remaining_mini_blocks = usize::min( + num_miniblocks_per_block, + num_values_left.div_ceil(values_per_miniblock), + ); + + // block: + // + + let (_, consumed) = zigzag_leb128::decode(rem); + rem = rem.get(consumed..).ok_or(ParquetError::oos( + "No min-delta value in delta encoding miniblock", + ))?; + + if rem.len() < num_miniblocks_per_block { + return Err(ParquetError::oos( + "Not enough bitwidths available in delta encoding", + )); + } + if let Some(err_bitwidth) = rem + .get(..num_remaining_mini_blocks) + .expect("num_remaining_mini_blocks <= num_miniblocks_per_block") + .iter() + .copied() + .find(|&bitwidth| bitwidth > MAX_BITWIDTH) + { + return Err(ParquetError::oos(format!( + "Delta encoding miniblock with bitwidth {err_bitwidth} higher than maximum {MAX_BITWIDTH} bits", + ))); + } + + let num_bitpacking_bytes = rem[..num_remaining_mini_blocks] + .iter() + .copied() + .map(|bitwidth| (bitwidth as usize * values_per_miniblock).div_ceil(8)) + .sum::(); + + rem = rem + .get(num_miniblocks_per_block + num_bitpacking_bytes..) + .ok_or(ParquetError::oos( + "Not enough bytes for all bitpacked values in delta encoding", + ))?; + + num_values_left = num_values_left.saturating_sub(values_per_block); + } + } + + let values = &values[..values.len() - rem.len()]; - Ok(Self { - num_mini_blocks, - values_per_mini_block, - values_remaining: total_count, - next_value: first_value, + let decoder = Self { + num_miniblocks_per_block, + values_per_block, + values_remaining: total_count.saturating_sub(1), + last_value: first_value, values, - current_block, - consumed_bytes, - }) + + block: Block { + // @NOTE: + // We add one delta=0 into the buffered block which allows us to + // prepend like the `first_value` is just any normal value. + // + // This is a bit of a hack, but makes the rest of the logic + // **A LOT** simpler. + values_remaining: usize::from(total_count > 0), + min_delta: 0, + bitwidths: &[], + miniblock: MiniBlock { + decoder: bitpacked::Decoder::try_new_allow_zero(&[], 0, 1)?, + buffered: ::Unpacked::zero(), + unpacked_start: 0, + unpacked_end: 0, + }, + }, + }; + + Ok((decoder, rem)) } - /// Returns the total number of bytes consumed up to this point by [`Decoder`]. - pub fn consumed_bytes(&self) -> usize { - self.consumed_bytes + self.current_block.as_ref().map_or(0, |b| b.consumed_bytes) + /// Consume a new [`Block`] from `self.values`. + fn consume_block(&mut self) { + // @NOTE: All the panics here should be prevented in the `Decoder::try_new`. 
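+        // Layout consumed here (per the format): a zigzag ULEB128 `min_delta`, then one
+        // bitwidth byte per miniblock, then the bitpacked miniblock bodies.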
+ + debug_assert!(!self.values.is_empty()); + + let values_per_miniblock = self.values_per_miniblock(); + + let length = usize::min(self.values_remaining, self.values_per_block); + let actual_num_miniblocks = usize::min( + self.num_miniblocks_per_block, + length.div_ceil(values_per_miniblock), + ); + + debug_assert!(actual_num_miniblocks > 0); + + // + + let (min_delta, consumed) = zigzag_leb128::decode(self.values); + + self.values = &self.values[consumed..]; + let (bitwidths, remainder) = self.values.split_at(self.num_miniblocks_per_block); + + let first_bitwidth = bitwidths[0]; + let bitwidths = &bitwidths[1..actual_num_miniblocks]; + debug_assert!(first_bitwidth <= MAX_BITWIDTH); + let first_bitwidth = first_bitwidth as usize; + + let values_in_first_miniblock = usize::min(length, values_per_miniblock); + let num_allocated_bytes = (first_bitwidth * values_per_miniblock).div_ceil(8); + let num_actual_bytes = (first_bitwidth * values_in_first_miniblock).div_ceil(8); + let (bytes, remainder) = remainder.split_at(num_allocated_bytes); + let bytes = &bytes[..num_actual_bytes]; + + let decoder = + bitpacked::Decoder::new_allow_zero(bytes, first_bitwidth, values_in_first_miniblock); + + self.block = Block { + min_delta, + bitwidths, + values_remaining: length, + miniblock: MiniBlock { + decoder, + // We can leave this as it should not be read before it is updated + buffered: self.block.miniblock.buffered, + unpacked_start: 0, + unpacked_end: 0, + }, + }; + + self.values_remaining -= length; + self.values = remainder; } - fn load_delta(&mut self) -> Result { - // At this point we must have at least one block and value available - let current_block = self.current_block.as_mut().unwrap(); - if let Some(x) = current_block.next() { - x - } else { - // load next block - self.values = &self.values[current_block.consumed_bytes..]; - self.consumed_bytes += current_block.consumed_bytes; - - let next_block = Block::try_new( - self.values, - self.num_mini_blocks, - self.values_per_mini_block, - self.values_remaining, + /// Gather `n` elements from the current [`MiniBlock`] to `target` + fn gather_miniblock_n_into( + &mut self, + target: &mut G::Target, + mut n: usize, + gatherer: &mut G, + ) -> ParquetResult<()> { + debug_assert!(n > 0); + debug_assert!(self.miniblock_len() >= n); + + // If the `num_bits == 0`, the delta is constant and equal to `min_delta`. The + // `bitpacked::Decoder` basically only keeps track of the length. 
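+        // For example (illustrative values): with `last_value = 10`, `min_delta = 3` and
+        // 4 values left in such a miniblock, this gathers 13, 16, 19, 22.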
+ if self.block.miniblock.decoder.num_bits() == 0 { + let num_repeats = usize::min(self.miniblock_len(), n); + let v = self.last_value.wrapping_add(self.block.min_delta); + gatherer.gather_constant(target, v, self.block.min_delta, num_repeats)?; + self.last_value = self + .last_value + .wrapping_add(self.block.min_delta * num_repeats as i64); + self.block.miniblock.decoder.length -= num_repeats; + return Ok(()); + } + + if self.block.miniblock.unpacked_start < self.block.miniblock.unpacked_end { + let length = usize::min( + n, + self.block.miniblock.unpacked_end - self.block.miniblock.unpacked_start, ); - match next_block { - Ok(mut next_block) => { - let delta = next_block - .next() - .ok_or_else(|| ParquetError::oos("Missing block"))?; - self.current_block = Some(next_block); - delta - }, - Err(e) => Err(e), + self.block.miniblock.buffered + [self.block.miniblock.unpacked_start..self.block.miniblock.unpacked_start + length] + .iter_mut() + .for_each(|v| { + self.last_value = self + .last_value + .wrapping_add(*v as i64) + .wrapping_add(self.block.min_delta); + *v = self.last_value as u64; + }); + gatherer.gather_slice( + target, + bytemuck::cast_slice( + &self.block.miniblock.buffered[self.block.miniblock.unpacked_start + ..self.block.miniblock.unpacked_start + length], + ), + )?; + n -= length; + self.block.miniblock.unpacked_start += length; + } + + if n == 0 { + return Ok(()); + } + + const ITEMS_PER_PACK: usize = <::Unpacked as Unpacked>::LENGTH; + for _ in 0..n / ITEMS_PER_PACK { + let mut chunk = self.block.miniblock.decoder.chunked().next().unwrap(); + chunk.iter_mut().for_each(|v| { + self.last_value = self + .last_value + .wrapping_add(*v as i64) + .wrapping_add(self.block.min_delta); + *v = self.last_value as u64; + }); + gatherer.gather_chunk(target, bytemuck::cast_ref(&chunk))?; + n -= ITEMS_PER_PACK; + } + + if n == 0 { + return Ok(()); + } + + let Some((chunk, len)) = self.block.miniblock.decoder.chunked().next_inexact() else { + debug_assert_eq!(n, 0); + self.block.miniblock.buffered = ::Unpacked::zero(); + self.block.miniblock.unpacked_start = 0; + self.block.miniblock.unpacked_end = 0; + return Ok(()); + }; + + self.block.miniblock.buffered = chunk; + self.block.miniblock.unpacked_start = 0; + self.block.miniblock.unpacked_end = len; + + if n > 0 { + let length = usize::min(n, self.block.miniblock.unpacked_end); + self.block.miniblock.buffered[..length] + .iter_mut() + .for_each(|v| { + self.last_value = self + .last_value + .wrapping_add(*v as i64) + .wrapping_add(self.block.min_delta); + *v = self.last_value as u64; + }); + gatherer.gather_slice( + target, + bytemuck::cast_slice(&self.block.miniblock.buffered[..length]), + )?; + self.block.miniblock.unpacked_start = length; + } + + Ok(()) + } + + /// Gather `n` elements from the current [`Block`] to `target` + fn gather_block_n_into( + &mut self, + target: &mut G::Target, + n: usize, + gatherer: &mut G, + ) -> ParquetResult<()> { + let values_per_miniblock = self.values_per_miniblock(); + + debug_assert!(n <= self.values_per_block); + debug_assert!(self.values_per_block >= values_per_miniblock); + debug_assert_eq!(self.values_per_block % values_per_miniblock, 0); + + let mut n = usize::min(self.block.values_remaining, n); + + if n == 0 { + return Ok(()); + } + + let miniblock_len = self.miniblock_len(); + if n < miniblock_len { + self.gather_miniblock_n_into(target, n, gatherer)?; + debug_assert_eq!(self.miniblock_len(), miniblock_len - n); + self.block.values_remaining -= n; + return Ok(()); + } + + if miniblock_len > 
0 { + self.gather_miniblock_n_into(target, miniblock_len, gatherer)?; + n -= miniblock_len; + self.block.values_remaining -= miniblock_len; + } + + while n >= values_per_miniblock { + let bitwidth = self.block.bitwidths[0]; + self.block.bitwidths = &self.block.bitwidths[1..]; + + let miniblock; + (miniblock, self.values) = self + .values + .split_at((bitwidth as usize * values_per_miniblock).div_ceil(8)); + gather_miniblock( + target, + self.block.min_delta, + bitwidth, + miniblock, + values_per_miniblock, + &mut self.last_value, + gatherer, + )?; + n -= values_per_miniblock; + self.block.values_remaining -= values_per_miniblock; + } + + if n == 0 { + return Ok(()); + } + + if !self.block.bitwidths.is_empty() { + let bitwidth = self.block.bitwidths[0]; + self.block.bitwidths = &self.block.bitwidths[1..]; + + if bitwidth > MAX_BITWIDTH { + return Err(ParquetError::oos(format!( + "Delta encoding bitwidth '{bitwidth}' is larger than maximum {MAX_BITWIDTH})" + ))); + } + + let length = usize::min(values_per_miniblock, self.block.values_remaining); + + let num_allocated_bytes = (bitwidth as usize * values_per_miniblock).div_ceil(8); + let num_actual_bytes = (bitwidth as usize * length).div_ceil(8); + + let miniblock; + (miniblock, self.values) = + self.values + .split_at_checked(num_allocated_bytes) + .ok_or(ParquetError::oos( + "Not enough space for delta encoded miniblock", + ))?; + + let miniblock = &miniblock[..num_actual_bytes]; + + let decoder = + bitpacked::Decoder::try_new_allow_zero(miniblock, bitwidth as usize, length)?; + self.block.miniblock = MiniBlock { + decoder, + buffered: self.block.miniblock.buffered, + unpacked_start: 0, + unpacked_end: 0, + }; + + if n > 0 { + self.gather_miniblock_n_into(target, n, gatherer)?; + self.block.values_remaining -= n; } } + + Ok(()) } -} -impl<'a> Iterator for Decoder<'a> { - type Item = Result; + /// Gather `n` elements to `target` + pub fn gather_n_into( + &mut self, + target: &mut G::Target, + mut n: usize, + gatherer: &mut G, + ) -> ParquetResult<()> { + n = usize::min(n, self.len()); + + if n == 0 { + return Ok(()); + } + + let values_per_miniblock = self.values_per_block / self.num_miniblocks_per_block; - fn next(&mut self) -> Option { - if self.values_remaining == 0 { - return None; + let start_num_values_remaining = self.block.values_remaining; + if n <= self.block.values_remaining { + self.gather_block_n_into(target, n, gatherer)?; + debug_assert_eq!(self.block.values_remaining, start_num_values_remaining - n); + return Ok(()); } - let result = Some(Ok(self.next_value)); + n -= self.block.values_remaining; + self.gather_block_n_into(target, self.block.values_remaining, gatherer)?; + debug_assert_eq!(self.block.values_remaining, 0); - self.values_remaining -= 1; - if self.values_remaining == 0 { - // do not try to load another block - return result; + while usize::min(n, self.values_remaining) >= self.values_per_block { + self.values = gather_block( + target, + self.num_miniblocks_per_block, + values_per_miniblock, + self.values, + &mut self.last_value, + gatherer, + )?; + n -= self.values_per_block; + self.values_remaining -= self.values_per_block; } - let delta = match self.load_delta() { - Ok(delta) => delta, - Err(e) => return Some(Err(e)), - }; + if n == 0 { + return Ok(()); + } + + self.consume_block(); + self.gather_block_n_into(target, n, gatherer)?; + + Ok(()) + } + + pub fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + let mut gatherer = SkipGatherer; + self.gather_n_into(&mut 0usize, n, &mut gatherer) + } + + 
#[cfg(test)] + pub(crate) fn collect_n>( + &mut self, + e: &mut E, + n: usize, + ) -> ParquetResult<()> { + struct ExtendGatherer<'a, E: std::fmt::Debug + Extend>( + std::marker::PhantomData<&'a E>, + ); + + impl<'a, E: std::fmt::Debug + Extend> DeltaGatherer for ExtendGatherer<'a, E> { + type Target = (usize, &'a mut E); + + fn target_len(&self, target: &Self::Target) -> usize { + target.0 + } + + fn target_reserve(&self, _target: &mut Self::Target, _n: usize) {} + + fn gather_one(&mut self, target: &mut Self::Target, v: i64) -> ParquetResult<()> { + target.1.extend(Some(v)); + target.0 += 1; + Ok(()) + } + } + + let mut gatherer = ExtendGatherer(std::marker::PhantomData); + let mut target = (0, e); + + self.gather_n_into(&mut target, n, &mut gatherer) + } + + #[cfg(test)] + pub(crate) fn collect + Default>( + mut self, + ) -> ParquetResult { + let mut e = E::default(); + self.collect_n(&mut e, self.len())?; + Ok(e) + } + + pub fn len(&self) -> usize { + self.values_remaining + self.block.values_remaining + } - self.next_value += delta; - result + fn values_per_miniblock(&self) -> usize { + debug_assert_eq!(self.values_per_block % self.num_miniblocks_per_block, 0); + self.values_per_block / self.num_miniblocks_per_block } - fn size_hint(&self) -> (usize, Option) { - (self.values_remaining, Some(self.values_remaining)) + fn miniblock_len(&self) -> usize { + self.block.miniblock.unpacked_end - self.block.miniblock.unpacked_start + + self.block.miniblock.decoder.len() } } @@ -259,11 +824,11 @@ mod tests { // first_value: 2 <=z> 1 let data = &[128, 1, 4, 1, 2]; - let mut decoder = Decoder::try_new(data).unwrap(); - let r = decoder.by_ref().collect::, _>>().unwrap(); + let (decoder, rem) = Decoder::try_new(data).unwrap(); + let r = decoder.collect::>().unwrap(); assert_eq!(&r[..], &[1]); - assert_eq!(decoder.consumed_bytes(), 5); + assert_eq!(data.len() - rem.len(), 5); } #[test] @@ -280,12 +845,12 @@ mod tests { // bit_width: 0 let data = &[128, 1, 4, 5, 2, 2, 0, 0, 0, 0]; - let mut decoder = Decoder::try_new(data).unwrap(); - let r = decoder.by_ref().collect::, _>>().unwrap(); + let (decoder, rem) = Decoder::try_new(data).unwrap(); + let r = decoder.collect::>().unwrap(); assert_eq!(expected, r); - assert_eq!(decoder.consumed_bytes(), 10); + assert_eq!(data.len() - rem.len(), 10); } #[test] @@ -311,11 +876,11 @@ mod tests { 1, 2, 3, ]; - let mut decoder = Decoder::try_new(data).unwrap(); - let r = decoder.by_ref().collect::, _>>().unwrap(); + let (decoder, rem) = Decoder::try_new(data).unwrap(); + let r = decoder.collect::>().unwrap(); assert_eq!(expected, r); - assert_eq!(decoder.consumed_bytes(), data.len() - 3); + assert_eq!(rem, &[1, 2, 3]); } #[test] @@ -357,10 +922,11 @@ mod tests { -2, 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, ]; - let mut decoder = Decoder::try_new(data).unwrap(); - let r = decoder.by_ref().collect::, _>>().unwrap(); + let (decoder, rem) = Decoder::try_new(data).unwrap(); + let r = decoder.collect::>().unwrap(); assert_eq!(&expected[..], &r[..]); - assert_eq!(decoder.consumed_bytes(), data.len() - 3); + assert_eq!(data.len() - rem.len(), data.len() - 3); + assert_eq!(rem.len(), 3); } } diff --git a/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/encoder.rs b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/encoder.rs index 9bdb861504d1..24b6ea6523b8 100644 --- a/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/encoder.rs +++ b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/encoder.rs @@ -5,49 +5,60 @@ use 
crate::parquet::encoding::ceil8;
 
 /// # Implementation
 /// * This function does not allocate on the heap.
-/// * The number of mini-blocks is always 1. This may change in the future.
-pub fn encode<I: Iterator<Item = i64>>(mut iterator: I, buffer: &mut Vec<u8>) {
-    let block_size = 128;
-    let mini_blocks = 1;
+/// * The number of mini-blocks per block is given by `num_miniblocks_per_block` (1, 2 or 4).
+pub fn encode<I: ExactSizeIterator<Item = i64>>(
+    mut iterator: I,
+    buffer: &mut Vec<u8>,
+    num_miniblocks_per_block: usize,
+) {
+    const BLOCK_SIZE: usize = 256;
+    assert!([1, 2, 4].contains(&num_miniblocks_per_block));
+    let values_per_miniblock = BLOCK_SIZE / num_miniblocks_per_block;
 
     let mut container = [0u8; 10];
-    let encoded_len = uleb128::encode(block_size, &mut container);
+    let encoded_len = uleb128::encode(BLOCK_SIZE as u64, &mut container);
     buffer.extend_from_slice(&container[..encoded_len]);
 
-    let encoded_len = uleb128::encode(mini_blocks, &mut container);
+    let encoded_len = uleb128::encode(num_miniblocks_per_block as u64, &mut container);
     buffer.extend_from_slice(&container[..encoded_len]);
 
-    let length = iterator.size_hint().1.unwrap();
+    let length = iterator.len();
     let encoded_len = uleb128::encode(length as u64, &mut container);
     buffer.extend_from_slice(&container[..encoded_len]);
 
-    let mut values = [0i64; 128];
-    let mut deltas = [0u64; 128];
+    let mut values = [0i64; BLOCK_SIZE];
+    let mut deltas = [0u64; BLOCK_SIZE];
+    let mut num_bits = [0u8; 4];
 
     let first_value = iterator.next().unwrap_or_default();
     let (container, encoded_len) = zigzag_leb128::encode(first_value);
     buffer.extend_from_slice(&container[..encoded_len]);
 
     let mut prev = first_value;
-    let mut length = iterator.size_hint().1.unwrap();
+    let mut length = iterator.len();
     while length != 0 {
         let mut min_delta = i64::MAX;
         let mut max_delta = i64::MIN;
-        let mut num_bits = 0;
-        for (i, integer) in (0..128).zip(&mut iterator) {
-            let delta = integer - prev;
+        for (i, integer) in iterator.by_ref().enumerate().take(BLOCK_SIZE) {
+            if i % values_per_miniblock == 0 {
+                min_delta = i64::MAX;
+                max_delta = i64::MIN
+            }
+
+            let delta = integer.wrapping_sub(prev);
             min_delta = min_delta.min(delta);
             max_delta = max_delta.max(delta);
 
-            num_bits = 64 - (max_delta - min_delta).leading_zeros();
+            let miniblock_idx = i / values_per_miniblock;
+            num_bits[miniblock_idx] = (64 - max_delta.abs_diff(min_delta).leading_zeros()) as u8;
             values[i] = delta;
             prev = integer;
         }
-        let consumed = std::cmp::min(length - iterator.size_hint().1.unwrap(), 128);
-        length = iterator.size_hint().1.unwrap();
+        let consumed = std::cmp::min(length - iterator.len(), BLOCK_SIZE);
+        length = iterator.len();
         let values = &values[..consumed];
 
         values.iter().zip(deltas.iter_mut()).for_each(|(v, delta)| {
-            *delta = (v - min_delta) as u64;
+            *delta = v.wrapping_sub(min_delta) as u64;
         });
 
         // <min_delta> <bitwidths of miniblocks> <miniblocks>
@@ -55,19 +66,32 @@ pub fn encode<I: Iterator<Item = i64>>(mut iterator: I, buffer: &mut Vec<u8>) {
         buffer.extend_from_slice(&container[..encoded_len]);
 
-        // one miniblock => 1 byte
-        buffer.push(num_bits as u8);
-        write_miniblock(buffer, num_bits as usize, deltas);
+        // one bit-width byte per mini-block, then the mini-blocks themselves
+        let mut values_remaining = consumed;
+        buffer.extend_from_slice(&num_bits[..num_miniblocks_per_block]);
+        for i in 0..num_miniblocks_per_block {
+            if values_remaining == 0 {
+                break;
+            }
+
+            values_remaining = values_remaining.saturating_sub(values_per_miniblock);
+            write_miniblock(
+                buffer,
+                num_bits[i],
+                &deltas[i * values_per_miniblock..(i + 1) * values_per_miniblock],
+            );
+        }
     }
 }
 
-fn write_miniblock(buffer: &mut Vec<u8>, num_bits: usize, deltas: [u64; 128]) {
+fn write_miniblock(buffer: &mut Vec<u8>, num_bits: u8, deltas: &[u64]) {
+    let num_bits = num_bits as usize;
     if num_bits > 0 {
         let start = buffer.len();
 
-        // bitpack encode all (deltas.len = 128 which is a multiple of 32)
+        // bitpack encode all (deltas.len() is a multiple of 32)
         let bytes_needed = start + ceil8(deltas.len() * num_bits);
         buffer.resize(bytes_needed, 0);
-        bitpacked::encode(deltas.as_ref(), num_bits, &mut buffer[start..]);
+        bitpacked::encode(deltas, num_bits, &mut buffer[start..]);
 
         let bytes_needed = start + ceil8(deltas.len() * num_bits);
         buffer.truncate(bytes_needed);
@@ -80,8 +104,8 @@ mod tests {
 
     #[test]
     fn constant_delta() {
-        // header: [128, 1, 1, 5, 2]:
-        // block size: 128    <=u> 128, 1
+        // header: [128, 2, 1, 5, 2]:
+        // block size: 256    <=u> 128, 2
         // mini-blocks: 1     <=u> 1
         // elements: 5        <=u> 5
         // first_value: 2     <=z> 1
@@ -89,10 +113,10 @@ mod tests {
         // min_delta: 1       <=z> 2
         // bitwidth: 0
         let data = 1..=5;
-        let expected = vec![128u8, 1, 1, 5, 2, 2, 0];
+        let expected = vec![128u8, 2, 1, 5, 2, 2, 0];
 
         let mut buffer = vec![];
-        encode(data, &mut buffer);
+        encode(data.collect::<Vec<_>>().into_iter(), &mut buffer, 1);
         assert_eq!(expected, buffer);
     }
 
@@ -100,8 +124,8 @@ fn negative_min_delta() {
         // max - min = 1 - -4 = 5
         let data = vec![1, 2, 3, 4, 5, 1];
-        // header: [128, 1, 4, 6, 2]
-        // block size: 128    <=u> 128, 1
+        // header: [128, 2, 1, 6, 2]
+        // block size: 256    <=u> 128, 2
         // mini-blocks: 1     <=u> 1
-        // elements: 6        <=u> 5
+        // elements: 6        <=u> 6
         // first_value: 2     <=z> 1
@@ -112,11 +136,11 @@ fn negative_min_delta() {
         // 0b01101101
         // 0b00001011
         // ]
-        let mut expected = vec![128u8, 1, 1, 6, 2, 7, 3, 0b01101101, 0b00001011];
-        expected.extend(std::iter::repeat(0).take(128 * 3 / 8 - 2)); // 128 values, 3 bits, 2 already used
+        let mut expected = vec![128u8, 2, 1, 6, 2, 7, 3, 0b01101101, 0b00001011];
+        expected.extend(std::iter::repeat(0).take(256 * 3 / 8 - 2)); // 256 values, 3 bits, 2 bytes already used
 
         let mut buffer = vec![];
-        encode(data.into_iter(), &mut buffer);
+        encode(data.into_iter(), &mut buffer, 1);
         assert_eq!(expected, buffer);
     }
 }
diff --git a/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/fuzz.rs b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/fuzz.rs
new file mode 100644
index 000000000000..dc16bc8353fd
--- /dev/null
+++ b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/fuzz.rs
@@ -0,0 +1,76 @@
+#[ignore = "Fuzz test. Takes too long"]
+#[test]
+fn fuzz_test_delta_encoding() -> Result<(), Box<dyn std::error::Error>> {
+    use rand::Rng;
+
+    use super::DeltaGatherer;
+    use crate::parquet::error::ParquetResult;
+
+    struct SimpleGatherer;
+
+    impl DeltaGatherer for SimpleGatherer {
+        type Target = Vec<i64>;
+
+        fn target_len(&self, target: &Self::Target) -> usize {
+            target.len()
+        }
+
+        fn target_reserve(&self, target: &mut Self::Target, n: usize) {
+            target.reserve(n);
+        }
+
+        fn gather_one(&mut self, target: &mut Self::Target, v: i64) -> ParquetResult<()> {
+            target.push(v);
+            Ok(())
+        }
+    }
+
+    const MIN_VALUES: usize = 1;
+    const MAX_VALUES: usize = 515;
+
+    const MIN: i64 = i64::MIN;
+    const MAX: i64 = i64::MAX;
+
+    const NUM_ITERATIONS: usize = 1_000_000;
+
+    let mut values = Vec::with_capacity(MAX_VALUES);
+    let mut rng = rand::thread_rng();
+
+    let mut encoded = Vec::with_capacity(MAX_VALUES);
+    let mut decoded = Vec::with_capacity(MAX_VALUES);
+    let mut gatherer = SimpleGatherer;
+
+    for i in 0..NUM_ITERATIONS {
+        values.clear();
+
+        let num_values = rng.gen_range(MIN_VALUES..=MAX_VALUES);
+        values.extend(std::iter::from_fn(|| Some(rng.gen_range(MIN..=MAX))).take(num_values));
+
+        encoded.clear();
+        decoded.clear();
+
+        super::encode(
+            values.iter().copied(),
+            &mut encoded,
+            1 << rng.gen_range(0..=2),
+        );
+        let (mut decoder, rem) = super::Decoder::try_new(&encoded)?;
+
+        assert!(rem.is_empty());
+
+        let mut num_remaining = num_values;
+        while num_remaining > 0 {
+            let n = rng.gen_range(1usize..=num_remaining);
+            decoder.gather_n_into(&mut decoded, n, &mut gatherer)?;
+            num_remaining -= n;
+        }
+
+        assert_eq!(values, decoded);
+
+        if i % 1000 == 999 {
+            eprintln!("[INFO]: {} iterations done.", i + 1);
+        }
+    }
+
+    Ok(())
+}
diff --git a/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/mod.rs b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/mod.rs
index 4f7922821c5f..23e67ee7fb4f 100644
--- a/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/mod.rs
+++ b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/mod.rs
@@ -1,23 +1,24 @@
 mod decoder;
 mod encoder;
+mod fuzz;
 
-pub use decoder::Decoder;
-pub use encoder::encode;
+pub(crate) use decoder::{Decoder, DeltaGatherer, SumGatherer};
+pub(crate) use encoder::encode;
 
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::parquet::error::ParquetError;
+    use crate::parquet::error::{ParquetError, ParquetResult};
 
     #[test]
     fn basic() -> Result<(), ParquetError> {
         let data = vec![1, 3, 1, 2, 3];
 
         let mut buffer = vec![];
-        encode(data.clone().into_iter(), &mut buffer);
-        let iter = Decoder::try_new(&buffer)?;
+        encode(data.clone().into_iter(), &mut buffer, 1);
+        let (iter, _) = Decoder::try_new(&buffer)?;
 
-        let result = iter.collect::<Result<Vec<_>, _>>()?;
+        let result = iter.collect::<ParquetResult<Vec<_>>>()?;
         assert_eq!(result, data);
         Ok(())
     }
@@ -27,10 +28,10 @@ mod tests {
         let data = vec![1, 3, -1, 2, 3];
 
         let mut buffer = vec![];
-        encode(data.clone().into_iter(), &mut buffer);
-        let iter = Decoder::try_new(&buffer)?;
+        encode(data.clone().into_iter(), &mut buffer, 1);
+        let (iter, _) = Decoder::try_new(&buffer)?;
 
-        let result = iter.collect::<Result<Vec<_>, _>>()?;
+        let result = iter.collect::<ParquetResult<Vec<_>>>()?;
         assert_eq!(result, data);
         Ok(())
     }
@@ -48,10 +49,10 @@ mod tests {
         ];
 
         let mut buffer = vec![];
-        encode(data.clone().into_iter(), &mut buffer);
-        let iter = Decoder::try_new(&buffer)?;
+        encode(data.clone().into_iter(), &mut buffer, 1);
+        let (iter, _) = Decoder::try_new(&buffer)?;
 
-        let result = iter.collect::<Result<Vec<_>, ParquetError>>()?;
+        let result = iter.collect::<ParquetResult<Vec<_>>>()?;
         assert_eq!(result, data);
         Ok(())
    }
@@ -64,10 +65,10 @@ mod tests {
         }
 
         let mut buffer = vec![];
-        encode(data.clone().into_iter(), &mut buffer);
-        let iter = Decoder::try_new(&buffer)?;
+        encode(data.clone().into_iter(), &mut buffer, 1);
+        let (iter, _) = Decoder::try_new(&buffer)?;
 
-        let result = iter.collect::<Result<Vec<_>, _>>()?;
+        let result = iter.collect::<ParquetResult<Vec<_>>>()?;
         assert_eq!(result, data);
         Ok(())
     }
@@ -77,14 +78,47 @@ mod tests {
         let data = vec![2, 3, 1, 2, 1];
 
         let mut buffer = vec![];
-        encode(data.clone().into_iter(), &mut buffer);
-        let len = buffer.len();
-        let mut iter = Decoder::try_new(&buffer)?;
+        encode(data.clone().into_iter(), &mut buffer, 1);
+        let (iter, _) = Decoder::try_new(&buffer)?;
 
-        let result = iter.by_ref().collect::<Result<Vec<_>, _>>()?;
+        let result = iter.collect::<ParquetResult<Vec<_>>>()?;
+        assert_eq!(result, data);
+
+        Ok(())
+    }
+
+    #[test]
+    fn overflow_constant() -> ParquetResult<()> {
+        let data = vec![i64::MIN, i64::MAX, i64::MIN, i64::MAX];
+
+        let mut buffer = vec![];
+        encode(data.clone().into_iter(), &mut buffer, 1);
+        let (iter, _) = Decoder::try_new(&buffer)?;
+
+        let result = iter.collect::<ParquetResult<Vec<_>>>()?;
+        assert_eq!(result, data);
+
+        Ok(())
+    }
+
+    #[test]
+    fn overflow_vary() -> ParquetResult<()> {
+        let data = vec![
+            0,
+            i64::MAX,
+            i64::MAX - 1,
+            i64::MIN + 1,
+            i64::MAX,
+            i64::MIN + 2,
+        ];
+
+        let mut buffer = vec![];
+        encode(data.clone().into_iter(), &mut buffer, 1);
+        let (iter, _) = Decoder::try_new(&buffer)?;
+
+        let result = iter.collect::<ParquetResult<Vec<_>>>()?;
         assert_eq!(result, data);
-        assert_eq!(iter.consumed_bytes(), len);
         Ok(())
     }
 }
diff --git a/crates/polars-parquet/src/parquet/encoding/delta_byte_array/decoder.rs b/crates/polars-parquet/src/parquet/encoding/delta_byte_array/decoder.rs
index 9196eaedb7c8..03889e0aa5d3 100644
--- a/crates/polars-parquet/src/parquet/encoding/delta_byte_array/decoder.rs
+++ b/crates/polars-parquet/src/parquet/encoding/delta_byte_array/decoder.rs
@@ -1,5 +1,6 @@
-use super::super::{delta_bitpacked, delta_length_byte_array};
-use crate::parquet::error::ParquetError;
+use super::super::delta_bitpacked;
+use crate::parquet::encoding::delta_bitpacked::SumGatherer;
+use crate::parquet::error::ParquetResult;
 
 /// Decodes according to [Delta strings](https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-strings-delta_byte_array--7),
 /// prefixes, lengths and values
@@ -7,32 +8,47 @@ use crate::parquet::error::ParquetError;
 /// This struct does not allocate on the heap.
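The doc comment above is terse about how DELTA_BYTE_ARRAY reassembles values from prefix lengths and suffixes, so here is a minimal sketch of the reconstruction rule, using the `["Hello", "Helicopter"]` pair exercised by the tests below (standalone illustration, not the decoder's actual code path):

```rust
// Each value = the first `prefix_length` bytes of the previous value,
// followed by that value's own suffix bytes.
fn main() {
    let prefix_lengths = [0usize, 3]; // "Helicopter" reuses "Hel" from "Hello"
    let suffixes: [&[u8]; 2] = [b"Hello", b"icopter"];

    let mut last: Vec<u8> = Vec::new();
    let mut out: Vec<Vec<u8>> = Vec::new();
    for (&prefix_length, suffix) in prefix_lengths.iter().zip(suffixes) {
        let mut value = last[..prefix_length].to_vec(); // shared prefix
        value.extend_from_slice(suffix);                // fresh suffix bytes
        last.clone_from(&value);
        out.push(value);
    }
    assert_eq!(out, vec![b"Hello".to_vec(), b"Helicopter".to_vec()]);
}
```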
#[derive(Debug)] pub struct Decoder<'a> { - values: &'a [u8], - prefix_lengths: delta_bitpacked::Decoder<'a>, + pub(crate) prefix_lengths: delta_bitpacked::Decoder<'a>, + pub(crate) suffix_lengths: delta_bitpacked::Decoder<'a>, + pub(crate) values: &'a [u8], + + pub(crate) offset: usize, + pub(crate) last: Vec, } impl<'a> Decoder<'a> { - pub fn try_new(values: &'a [u8]) -> Result { - let prefix_lengths = delta_bitpacked::Decoder::try_new(values)?; + pub fn try_new(values: &'a [u8]) -> ParquetResult { + let (prefix_lengths, values) = delta_bitpacked::Decoder::try_new(values)?; + let (suffix_lengths, values) = delta_bitpacked::Decoder::try_new(values)?; + Ok(Self { - values, prefix_lengths, + suffix_lengths, + values, + + offset: 0, + last: Vec::with_capacity(32), }) } - pub fn into_lengths(self) -> Result, ParquetError> { - assert_eq!(self.prefix_lengths.size_hint().0, 0); - delta_length_byte_array::Decoder::try_new( - &self.values[self.prefix_lengths.consumed_bytes()..], - ) + pub fn values(&self) -> &'a [u8] { + self.values } -} -impl<'a> Iterator for Decoder<'a> { - type Item = Result; + pub fn len(&self) -> usize { + debug_assert_eq!(self.prefix_lengths.len(), self.suffix_lengths.len()); + self.prefix_lengths.len() + } - fn next(&mut self) -> Option { - self.prefix_lengths.next().map(|x| x.map(|x| x as u32)) + pub fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + let mut prefix_sum = 0usize; + self.prefix_lengths + .gather_n_into(&mut prefix_sum, n, &mut SumGatherer(0))?; + let mut suffix_sum = 0usize; + self.suffix_lengths + .gather_n_into(&mut suffix_sum, n, &mut SumGatherer(0))?; + self.offset += prefix_sum + suffix_sum; + Ok(()) } } @@ -40,8 +56,44 @@ impl<'a> Iterator for Decoder<'a> { mod tests { use super::*; + impl<'a> Iterator for Decoder<'a> { + type Item = ParquetResult>; + + fn next(&mut self) -> Option { + if self.len() == 0 { + return None; + } + + let mut prefix_length = vec![]; + let mut suffix_length = vec![]; + if let Err(e) = self.prefix_lengths.collect_n(&mut prefix_length, 1) { + return Some(Err(e)); + } + if let Err(e) = self.suffix_lengths.collect_n(&mut suffix_length, 1) { + return Some(Err(e)); + } + let prefix_length = prefix_length[0]; + let suffix_length = suffix_length[0]; + + let prefix_length = prefix_length as usize; + let suffix_length = suffix_length as usize; + + let mut value = Vec::with_capacity(prefix_length + suffix_length); + + value.extend_from_slice(&self.last[..prefix_length]); + value.extend_from_slice(&self.values[self.offset..self.offset + suffix_length]); + + self.last.clear(); + self.last.extend_from_slice(&value); + + self.offset += suffix_length; + + Some(Ok(value)) + } + } + #[test] - fn test_bla() -> Result<(), ParquetError> { + fn test_bla() -> ParquetResult<()> { // VALIDATED from spark==3.1.1 let data = &[ 128, 1, 4, 2, 0, 0, 0, 0, 0, 0, 128, 1, 4, 2, 10, 0, 0, 0, 0, 0, 72, 101, 108, 108, @@ -50,31 +102,16 @@ mod tests { // because they are beyond the sum of all lengths. 
1, 2, 3, ]; - // result of encoding - let expected = &["Hello", "World"]; - let expected_lengths = expected.iter().map(|x| x.len() as i32).collect::>(); - let expected_prefixes = vec![0, 0]; - let expected_values = expected.join(""); - let expected_values = expected_values.as_bytes(); - - let mut decoder = Decoder::try_new(data)?; - let prefixes = decoder.by_ref().collect::, _>>()?; - assert_eq!(prefixes, expected_prefixes); - - // move to the lengths - let mut decoder = decoder.into_lengths()?; - - let lengths = decoder.by_ref().collect::, _>>()?; - assert_eq!(lengths, expected_lengths); - - // move to the values - let values = decoder.values(); - assert_eq!(values, expected_values); + + let decoder = Decoder::try_new(data)?; + let values = decoder.collect::, _>>()?; + assert_eq!(values, vec![b"Hello".to_vec(), b"World".to_vec()]); + Ok(()) } #[test] - fn test_with_prefix() -> Result<(), ParquetError> { + fn test_with_prefix() -> ParquetResult<()> { // VALIDATED from spark==3.1.1 let data = &[ 128, 1, 4, 2, 0, 6, 0, 0, 0, 0, 128, 1, 4, 2, 10, 4, 0, 0, 0, 0, 72, 101, 108, 108, @@ -83,24 +120,11 @@ mod tests { // because they are beyond the sum of all lengths. 1, 2, 3, ]; - // result of encoding - let expected_lengths = vec![5, 7]; - let expected_prefixes = vec![0, 3]; - let expected_values = b"Helloicopter"; - - let mut decoder = Decoder::try_new(data)?; - let prefixes = decoder.by_ref().collect::, _>>()?; - assert_eq!(prefixes, expected_prefixes); - - // move to the lengths - let mut decoder = decoder.into_lengths()?; - let lengths = decoder.by_ref().collect::, _>>()?; - assert_eq!(lengths, expected_lengths); + let decoder = Decoder::try_new(data)?; + let prefixes = decoder.collect::, _>>()?; + assert_eq!(prefixes, vec![b"Hello".to_vec(), b"Helicopter".to_vec()]); - // move to the values - let values = decoder.values(); - assert_eq!(values, expected_values); Ok(()) } } diff --git a/crates/polars-parquet/src/parquet/encoding/delta_byte_array/encoder.rs b/crates/polars-parquet/src/parquet/encoding/delta_byte_array/encoder.rs index 1e9e071c87be..3a36e90b9966 100644 --- a/crates/polars-parquet/src/parquet/encoding/delta_byte_array/encoder.rs +++ b/crates/polars-parquet/src/parquet/encoding/delta_byte_array/encoder.rs @@ -2,7 +2,10 @@ use super::super::delta_bitpacked; use crate::parquet::encoding::delta_length_byte_array; /// Encodes an iterator of according to DELTA_BYTE_ARRAY -pub fn encode<'a, I: Iterator + Clone>(iterator: I, buffer: &mut Vec) { +pub fn encode<'a, I: ExactSizeIterator + Clone>( + iterator: I, + buffer: &mut Vec, +) { let mut previous = b"".as_ref(); let mut sum_lengths = 0; @@ -22,7 +25,7 @@ pub fn encode<'a, I: Iterator + Clone>(iterator: I, buffer: &mu prefix_length as i64 }) .collect::>(); - delta_bitpacked::encode(prefixes.iter().copied(), buffer); + delta_bitpacked::encode(prefixes.iter().copied(), buffer, 1); let remaining = iterator .zip(prefixes) diff --git a/crates/polars-parquet/src/parquet/encoding/delta_byte_array/mod.rs b/crates/polars-parquet/src/parquet/encoding/delta_byte_array/mod.rs index b5927ab95b58..2bb51511d67e 100644 --- a/crates/polars-parquet/src/parquet/encoding/delta_byte_array/mod.rs +++ b/crates/polars-parquet/src/parquet/encoding/delta_byte_array/mod.rs @@ -17,13 +17,7 @@ mod tests { let mut decoder = Decoder::try_new(&buffer)?; let prefixes = decoder.by_ref().collect::, _>>()?; - assert_eq!(prefixes, vec![0, 3]); - - // move to the lengths - let mut decoder = decoder.into_lengths()?; - - let lengths = decoder.by_ref().collect::, _>>()?; - 
assert_eq!(lengths, vec![5, 7]); + assert_eq!(prefixes, vec![b"Hello".to_vec(), b"Helicopter".to_vec()]); // move to the values let values = decoder.values(); diff --git a/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/decoder.rs b/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/decoder.rs index bd9a77a00add..b3191e0a51ff 100644 --- a/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/decoder.rs +++ b/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/decoder.rs @@ -1,80 +1,57 @@ use super::super::delta_bitpacked; -use crate::parquet::error::ParquetError; +use crate::parquet::encoding::delta_bitpacked::SumGatherer; +use crate::parquet::error::ParquetResult; /// Decodes [Delta-length byte array](https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-length-byte-array-delta_length_byte_array--6) /// lengths and values. /// # Implementation /// This struct does not allocate on the heap. -/// # Example -/// ``` -/// use polars_parquet::parquet::encoding::delta_length_byte_array::Decoder; -/// -/// let expected = &["Hello", "World"]; -/// let expected_lengths = expected.iter().map(|x| x.len() as i32).collect::>(); -/// let expected_values = expected.join(""); -/// let expected_values = expected_values.as_bytes(); -/// let data = &[ -/// 128, 1, 4, 2, 10, 0, 0, 0, 0, 0, 72, 101, 108, 108, 111, 87, 111, 114, 108, 100, -/// ]; -/// -/// let mut decoder = Decoder::try_new(data).unwrap(); -/// -/// // Extract the lengths -/// let lengths = decoder.by_ref().collect::, _>>().unwrap(); -/// assert_eq!(lengths, expected_lengths); -/// -/// // Extract the values. This _must_ be called after consuming all lengths by reference (see above). -/// let values = decoder.into_values(); -/// -/// assert_eq!(values, expected_values); #[derive(Debug)] -pub struct Decoder<'a> { - values: &'a [u8], - lengths: delta_bitpacked::Decoder<'a>, - total_length: u32, +pub(crate) struct Decoder<'a> { + pub(crate) lengths: delta_bitpacked::Decoder<'a>, + pub(crate) values: &'a [u8], + pub(crate) offset: usize, } impl<'a> Decoder<'a> { - pub fn try_new(values: &'a [u8]) -> Result { - let lengths = delta_bitpacked::Decoder::try_new(values)?; + pub fn try_new(values: &'a [u8]) -> ParquetResult { + let (lengths, values) = delta_bitpacked::Decoder::try_new(values)?; Ok(Self { - values, lengths, - total_length: 0, + values, + offset: 0, }) } - /// Consumes this decoder and returns the slice of concatenated values. - /// # Panics - /// This function panics if this iterator has not been fully consumed. - pub fn into_values(self) -> &'a [u8] { - assert_eq!(self.lengths.size_hint().0, 0); - let start = self.lengths.consumed_bytes(); - &self.values[start..start + self.total_length as usize] + pub(crate) fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + let mut sum = 0usize; + self.lengths + .gather_n_into(&mut sum, n, &mut SumGatherer(0))?; + self.offset += sum; + Ok(()) } - /// Returns the slice of concatenated values. - /// # Panics - /// This function panics if this iterator has not yet been fully consumed. 
- pub fn values(&self) -> &'a [u8] { - assert_eq!(self.lengths.size_hint().0, 0); - let start = self.lengths.consumed_bytes(); - &self.values[start..start + self.total_length as usize] + pub fn len(&self) -> usize { + self.lengths.len() } } +#[cfg(test)] impl<'a> Iterator for Decoder<'a> { - type Item = Result; + type Item = ParquetResult<&'a [u8]>; fn next(&mut self) -> Option { - let result = self.lengths.next(); - match result { - Some(Ok(v)) => { - self.total_length += v as u32; - Some(Ok(v as i32)) - }, - Some(Err(error)) => Some(Err(error)), - None => None, + if self.lengths.len() == 0 { + return None; + } + + let mut length = vec![]; + if let Err(e) = self.lengths.collect_n(&mut length, 1) { + return Some(Err(e)); } + let length = length[0] as usize; + let value = &self.values[self.offset..self.offset + length]; + self.offset += length; + Some(Ok(value)) } } diff --git a/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/encoder.rs b/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/encoder.rs index fc2121cf68e8..d768b10c24f3 100644 --- a/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/encoder.rs +++ b/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/encoder.rs @@ -4,7 +4,10 @@ use crate::parquet::encoding::delta_bitpacked; /// # Implementation /// This encoding is equivalent to call [`delta_bitpacked::encode`] on the lengths of the items /// of the iterator followed by extending the buffer from each item of the iterator. -pub fn encode, I: Iterator + Clone>(iterator: I, buffer: &mut Vec) { +pub fn encode, I: ExactSizeIterator + Clone>( + iterator: I, + buffer: &mut Vec, +) { let mut total_length = 0; delta_bitpacked::encode( iterator.clone().map(|x| { @@ -13,6 +16,7 @@ pub fn encode, I: Iterator + Clone>(iterator: I, buffer len as i64 }), buffer, + 1, ); buffer.reserve(total_length); iterator.for_each(|x| buffer.extend(x.as_ref())) diff --git a/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/mod.rs b/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/mod.rs index 35b5bd9fd5fb..050ac766f545 100644 --- a/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/mod.rs +++ b/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/mod.rs @@ -1,8 +1,8 @@ mod decoder; mod encoder; -pub use decoder::Decoder; -pub use encoder::encode; +pub(crate) use decoder::Decoder; +pub(crate) use encoder::encode; #[cfg(test)] mod tests { @@ -19,9 +19,18 @@ mod tests { let mut iter = Decoder::try_new(&buffer)?; let result = iter.by_ref().collect::, _>>()?; - assert_eq!(result, vec![2, 3, 1, 2, 1]); - - let result = iter.values(); + assert_eq!( + result, + vec![ + b"aa".as_ref(), + b"bbb".as_ref(), + b"a".as_ref(), + b"aa".as_ref(), + b"b".as_ref() + ] + ); + + let result = iter.values; assert_eq!(result, b"aabbbaaab".as_ref()); Ok(()) } @@ -32,8 +41,11 @@ mod tests { for i in 0..136 { data.push(format!("a{}", i)) } - let expected_values = data.join(""); - let expected_lengths = data.iter().map(|x| x.len() as i32).collect::>(); + + let expected = data + .iter() + .map(|v| v.as_bytes().to_vec()) + .collect::>(); let mut buffer = vec![]; encode(data.into_iter(), &mut buffer); @@ -41,10 +53,8 @@ mod tests { let mut iter = Decoder::try_new(&buffer)?; let result = iter.by_ref().collect::, _>>()?; - assert_eq!(result, expected_lengths); + assert_eq!(result, expected); - let result = iter.into_values(); - assert_eq!(result, expected_values.as_bytes()); Ok(()) } } diff --git 
a/crates/polars-parquet/src/parquet/read/column/mod.rs b/crates/polars-parquet/src/parquet/read/column/mod.rs index 2cd15c4f61e6..1a1277637f27 100644 --- a/crates/polars-parquet/src/parquet/read/column/mod.rs +++ b/crates/polars-parquet/src/parquet/read/column/mod.rs @@ -1,7 +1,7 @@ use std::io::{Read, Seek}; use std::vec::IntoIter; -use super::{get_field_columns, get_page_iterator, MemReader, PageFilter, PageReader}; +use super::{get_field_columns, get_page_iterator, MemReader, PageReader}; use crate::parquet::error::{ParquetError, ParquetResult}; use crate::parquet::metadata::{ColumnChunkMetaData, RowGroupMetaData}; use crate::parquet::page::CompressedPage; @@ -22,14 +22,13 @@ pub fn get_column_iterator( reader: MemReader, row_group: &RowGroupMetaData, field_name: &str, - page_filter: Option, max_page_size: usize, ) -> ColumnIterator { let columns = get_field_columns(row_group.columns(), field_name) .cloned() .collect::>(); - ColumnIterator::new(reader, columns, page_filter, max_page_size) + ColumnIterator::new(reader, columns, max_page_size) } /// State of [`MutStreamingIterator`]. @@ -55,7 +54,6 @@ pub trait MutStreamingIterator: Sized { pub struct ColumnIterator { reader: MemReader, columns: Vec, - page_filter: Option, max_page_size: usize, } @@ -65,14 +63,12 @@ impl ColumnIterator { pub fn new( reader: MemReader, mut columns: Vec, - page_filter: Option, max_page_size: usize, ) -> Self { columns.reverse(); Self { reader, columns, - page_filter, max_page_size, } } @@ -87,16 +83,11 @@ impl Iterator for ColumnIterator { }; let column = self.columns.pop().unwrap(); - let iter = match get_page_iterator( - &column, - self.reader.clone(), - self.page_filter.clone(), - Vec::new(), - self.max_page_size, - ) { - Err(e) => return Some(Err(e)), - Ok(v) => v, - }; + let iter = + match get_page_iterator(&column, self.reader.clone(), Vec::new(), self.max_page_size) { + Err(e) => return Some(Err(e)), + Ok(v) => v, + }; Some(Ok((iter, column))) } } diff --git a/crates/polars-parquet/src/parquet/read/mod.rs b/crates/polars-parquet/src/parquet/read/mod.rs index cea8561193ef..e3426a38dc3c 100644 --- a/crates/polars-parquet/src/parquet/read/mod.rs +++ b/crates/polars-parquet/src/parquet/read/mod.rs @@ -8,7 +8,6 @@ mod page; mod stream; use std::io::{Seek, SeekFrom}; -use std::sync::Arc; pub use column::*; pub use compression::{decompress, BasicDecompressor}; @@ -16,7 +15,7 @@ pub use indexes::{read_columns_indexes, read_pages_locations}; pub use metadata::{deserialize_metadata, read_metadata, read_metadata_with_size}; #[cfg(feature = "async")] pub use page::{get_page_stream, get_page_stream_from_column_start}; -pub use page::{PageFilter, PageIterator, PageMetaData, PageReader}; +pub use page::{PageIterator, PageMetaData, PageReader}; use polars_utils::mmap::MemReader; #[cfg(feature = "async")] pub use stream::read_metadata as read_metadata_async; @@ -45,18 +44,14 @@ pub fn filter_row_groups( pub fn get_page_iterator( column_chunk: &ColumnChunkMetaData, mut reader: MemReader, - pages_filter: Option, scratch: Vec, max_page_size: usize, ) -> ParquetResult { - let pages_filter = pages_filter.unwrap_or_else(|| Arc::new(|_, _| true)); - let (col_start, _) = column_chunk.byte_range(); reader.seek(SeekFrom::Start(col_start))?; Ok(PageReader::new( reader, column_chunk, - pages_filter, scratch, max_page_size, )) diff --git a/crates/polars-parquet/src/parquet/read/page/mod.rs b/crates/polars-parquet/src/parquet/read/page/mod.rs index 98d76493ba50..14801839a693 100644 --- 
a/crates/polars-parquet/src/parquet/read/page/mod.rs +++ b/crates/polars-parquet/src/parquet/read/page/mod.rs @@ -2,7 +2,7 @@ mod reader; #[cfg(feature = "async")] mod stream; -pub use reader::{PageFilter, PageMetaData, PageReader}; +pub use reader::{PageMetaData, PageReader}; use crate::parquet::error::ParquetError; use crate::parquet::page::CompressedPage; diff --git a/crates/polars-parquet/src/parquet/read/page/reader.rs b/crates/polars-parquet/src/parquet/read/page/reader.rs index cf01a25d7e07..dcc94d51dec3 100644 --- a/crates/polars-parquet/src/parquet/read/page/reader.rs +++ b/crates/polars-parquet/src/parquet/read/page/reader.rs @@ -1,5 +1,5 @@ use std::io::Seek; -use std::sync::{Arc, OnceLock}; +use std::sync::OnceLock; use parquet_format_safe::thrift::protocol::TCompactInputProtocol; use polars_utils::mmap::{MemReader, MemSlice}; @@ -56,9 +56,6 @@ impl From<&ColumnChunkMetaData> for PageMetaData { } } -/// Type declaration for a page filter -pub type PageFilter = Arc bool + Send + Sync>; - /// A fallible [`Iterator`] of [`CompressedDataPage`]. This iterator reads pages back /// to back until all pages have been consumed. /// The pages from this iterator always have [`None`] [`crate::parquet::page::CompressedDataPage::selected_rows()`] since @@ -76,8 +73,6 @@ pub struct PageReader { // The number of total values in this column chunk. total_num_values: i64, - pages_filter: PageFilter, - descriptor: Descriptor, // The currently allocated buffer. @@ -95,11 +90,10 @@ impl PageReader { pub fn new( reader: MemReader, column: &ColumnChunkMetaData, - pages_filter: PageFilter, scratch: Vec, max_page_size: usize, ) -> Self { - Self::new_with_page_meta(reader, column.into(), pages_filter, scratch, max_page_size) + Self::new_with_page_meta(reader, column.into(), scratch, max_page_size) } /// Create a a new [`PageReader`] with [`PageMetaData`]. @@ -108,7 +102,6 @@ impl PageReader { pub fn new_with_page_meta( reader: MemReader, reader_meta: PageMetaData, - pages_filter: PageFilter, scratch: Vec, max_page_size: usize, ) -> Self { @@ -118,7 +111,6 @@ impl PageReader { compression: reader_meta.compression, seen_num_values: 0, descriptor: reader_meta.descriptor, - pages_filter, scratch, max_page_size, } @@ -135,6 +127,12 @@ impl PageReader { } pub fn read_dict(&mut self) -> ParquetResult> { + // If there are no pages, we cannot check if the first page is a dictionary page. Just + // return the fact there is no dictionary page. 
+ if self.reader.remaining_len() == 0 { + return Ok(None); + } + // a dictionary page exists iff the first data page is not at the start of // the column let seek_offset = self.reader.position(); @@ -190,16 +188,7 @@ impl Iterator for PageReader { fn next(&mut self) -> Option { let mut buffer = std::mem::take(&mut self.scratch); let maybe_maybe_page = next_page(self).transpose(); - if let Some(ref maybe_page) = maybe_maybe_page { - if let Ok(CompressedPage::Data(page)) = maybe_page { - // check if we should filter it (only valid for data pages) - let to_consume = (self.pages_filter)(&self.descriptor, page.header()); - if !to_consume { - self.scratch = std::mem::take(&mut buffer); - return self.next(); - } - } - } else { + if maybe_maybe_page.is_none() { // no page => we take back the buffer self.scratch = std::mem::take(&mut buffer); } diff --git a/crates/polars-parquet/src/parquet/read/page/stream.rs b/crates/polars-parquet/src/parquet/read/page/stream.rs index 7b89dc3937cd..bc1ccb32880e 100644 --- a/crates/polars-parquet/src/parquet/read/page/stream.rs +++ b/crates/polars-parquet/src/parquet/read/page/stream.rs @@ -1,13 +1,11 @@ use std::io::SeekFrom; use async_stream::try_stream; -use futures::io::{copy, sink}; use futures::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt, Stream}; use parquet_format_safe::thrift::protocol::TCompactInputStreamProtocol; use polars_utils::mmap::MemSlice; use super::reader::{finish_page, PageMetaData}; -use super::PageFilter; use crate::parquet::compression::Compression; use crate::parquet::error::{ParquetError, ParquetResult}; use crate::parquet::metadata::{ColumnChunkMetaData, Descriptor}; @@ -19,17 +17,9 @@ pub async fn get_page_stream<'a, RR: AsyncRead + Unpin + Send + AsyncSeek>( column_metadata: &'a ColumnChunkMetaData, reader: &'a mut RR, scratch: Vec, - pages_filter: PageFilter, max_page_size: usize, ) -> ParquetResult> + 'a> { - get_page_stream_with_page_meta( - column_metadata.into(), - reader, - scratch, - pages_filter, - max_page_size, - ) - .await + get_page_stream_with_page_meta(column_metadata.into(), reader, scratch, max_page_size).await } /// Returns a stream of compressed data pages from a reader that begins at the start of the column @@ -37,7 +27,6 @@ pub async fn get_page_stream_from_column_start<'a, R: AsyncRead + Unpin + Send>( column_metadata: &'a ColumnChunkMetaData, reader: &'a mut R, scratch: Vec, - pages_filter: PageFilter, max_header_size: usize, ) -> ParquetResult> + 'a> { let page_metadata: PageMetaData = column_metadata.into(); @@ -47,7 +36,6 @@ pub async fn get_page_stream_from_column_start<'a, R: AsyncRead + Unpin + Send>( page_metadata.compression, page_metadata.descriptor, scratch, - pages_filter, max_header_size, )) } @@ -57,7 +45,6 @@ pub async fn get_page_stream_with_page_meta, - pages_filter: PageFilter, max_page_size: usize, ) -> ParquetResult> + '_> { let column_start = page_metadata.column_start; @@ -68,7 +55,6 @@ pub async fn get_page_stream_with_page_meta( compression: Compression, descriptor: Descriptor, mut scratch: Vec, - pages_filter: PageFilter, max_page_size: usize, ) -> impl Stream> + '_ { let mut seen_values = 0i64; @@ -93,14 +78,6 @@ fn _get_page_stream( let read_size: usize = page_header.compressed_page_size.try_into()?; - if let Some(data_header) = data_header { - if !pages_filter(&descriptor, &data_header) { - // page to be skipped, we sill need to seek - copy(reader.take(read_size as u64), &mut sink()).await?; - continue - } - } - if read_size > max_page_size { Err(ParquetError::WouldOverAllocate)? 
} diff --git a/crates/polars-stream/src/async_executor/mod.rs b/crates/polars-stream/src/async_executor/mod.rs index eb549cc7c1fa..ea239628990f 100644 --- a/crates/polars-stream/src/async_executor/mod.rs +++ b/crates/polars-stream/src/async_executor/mod.rs @@ -42,18 +42,23 @@ pub enum TaskPriority { } /// Metadata associated with a task to help schedule it and clean it up. +struct ScopedTaskMetadata { + task_key: TaskKey, + completed_tasks: Weak>>, +} + struct TaskMetadata { priority: TaskPriority, freshly_spawned: AtomicBool, - - task_key: TaskKey, - completed_tasks: Weak>>, + scoped: Option, } impl Drop for TaskMetadata { fn drop(&mut self) { - if let Some(completed_tasks) = self.completed_tasks.upgrade() { - completed_tasks.lock().push(self.task_key); + if let Some(scoped) = &self.scoped { + if let Some(completed_tasks) = scoped.completed_tasks.upgrade() { + completed_tasks.lock().push(scoped.task_key); + } } } } @@ -296,10 +301,12 @@ impl<'scope, 'env> TaskScope<'scope, 'env> { fut, on_wake, TaskMetadata { - task_key, priority, freshly_spawned: AtomicBool::new(true), - completed_tasks: Arc::downgrade(&self.completed_tasks), + scoped: Some(ScopedTaskMetadata { + task_key, + completed_tasks: Arc::downgrade(&self.completed_tasks), + }), }, ) }; @@ -338,6 +345,26 @@ where } } +#[allow(unused)] +pub fn spawn(priority: TaskPriority, fut: F) -> JoinHandle +where + ::Output: Send + 'static, +{ + let executor = Executor::global(); + let on_wake = move |task| executor.schedule_task(task); + let (runnable, join_handle) = task::spawn( + fut, + on_wake, + TaskMetadata { + priority, + freshly_spawned: AtomicBool::new(true), + scoped: None, + }, + ); + runnable.schedule(); + join_handle +} + fn random_permutation(len: u32, rng: &mut R) -> impl Iterator { let modulus = len.next_power_of_two(); let halfwidth = modulus.trailing_zeros() / 2; diff --git a/crates/polars-stream/src/async_executor/task.rs b/crates/polars-stream/src/async_executor/task.rs index b87b2a7b4be3..b1f0dfcfbe69 100644 --- a/crates/polars-stream/src/async_executor/task.rs +++ b/crates/polars-stream/src/async_executor/task.rs @@ -312,8 +312,7 @@ impl CancelHandle { } } -#[allow(unused)] -pub fn spawn(future: F, schedule: S, metadata: M) -> JoinHandle +pub fn spawn(future: F, schedule: S, metadata: M) -> (Runnable, JoinHandle) where F: Future + Send + 'static, F::Output: Send + 'static, @@ -321,7 +320,7 @@ where M: Send + Sync + 'static, { let task = unsafe { Task::spawn(future, schedule, metadata) }; - JoinHandle(Some(task)) + (task.clone().into_runnable(), task.into_join_handle()) } /// Takes a future and turns it into a runnable task with associated metadata. 
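For context on the `task::spawn` signature change above: returning `(Runnable, JoinHandle)` instead of scheduling eagerly lets the caller decide when the task first runs, so the new global `spawn` can schedule immediately while scoped spawns keep their own bookkeeping. A toy analogue of that shape, assuming nothing about the real task internals:

```rust
use std::sync::mpsc;

// A "runnable" modelled as a boxed closure and a "join handle" as the
// receiving end of a channel; `spawn` hands back both pieces so the caller
// chooses the moment of first execution.
type Runnable = Box<dyn FnOnce() + Send>;

fn spawn<F, T>(f: F) -> (Runnable, mpsc::Receiver<T>)
where
    F: FnOnce() -> T + Send + 'static,
    T: Send + 'static,
{
    let (tx, rx) = mpsc::channel();
    let runnable: Runnable = Box::new(move || {
        let _ = tx.send(f()); // deliver the result to the join handle
    });
    (runnable, rx)
}

fn main() {
    let (runnable, handle) = spawn(|| 40 + 2);
    runnable(); // the caller, not `spawn`, decides when this runs
    assert_eq!(handle.recv().unwrap(), 42);
}
```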
diff --git a/crates/polars/tests/it/io/parquet/read/mod.rs b/crates/polars/tests/it/io/parquet/read/mod.rs index 99b1c1b7c9dd..2e98f5dcacaa 100644 --- a/crates/polars/tests/it/io/parquet/read/mod.rs +++ b/crates/polars/tests/it/io/parquet/read/mod.rs @@ -200,7 +200,6 @@ pub fn read_column( reader, &metadata.row_groups[row_group], field.name(), - None, usize::MAX, ); diff --git a/crates/polars/tests/it/io/parquet/read/primitive_nested.rs b/crates/polars/tests/it/io/parquet/read/primitive_nested.rs index e4abd2046432..36fdb254420a 100644 --- a/crates/polars/tests/it/io/parquet/read/primitive_nested.rs +++ b/crates/polars/tests/it/io/parquet/read/primitive_nested.rs @@ -1,6 +1,7 @@ +use polars_parquet::parquet::encoding::bitpacked::{Unpackable, Unpacked}; use polars_parquet::parquet::encoding::hybrid_rle::HybridRleDecoder; use polars_parquet::parquet::encoding::{bitpacked, uleb128, Encoding}; -use polars_parquet::parquet::error::ParquetError; +use polars_parquet::parquet::error::{ParquetError, ParquetResult}; use polars_parquet::parquet::page::{split_buffer, DataPage, EncodedSplitBuffer}; use polars_parquet::parquet::read::levels::get_bit_width; use polars_parquet::parquet::types::NativeType; @@ -171,6 +172,51 @@ pub fn page_to_array( } } +pub struct DecoderIter<'a, T: Unpackable> { + pub(crate) decoder: bitpacked::Decoder<'a, T>, + pub(crate) buffered: T::Unpacked, + pub(crate) unpacked_start: usize, + pub(crate) unpacked_end: usize, +} + +impl<'a, T: Unpackable> Iterator for DecoderIter<'a, T> { + type Item = T; + + fn next(&mut self) -> Option { + if self.unpacked_start >= self.unpacked_end { + let length; + (self.buffered, length) = self.decoder.chunked().next_inexact()?; + debug_assert!(length > 0); + self.unpacked_start = 1; + self.unpacked_end = length; + return Some(self.buffered[0]); + } + + let v = self.buffered[self.unpacked_start]; + self.unpacked_start += 1; + Some(v) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.decoder.len() + self.unpacked_end - self.unpacked_start; + (len, Some(len)) + } +} + +impl<'a, T: Unpackable> ExactSizeIterator for DecoderIter<'a, T> {} + +impl<'a, T: Unpackable> DecoderIter<'a, T> { + pub fn new(packed: &'a [u8], num_bits: usize, length: usize) -> ParquetResult { + assert!(num_bits > 0); + Ok(Self { + decoder: bitpacked::Decoder::try_new(packed, num_bits, length)?, + buffered: T::Unpacked::zero(), + unpacked_start: 0, + unpacked_end: 0, + }) + } +} + fn read_dict_array( rep_levels: &[u8], def_levels: &[u8], @@ -188,8 +234,7 @@ fn read_dict_array( let (_, consumed) = uleb128::decode(values); let values = &values[consumed..]; - let indices = bitpacked::Decoder::::try_new(values, bit_width as usize, length as usize)? - .collect_into_iter(); + let indices = DecoderIter::::new(values, bit_width as usize, length as usize)?; let values = indices.map(|id| dict_values[id as usize]); diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index ddcda3a1fd8a..2f45581610c1 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -5820,7 +5820,7 @@ def group_by( >>> for name, data in df.group_by("a"): # doctest: +SKIP ... print(name) ... 
print(data) - a + ('a',) shape: (2, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -5830,7 +5830,7 @@ def group_by( │ a ┆ 1 ┆ 5 │ │ a ┆ 1 ┆ 3 │ └─────┴─────┴─────┘ - b + ('b',) shape: (2, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -5840,7 +5840,7 @@ def group_by( │ b ┆ 2 ┆ 4 │ │ b ┆ 3 ┆ 2 │ └─────┴─────┴─────┘ - c + ('c',) shape: (1, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ diff --git a/py-polars/polars/dataframe/group_by.py b/py-polars/polars/dataframe/group_by.py index 42b386fdda7d..f3a9c185de41 100644 --- a/py-polars/polars/dataframe/group_by.py +++ b/py-polars/polars/dataframe/group_by.py @@ -93,13 +93,15 @@ def __iter__(self) -> Self: │ b ┆ 3 │ └─────┴─────┘ """ + # Every group gather can trigger a rechunk, so do early. + self.df = self.df.rechunk() temp_col = "__POLARS_GB_GROUP_INDICES" groups_df = ( self.df.lazy() .group_by(*self.by, **self.named_by, maintain_order=self.maintain_order) .agg(F.first().agg_groups().alias(temp_col)) .collect(no_optimization=True) - ).rechunk() + ) self._group_names = groups_df.select(F.all().exclude(temp_col)).iter_rows() self._group_indices = groups_df.select(temp_col).to_series() diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 36e0ef9a462d..1fd25cc1417a 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -50,6 +50,7 @@ def read_excel( engine: ExcelSpreadsheetEngine = ..., engine_options: dict[str, Any] | None = ..., read_options: dict[str, Any] | None = ..., + has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., @@ -65,6 +66,7 @@ def read_excel( sheet_name: None = ..., engine: ExcelSpreadsheetEngine = ..., engine_options: dict[str, Any] | None = ..., + has_header: bool = ..., read_options: dict[str, Any] | None = ..., columns: Sequence[int] | Sequence[str] | None = ..., schema_overrides: SchemaDict | None = ..., @@ -82,6 +84,7 @@ def read_excel( engine: ExcelSpreadsheetEngine = ..., engine_options: dict[str, Any] | None = ..., read_options: dict[str, Any] | None = ..., + has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., @@ -100,6 +103,7 @@ def read_excel( engine: ExcelSpreadsheetEngine = ..., engine_options: dict[str, Any] | None = ..., read_options: dict[str, Any] | None = ..., + has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., @@ -116,6 +120,7 @@ def read_excel( engine: ExcelSpreadsheetEngine = ..., engine_options: dict[str, Any] | None = ..., read_options: dict[str, Any] | None = ..., + has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., @@ -132,6 +137,7 @@ def read_excel( engine: ExcelSpreadsheetEngine = ..., engine_options: dict[str, Any] | None = ..., read_options: dict[str, Any] | None = ..., + has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., @@ -149,6 +155,7 @@ def read_excel( engine: ExcelSpreadsheetEngine = "calamine", engine_options: dict[str, Any] | None = None, read_options: dict[str, Any] | None = None, + has_header: bool = True, columns: Sequence[int] | Sequence[str] | None = None, 
schema_overrides: SchemaDict | None = None, infer_schema_length: int | None = N_INFER_DEFAULT, @@ -207,6 +214,10 @@ def read_excel( * "calamine": `ExcelReader.load_sheet_by_name` * "xlsx2csv": `pl.read_csv` * "openpyxl": n/a (can only provide `engine_options`) + has_header + Indicate if the first row of the table data is a header or not. If False, + column names will be autogenerated in the following format: `column_x`, with + `x` being an enumeration over every column in the dataset, starting at 1. columns Columns to read from the sheet; if not specified, all columns are read. Can be given as a sequence of column names or indices. @@ -285,6 +296,7 @@ def read_excel( schema_overrides=schema_overrides, infer_schema_length=infer_schema_length, raise_if_empty=raise_if_empty, + has_header=has_header, columns=columns, ) @@ -295,6 +307,7 @@ def read_ods( *, sheet_id: None = ..., sheet_name: str, + has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., @@ -308,6 +321,7 @@ def read_ods( *, sheet_id: None = ..., sheet_name: None = ..., + has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., @@ -321,6 +335,7 @@ def read_ods( *, sheet_id: int, sheet_name: str, + has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., @@ -334,6 +349,7 @@ def read_ods( *, sheet_id: Literal[0] | Sequence[int], sheet_name: None = ..., + has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., @@ -347,6 +363,7 @@ def read_ods( *, sheet_id: int, sheet_name: None = ..., + has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., @@ -360,6 +377,7 @@ def read_ods( *, sheet_id: None, sheet_name: list[str] | tuple[str], + has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., @@ -372,6 +390,7 @@ def read_ods( *, sheet_id: int | Sequence[int] | None = None, sheet_name: str | list[str] | tuple[str] | None = None, + has_header: bool = True, columns: Sequence[int] | Sequence[str] | None = None, schema_overrides: SchemaDict | None = None, infer_schema_length: int | None = N_INFER_DEFAULT, @@ -396,6 +415,10 @@ def read_ods( sheet_name Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If more than one is given then a `{sheetname:frame,}` dict is returned. + has_header + Indicate if the first row of the table data is a header or not. If False, + column names will be autogenerated in the following format: `column_x`, with + `x` being an enumeration over every column in the dataset, starting at 1. columns Columns to read from the sheet; if not specified, all columns are read. Can be given as a sequence of column names or indices. 
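A short usage sketch of the new `has_header` flag documented above (it applies to `read_ods` the same way; the file name here is hypothetical):

```python
import polars as pl

# Sheet whose first row is already data, so there is no header row to
# consume; names are autogenerated as column_1, column_2, ... in order.
df = pl.read_excel(
    "data.xlsx",  # hypothetical path
    sheet_name="data",
    engine="calamine",
    has_header=False,
)
print(df.columns)  # e.g. ['column_1', 'column_2', 'column_3']
```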
@@ -446,6 +469,7 @@ def read_ods( schema_overrides=schema_overrides, infer_schema_length=infer_schema_length, raise_if_empty=raise_if_empty, + has_header=has_header, columns=columns, ) @@ -495,6 +519,7 @@ def _identify_workbook(wb: str | Path | IO[bytes] | bytes) -> str | None: def _read_spreadsheet( sheet_id: int | Sequence[int] | None, sheet_name: str | list[str] | tuple[str] | None, + *, source: str | Path | IO[bytes] | bytes, engine: ExcelSpreadsheetEngine, engine_options: dict[str, Any] | None = None, @@ -502,7 +527,7 @@ def _read_spreadsheet( schema_overrides: SchemaDict | None = None, infer_schema_length: int | None = N_INFER_DEFAULT, columns: Sequence[int] | Sequence[str] | None = None, - *, + has_header: bool = True, raise_if_empty: bool = True, ) -> pl.DataFrame | dict[str, pl.DataFrame]: if isinstance(source, (str, Path)): @@ -510,37 +535,16 @@ def _read_spreadsheet( if looks_like_url(source): source = process_file_url(source) - read_options = (read_options or {}).copy() + read_options = _get_read_options( + read_options, + engine=engine, + columns=columns, + has_header=has_header, + infer_schema_length=infer_schema_length, + ) engine_options = (engine_options or {}).copy() schema_overrides = dict(schema_overrides or {}) - # normalise some top-level parameters to 'read_options' entries - if engine == "calamine": - if ("use_columns" in read_options) and columns: - msg = 'cannot specify both `columns` and `read_options["use_columns"]`' - raise ParameterCollisionError(msg) - elif ("schema_sample_rows" in read_options) and ( - infer_schema_length != N_INFER_DEFAULT - ): - msg = 'cannot specify both `infer_schema_length` and `read_options["schema_sample_rows"]`' - raise ParameterCollisionError(msg) - - read_options["schema_sample_rows"] = infer_schema_length - - elif engine == "xlsx2csv": - if ("columns" in read_options) and columns: - msg = 'cannot specify both `columns` and `read_options["columns"]`' - raise ParameterCollisionError(msg) - elif ("infer_schema_length" in read_options) and ( - infer_schema_length != N_INFER_DEFAULT - ): - msg = 'cannot specify both `infer_schema_length` and `read_options["infer_schema_length"]`' - raise ParameterCollisionError(msg) - - read_options["infer_schema_length"] = infer_schema_length - else: - read_options["infer_schema_length"] = infer_schema_length - # establish the reading function, parser, and available worksheets reader_fn, parser, worksheets = _initialise_spreadsheet_parser( engine, source, engine_options @@ -573,6 +577,59 @@ def _read_spreadsheet( return next(iter(parsed_sheets.values())) +def _get_read_options( + read_options: dict[str, Any] | None, + *, + engine: ExcelSpreadsheetEngine, + columns: Sequence[int] | Sequence[str] | None, + infer_schema_length: int | None, + has_header: bool, +) -> dict[str, Any]: + """Normalise top-level parameters to engine-specific 'read_options' dict.""" + read_options = (read_options or {}).copy() + if engine == "calamine": + if ("use_columns" in read_options) and columns: + msg = 'cannot specify both `columns` and `read_options["use_columns"]`' + raise ParameterCollisionError(msg) + elif read_options.get("header_row") is not None and has_header is False: + msg = 'the values of `has_header` and `read_options["header_row"]` are not compatible' + raise ParameterCollisionError(msg) + elif ("schema_sample_rows" in read_options) and ( + infer_schema_length != N_INFER_DEFAULT + ): + msg = 'cannot specify both `infer_schema_length` and `read_options["schema_sample_rows"]`' + raise 
ParameterCollisionError(msg) + + read_options["schema_sample_rows"] = infer_schema_length + if has_header is False and "header_row" not in read_options: + read_options["header_row"] = None + + elif engine == "xlsx2csv": + if ("columns" in read_options) and columns: + msg = 'cannot specify both `columns` and `read_options["columns"]`' + raise ParameterCollisionError(msg) + elif ( + "has_header" in read_options + and read_options["has_header"] is not has_header + ): + msg = 'the values of `has_header` and `read_options["has_header"]` are not compatible' + raise ParameterCollisionError(msg) + elif ("infer_schema_length" in read_options) and ( + infer_schema_length != N_INFER_DEFAULT + ): + msg = 'cannot specify both `infer_schema_length` and `read_options["infer_schema_length"]`' + raise ParameterCollisionError(msg) + + read_options["infer_schema_length"] = infer_schema_length + if "has_header" not in read_options: + read_options["has_header"] = has_header + else: + read_options["infer_schema_length"] = infer_schema_length + read_options["has_header"] = has_header + + return read_options + + def _get_sheet_names( sheet_id: int | Sequence[int] | None, sheet_name: str | list[str] | tuple[str] | None, @@ -695,13 +752,7 @@ def _csv_buffer_to_frame( """Translate StringIO buffer containing delimited data as a DataFrame.""" # handle (completely) empty sheet data if csv.tell() == 0: - if raise_if_empty: - msg = ( - "empty Excel sheet" - "\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`." - ) - raise NoDataError(msg) - return pl.DataFrame() + return _empty_frame(raise_if_empty) if read_options is None: read_options = {} @@ -754,18 +805,21 @@ def _drop_null_data(df: pl.DataFrame, *, raise_if_empty: bool) -> pl.DataFrame: df = df.drop(*null_cols) if len(df) == 0 and len(df.columns) == 0: - if not raise_if_empty: - return df - else: - msg = ( - "empty Excel sheet" - "\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`." - ) - raise NoDataError(msg) + return _empty_frame(raise_if_empty) return df.filter(~F.all_horizontal(F.all().is_null())) +def _empty_frame(raise_if_empty: bool) -> pl.DataFrame: # noqa: FBT001 + if raise_if_empty: + msg = ( + "empty Excel sheet" + "\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`." 
+ ) + raise NoDataError(msg) + return pl.DataFrame() + + def _reorder_columns( df: pl.DataFrame, columns: Sequence[int] | Sequence[str] | None ) -> pl.DataFrame: @@ -788,6 +842,7 @@ def _read_spreadsheet_openpyxl( ) -> pl.DataFrame: """Use the 'openpyxl' library to read data from the given worksheet.""" infer_schema_length = read_options.pop("infer_schema_length", None) + has_header = read_options.pop("has_header", True) no_inference = infer_schema_length == 0 ws = parser[sheet_name] @@ -797,17 +852,28 @@ def _read_spreadsheet_openpyxl( if tables := getattr(ws, "tables", None): table = next(iter(tables.values())) rows = list(ws[table.ref]) - header.extend(cell.value for cell in rows.pop(0)) + if not rows: + return _empty_frame(raise_if_empty) + if has_header: + header.extend(cell.value for cell in rows.pop(0)) + else: + header.extend(f"column_{n}" for n in range(1, len(rows[0]) + 1)) if table.totalsRowCount: rows = rows[: -table.totalsRowCount] - rows_iter = iter(rows) + rows_iter = rows else: - rows_iter = ws.iter_rows() - for row in rows_iter: - row_values = [cell.value for cell in row] - if any(v is not None for v in row_values): - header.extend(row_values) - break + if not has_header: + if not (rows_iter := list(ws.iter_rows())): + return _empty_frame(raise_if_empty) + n_cols = len(rows_iter[0]) + header = [f"column_{n}" for n in range(1, n_cols + 1)] + else: + rows_iter = ws.iter_rows() + for row in rows_iter: + row_values = [cell.value for cell in row] + if any(v is not None for v in row_values): + header.extend(row_values) + break dtype = String if no_inference else None series_data = [] @@ -815,8 +881,8 @@ def _read_spreadsheet_openpyxl( if name: values = [cell.value for cell in column_data] if no_inference or (dtype := (schema_overrides or {}).get(name)) == String: # type: ignore[assignment] - # note: if we init series with mixed-type data (eg: str/int) - # the non-strings will become null, so we handle the cast here + # note: if we initialise the series with mixed-type data (eg: str/int) + # then the non-strings will become null, so we handle the cast here values = [str(v) if (v is not None) else v for v in values] s = pl.Series(name, values, dtype=dtype, strict=False) @@ -889,6 +955,10 @@ def _read_spreadsheet_calamine( else: ws_arrow = parser.load_sheet_eager(sheet_name, **read_options) df = from_arrow(ws_arrow) + if read_options.get("header_row", False) is None and not read_options.get( + "column_names" + ): + df.columns = [f"column_{i}" for i in range(1, len(df.columns) + 1)] # note: even if we applied parser dtypes we still re-apply schema_overrides # natively as we can refine integer/float types, temporal precision, etc. 
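And a sketch of the collision guard that `_get_read_options` adds above: the top-level `has_header` and an engine-level override must agree, otherwise a `ParameterCollisionError` is raised (hypothetical file name):

```python
import polars as pl
from polars.exceptions import ParameterCollisionError

try:
    pl.read_excel(
        "data.xlsx",  # hypothetical path
        engine="xlsx2csv",
        has_header=False,
        read_options={"has_header": True},  # contradicts the top-level flag
    )
except ParameterCollisionError as exc:
    print(exc)  # the two `has_header` values are not compatible
```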
diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index 265cc71d07c4..1351af61d582 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -623,7 +623,7 @@ def test_struct_categorical_5843() -> None: def test_empty_struct() -> None: # List df = pl.DataFrame({"a": [[{}]]}) - assert df.to_dict(as_series=False) == {"a": [[{"": None}]]} + assert df.to_dict(as_series=False) == {"a": [[None]]} # Struct one not empty df = pl.DataFrame({"a": [[{}, {"a": 10}]]}) @@ -631,7 +631,7 @@ def test_empty_struct() -> None: # Empty struct df = pl.DataFrame({"a": [{}]}) - assert df.to_dict(as_series=False) == {"a": [{"": None}]} + assert df.to_dict(as_series=False) == {"a": [None]} @pytest.mark.parametrize( @@ -710,7 +710,7 @@ def test_struct_null_cast() -> None: .lazy() .select([pl.lit(None, dtype=pl.Null).cast(dtype, strict=True)]) .collect() - ).to_dict(as_series=False) == {"literal": [{"a": None, "b": None, "c": None}]} + ).to_dict(as_series=False) == {"literal": [None]} def test_nested_struct_in_lists_cast() -> None: @@ -976,3 +976,23 @@ def test_named_exprs() -> None: res = df.select(pl.struct(schema=schema, b=pl.col("a"))) assert res.to_dict(as_series=False) == {"b": [{"b": 1}]} assert res.schema["b"] == pl.Struct(schema) + + +def test_struct_outer_nullability_zip_18119() -> None: + df = pl.Series("int", [0, 1, 2, 3], dtype=pl.Int64).to_frame() + assert df.lazy().with_columns( + result=pl.when(pl.col("int") >= 1).then( + pl.struct( + a=pl.when(pl.col("int") % 2 == 1).then(True), + b=pl.when(pl.col("int") >= 2).then(False), + ) + ) + ).collect().to_dict(as_series=False) == { + "int": [0, 1, 2, 3], + "result": [ + None, + {"a": True, "b": None}, + {"a": None, "b": False}, + {"a": True, "b": False}, + ], + } diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 69a2d1aa230d..2f9c3436c4be 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -13,6 +13,7 @@ import pyarrow.parquet as pq import pytest from hypothesis import HealthCheck, given, settings +from hypothesis import strategies as st import polars as pl from polars.exceptions import ComputeError @@ -1389,6 +1390,20 @@ def test_scan_round_trip_parametric(tmp_path: Path, df: pl.DataFrame) -> None: test_scan_round_trip(tmp_path, df) +def test_empty_rg_no_dict_page_18146() -> None: + df = pl.DataFrame( + { + "a": [], + }, + schema={"a": pl.String}, + ) + + f = io.BytesIO() + pq.write_table(df.to_arrow(), f, compression="NONE", use_dictionary=False) + f.seek(0) + assert_frame_equal(pl.read_parquet(f), df) + + def test_write_sliced_lists_18069() -> None: f = io.BytesIO() a = pl.Series(3 * [None, ["$"] * 3], dtype=pl.List(pl.String)) @@ -1414,3 +1429,84 @@ def test_null_array_dict_pages_18085() -> None: test.to_parquet(f) f.seek(0) pl.read_parquet(f) + + +@given( + df=dataframes( + min_size=1, + max_size=1000, + allowed_dtypes=[ + pl.List, + pl.Int8, + pl.Int16, + pl.Int32, + pl.Int64, + pl.UInt8, + pl.UInt16, + pl.UInt32, + pl.UInt64, + ], + ), + row_group_size=st.integers(min_value=10, max_value=1000), +) +def test_delta_encoding_roundtrip(df: pl.DataFrame, row_group_size: int) -> None: + print(df.schema) + print(df) + + f = io.BytesIO() + pq.write_table( + df.to_arrow(), + f, + compression="NONE", + use_dictionary=False, + column_encoding="DELTA_BINARY_PACKED", + write_statistics=False, + row_group_size=row_group_size, + ) + + f.seek(0) + 
assert_frame_equal(pl.read_parquet(f), df) + + +@given( + df=dataframes(min_size=1, max_size=1000, allowed_dtypes=[pl.String, pl.Binary]), + row_group_size=st.integers(min_value=10, max_value=1000), +) +def test_delta_length_byte_array_encoding_roundtrip( + df: pl.DataFrame, row_group_size: int +) -> None: + f = io.BytesIO() + pq.write_table( + df.to_arrow(), + f, + compression="NONE", + use_dictionary=False, + column_encoding="DELTA_LENGTH_BYTE_ARRAY", + write_statistics=False, + row_group_size=row_group_size, + ) + + f.seek(0) + assert_frame_equal(pl.read_parquet(f), df) + + +@given( + df=dataframes(min_size=1, max_size=1000, allowed_dtypes=[pl.String, pl.Binary]), + row_group_size=st.integers(min_value=10, max_value=1000), +) +def test_delta_strings_encoding_roundtrip( + df: pl.DataFrame, row_group_size: int +) -> None: + f = io.BytesIO() + pq.write_table( + df.to_arrow(), + f, + compression="NONE", + use_dictionary=False, + column_encoding="DELTA_BYTE_ARRAY", + write_statistics=False, + row_group_size=row_group_size, + ) + + f.seek(0) + assert_frame_equal(pl.read_parquet(f), df) diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index 57354a4a336b..8869affa1e1b 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -644,20 +644,22 @@ def test_excel_round_trip(write_params: dict[str, Any]) -> None: engine: ExcelSpreadsheetEngine for engine in ("calamine", "xlsx2csv"): - read_options = ( - {} + read_options, has_header = ( + ({}, True) if write_params.get("include_header", True) else ( - {"has_header": False, "new_columns": ["dtm", "str", "val"]} + {"new_columns": ["dtm", "str", "val"]} if engine == "xlsx2csv" - else {"header_row": None, "column_names": ["dtm", "str", "val"]} + else {"column_names": ["dtm", "str", "val"]}, + False, ) ) + fmt_strptime = "%Y-%m-%d" if write_params.get("dtype_formats", {}).get(pl.Date) == "dd-mm-yyyy": fmt_strptime = "%d-%m-%Y" - # write to an xlsx with polars, using various parameters... + # write to xlsx using various parameters... 
xls = BytesIO() _wb = df.write_excel(workbook=xls, worksheet="data", **write_params) @@ -667,6 +669,7 @@ def test_excel_round_trip(write_params: dict[str, Any]) -> None: sheet_name="data", engine=engine, read_options=read_options, + has_header=has_header, )[:3].select(df.columns[:3]) if engine == "xlsx2csv": @@ -727,6 +730,19 @@ def test_excel_write_compound_types(engine: ExcelSpreadsheetEngine) -> None: ] +@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"]) +def test_excel_read_no_headers(engine: ExcelSpreadsheetEngine) -> None: + df = pl.DataFrame( + {"colx": [1, 2, 3], "coly": ["aaa", "bbb", "ccc"], "colz": [0.5, 0.0, -1.0]} + ) + xls = BytesIO() + df.write_excel(xls, worksheet="data", include_header=False) + + xldf = pl.read_excel(xls, engine=engine, has_header=False) + expected = xldf.rename({"column_1": "colx", "column_2": "coly", "column_3": "colz"}) + assert_frame_equal(df, expected) + + @pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"]) def test_excel_write_sparklines(engine: ExcelSpreadsheetEngine) -> None: from xlsxwriter import Workbook diff --git a/py-polars/tests/unit/test_queries.py b/py-polars/tests/unit/test_queries.py index 5ddf46531840..4c50a183af49 100644 --- a/py-polars/tests/unit/test_queries.py +++ b/py-polars/tests/unit/test_queries.py @@ -241,8 +241,8 @@ def map_expr(name: str) -> pl.Expr: ).to_dict(as_series=False) == { "groups": [1, 2, 3, 4], "out": [ - {"sum": None, "count": None}, - {"sum": None, "count": None}, + None, + None, {"sum": 1, "count": 1}, {"sum": 2, "count": 1}, ],