diff --git a/.github/workflows/benchmark-remote.yml b/.github/workflows/benchmark-remote.yml index f21898f9689f..22a1a1c12a61 100644 --- a/.github/workflows/benchmark-remote.yml +++ b/.github/workflows/benchmark-remote.yml @@ -2,19 +2,22 @@ name: Remote Benchmark on: workflow_dispatch: + push: + branches: + - 'main' pull_request: types: [ labeled ] concurrency: group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + cancel-in-progress: ${{ github.event.label.name == 'needs-bench' }} env: SCALE_FACTOR: '10.0' jobs: main: - if: ${{ github.event.label.name == 'needs-bench' }} + if: ${{ github.ref == 'refs/heads/main' || github.event.label.name == 'needs-bench' }} runs-on: self-hosted steps: - uses: actions/checkout@v4 @@ -62,4 +65,11 @@ jobs: - name: Run benchmark working-directory: polars-benchmark run: | - make run-polars-no-env + make run-polars-no-env | tee ../py-polars/benchmark-results + + - name: Cache the Polars build + if: ${{ github.ref == 'refs/heads/main' }} + working-directory: py-polars + run: | + "$HOME/py-polars-cache/save_benchmark_data.py" "$PWD/polars" < ./benchmark-results + "$HOME/py-polars-cache/cache-build.sh" "$PWD/polars" diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 4da3d3bad8e3..afa9219231a3 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -71,7 +71,7 @@ jobs: env: RUSTFLAGS: -C embed-bitcode -D warnings working-directory: py-polars - run: maturin develop --features new_streaming --release -- -C codegen-units=8 -C lto=thin -C target-cpu=native + run: maturin develop --release -- -C codegen-units=8 -C lto=thin -C target-cpu=native - name: Run benchmark tests uses: CodSpeedHQ/action@v3 diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml index 06b7f096524e..4981710d4773 100644 --- a/.github/workflows/release-python.yml +++ b/.github/workflows/release-python.yml @@ -194,7 +194,7 @@ jobs: command: build target: ${{ 
steps.target.outputs.target }} args: > - --release + --profile dist-release --manifest-path py-polars/Cargo.toml --out dist manylinux: ${{ matrix.architecture == 'aarch64' && '2_24' || 'auto' }} diff --git a/.github/workflows/test-python.yml b/.github/workflows/test-python.yml index 6185b905f7c7..a117f5b7fe96 100644 --- a/.github/workflows/test-python.yml +++ b/.github/workflows/test-python.yml @@ -82,7 +82,7 @@ jobs: save-if: ${{ github.ref_name == 'main' }} - name: Install Polars - run: maturin develop --features new_streaming + run: maturin develop - name: Run doctests if: github.ref_name != 'main' && matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest' diff --git a/Cargo.lock b/Cargo.lock index dbd7af500fcd..967c4f3562cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 4 [[package]] name = "addr2line" -version = "0.24.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5fb1d8e4442bd405fdfd1dacb42792696b0cf9cb15882e5d097b742a676d375" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" dependencies = [ "gimli", ] @@ -89,15 +89,15 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstyle" -version = "1.0.8" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" +checksum = "8365de52b16c035ff4fcafe0092ba9390540e3e352870ac09933bebcaa2c8c56" [[package]] name = "anyhow" -version = "1.0.89" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" +checksum = "c042108f3ed77fd83760a5fd79b53be043192bb3b9dba91d8c574c0ada7850c8" [[package]] name = "apache-avro" @@ -206,7 +206,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 
2.0.79", + "syn 2.0.85", ] [[package]] @@ -217,7 +217,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] @@ -265,9 +265,9 @@ dependencies = [ [[package]] name = "aws-config" -version = "1.5.7" +version = "1.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8191fb3091fa0561d1379ef80333c3c7191c6f0435d986e85821bcf7acbd1126" +checksum = "2d6448cfb224dd6a9b9ac734f58622dd0d4751f3589f3b777345745f46b2eb14" dependencies = [ "aws-credential-types", "aws-runtime", @@ -368,9 +368,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.44.0" +version = "1.47.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b90cfe6504115e13c41d3ea90286ede5aa14da294f3fe077027a6e83850843c" +checksum = "a8776850becacbd3a82a4737a9375ddb5c6832a51379f24443a98e61513f852c" dependencies = [ "aws-credential-types", "aws-runtime", @@ -390,9 +390,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.45.0" +version = "1.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "167c0fad1f212952084137308359e8e4c4724d1c643038ce163f06de9662c1d0" +checksum = "0007b5b8004547133319b6c4e87193eee2a0bcb3e4c18c75d09febe9dab7b383" dependencies = [ "aws-credential-types", "aws-runtime", @@ -412,9 +412,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.44.0" +version = "1.47.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cb5f98188ec1435b68097daa2a37d74b9d17c9caa799466338a8d1544e71b9d" +checksum = "9fffaa356e7f1c725908b75136d53207fa714e348f365671df14e95a60530ad3" dependencies = [ "aws-credential-types", "aws-runtime", @@ -435,9 +435,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.4" +version = "1.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cc8db6904450bafe7473c6ca9123f88cc11089e41a025408f992db4e22d3be68" +checksum = "5619742a0d8f253be760bfbb8e8e8368c69e3587e4637af5754e488a611499b1" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -547,9 +547,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.1" +version = "1.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1ce695746394772e7000b39fe073095db6d45a862d0767dd5ad0ac0d7f8eb87" +checksum = "be28bd063fa91fd871d131fc8b68d7cd4c5fa0869bea68daca50dcb1cbd76be2" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -562,7 +562,7 @@ dependencies = [ "http-body 0.4.6", "http-body 1.0.1", "httparse", - "hyper 0.14.30", + "hyper 0.14.31", "hyper-rustls 0.24.2", "once_cell", "pin-project-lite", @@ -591,9 +591,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.7" +version = "1.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "147100a7bea70fa20ef224a6bad700358305f5dc0f84649c53769761395b355b" +checksum = "07c9cdc179e6afbf5d391ab08c85eac817b51c87e1892a5edb5f7bbdc64314b4" dependencies = [ "base64-simd", "bytes", @@ -689,9 +689,9 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" [[package]] name = "bigdecimal" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d712318a27c7150326677b321a5fa91b55f6d9034ffd67f20319e147d40cee" +checksum = "8f850665a0385e070b64c38d2354e6c104c8479c59868d1e48a0c13ee2c7a1c1" dependencies = [ "autocfg", "libm", @@ -779,22 +779,22 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "bytemuck" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94bbb0ad554ad961ddc5da507a12a29b14e4ae5bda06b19f575a3e6079d2e2ae" +checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" dependencies = 
[ "bytemuck_derive", ] [[package]] name = "bytemuck_derive" -version = "1.7.1" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cc8b54b395f2fcfbb3d90c47b01c7f444d94d05bdeb775811dec868ac3bbc26" +checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] @@ -805,9 +805,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.7.2" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "428d9aa8fbc0670b7b8d6030a7fadd0f86151cae55e4dbbece15f3780a3dfaf3" +checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" [[package]] name = "bytes-utils" @@ -845,9 +845,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.24" +version = "1.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "812acba72f0a070b003d3697490d2b55b837230ae7c6c6497f05cc2ddbb8d938" +checksum = "c2e7962b54006dcfcc61cb72735f4d89bb97061dd6a7ed882ec6b8ee53714c6f" dependencies = [ "jobserver", "libc", @@ -924,18 +924,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.19" +version = "4.5.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7be5744db7978a28d9df86a214130d106a89ce49644cbc4e3f0c22c3fba30615" +checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.19" +version = "4.5.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5fbc17d3ef8278f55b282b2a2e75ae6f6c7d4bb70ed3d0382375104bfafdb4b" +checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54" dependencies = [ "anstyle", "clap_lex", @@ -1212,9 +1212,9 @@ dependencies = [ [[package]] name = "dary_heap" -version = "0.3.6" +version = "0.3.7" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7762d17f1241643615821a8455a0b2c3e803784b058693d990b11f2dce25a0ca" +checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" [[package]] name = "der" @@ -1308,7 +1308,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] @@ -1390,9 +1390,9 @@ dependencies = [ [[package]] name = "float-cmp" -version = "0.9.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98de4bbd547a563b716d8dfa9aad1cb19bfab00f4fa09a6a4ed21dbcf44ce9c4" +checksum = "b09cf3155332e944990140d967ff5eceb70df778b34f77d8075db46e4704e6d8" dependencies = [ "num-traits", ] @@ -1436,9 +1436,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", @@ -1451,9 +1451,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -1461,15 +1461,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" dependencies = [ "futures-core", "futures-task", @@ -1478,38 +1478,38 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-macro" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] name = "futures-sink" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-util" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-channel", "futures-core", @@ -1558,9 +1558,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.31.0" +version = "0.31.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "32085ea23f3234fc7846555e85283ba4de91e21016dc0455a16286d87a292d64" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "glob" @@ -1792,9 +1792,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "0.14.30" +version = "0.14.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a152ddd61dfaec7273fe8419ab357f33aee0d914c5f4efbf0d96fa749eea5ec9" +checksum = "8c08302e8fa335b151b788c775ff56e7a03ae64ff85c548ee820fecb70356e85" dependencies = [ "bytes", "futures-channel", @@ -1816,9 +1816,9 @@ dependencies = [ [[package]] name = "hyper" -version = "1.4.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" +checksum = "bbbff0a806a4728c99295b254c8838933b5b082d75e3cb70c8dab21fdfbcfa9a" dependencies = [ "bytes", "futures-channel", @@ -1842,7 +1842,7 @@ checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" dependencies = [ "futures-util", "http 0.2.12", - "hyper 0.14.30", + "hyper 0.14.31", "log", "rustls 0.21.12", "rustls-native-certs 0.6.3", @@ -1858,9 +1858,9 @@ checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" dependencies = [ "futures-util", "http 1.1.0", - "hyper 1.4.1", + "hyper 1.5.0", "hyper-util", - "rustls 0.23.13", + "rustls 0.23.15", "rustls-native-certs 0.8.0", "rustls-pki-types", "tokio", @@ -1879,7 +1879,7 @@ dependencies = [ "futures-util", "http 1.1.0", "http-body 1.0.1", - "hyper 1.4.1", + "hyper 1.5.0", "pin-project-lite", "socket2", "tokio", @@ -1945,9 +1945,9 @@ checksum = "f958d3d68f4167080a18141e10381e7634563984a537f2a49a30fd8e53ac5767" [[package]] name = "ipnet" -version = "2.10.0" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"187674a687eed5fe42285b40c6291f9a01517d415fad1c3cbc6a9f778af7fcd4" +checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" [[package]] name = "is-terminal" @@ -2021,9 +2021,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.70" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a" +checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" dependencies = [ "wasm-bindgen", ] @@ -2047,9 +2047,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.159" +version = "0.2.161" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" +checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" [[package]] name = "libflate" @@ -2107,9 +2107,9 @@ dependencies = [ [[package]] name = "libm" -version = "0.2.8" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +checksum = "a00419de735aac21d53b0de5ce2c03bd3627277cf471300f27ebc89f7d828047" [[package]] name = "libmimalloc-sys" @@ -2123,9 +2123,9 @@ dependencies = [ [[package]] name = "libz-ng-sys" -version = "1.1.16" +version = "1.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4436751a01da56f1277f323c80d584ffad94a3d14aecd959dd0dff75aa73a438" +checksum = "8f0f7295a34685977acb2e8cc8b08ee4a8dffd6cf278eeccddbe1ed55ba815d5" dependencies = [ "cmake", "libc", @@ -2155,11 +2155,11 @@ checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "lru" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"37ee39891760e7d94734f6f63fedc29a2e4a152f836120753a72503f09fcf904" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" dependencies = [ - "hashbrown 0.14.5", + "hashbrown 0.15.0", ] [[package]] @@ -2483,9 +2483,9 @@ dependencies = [ [[package]] name = "object" -version = "0.36.4" +version = "0.36.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084f1a5821ac4c651660a94a7153d27ac9d8a53736203f58b31945ded098070a" +checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" dependencies = [ "memchr", ] @@ -2502,7 +2502,7 @@ dependencies = [ "chrono", "futures", "humantime", - "hyper 1.4.1", + "hyper 1.5.0", "itertools 0.13.0", "md-5", "parking_lot", @@ -2523,12 +2523,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.20.1" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82881c4be219ab5faaf2ad5e5e5ecdff8c66bd7402ca3160975c93b24961afd1" -dependencies = [ - "portable-atomic", -] +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "oorandom" @@ -2637,9 +2634,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" +checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" [[package]] name = "pin-utils" @@ -2702,7 +2699,7 @@ dependencies = [ [[package]] name = "polars" -version = "0.43.1" +version = "0.44.1" dependencies = [ "ahash", "apache-avro", @@ -2731,7 +2728,7 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.43.1" +version = "0.44.1" dependencies = [ "ahash", "async-stream", @@ -2797,7 +2794,7 @@ dependencies = [ [[package]] name = "polars-compute" -version = "0.43.1" +version = "0.44.1" dependencies = [ "bytemuck", "either", @@ -2812,7 +2809,7 @@ dependencies 
= [ [[package]] name = "polars-core" -version = "0.43.1" +version = "0.44.1" dependencies = [ "ahash", "bincode", @@ -2848,7 +2845,7 @@ dependencies = [ [[package]] name = "polars-doc-examples" -version = "0.43.1" +version = "0.44.1" dependencies = [ "aws-config", "aws-sdk-s3", @@ -2862,7 +2859,7 @@ dependencies = [ [[package]] name = "polars-error" -version = "0.43.1" +version = "0.44.1" dependencies = [ "avro-schema", "object_store", @@ -2874,7 +2871,7 @@ dependencies = [ [[package]] name = "polars-expr" -version = "0.43.1" +version = "0.44.1" dependencies = [ "ahash", "bitflags", @@ -2891,12 +2888,13 @@ dependencies = [ "polars-row", "polars-time", "polars-utils", + "rand", "rayon", ] [[package]] name = "polars-ffi" -version = "0.43.1" +version = "0.44.1" dependencies = [ "polars-arrow", "polars-core", @@ -2904,7 +2902,7 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.43.1" +version = "0.44.1" dependencies = [ "ahash", "async-trait", @@ -2953,7 +2951,7 @@ dependencies = [ [[package]] name = "polars-json" -version = "0.43.1" +version = "0.44.1" dependencies = [ "ahash", "chrono", @@ -2973,7 +2971,7 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.43.1" +version = "0.44.1" dependencies = [ "ahash", "bitflags", @@ -3001,7 +2999,7 @@ dependencies = [ [[package]] name = "polars-mem-engine" -version = "0.43.1" +version = "0.44.1" dependencies = [ "futures", "memmap2", @@ -3022,7 +3020,7 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.43.1" +version = "0.44.1" dependencies = [ "ahash", "aho-corasick", @@ -3059,7 +3057,7 @@ dependencies = [ [[package]] name = "polars-parquet" -version = "0.43.1" +version = "0.44.1" dependencies = [ "ahash", "async-stream", @@ -3090,8 +3088,9 @@ dependencies = [ [[package]] name = "polars-parquet-format" -version = "2.10.0" -source = "git+https://github.com/pola-rs/parquet-format#b96e00d2b054739ee02da06987bcd7f44b82a4ef" +version = "0.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c025243dcfe8dbc57e94d9f82eb3bef10b565ab180d5b99bed87fd8aea319ce1" dependencies = [ "async-trait", "futures", @@ -3099,7 +3098,7 @@ dependencies = [ [[package]] name = "polars-pipe" -version = "0.43.1" +version = "0.44.1" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -3124,7 +3123,7 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.43.1" +version = "0.44.1" dependencies = [ "ahash", "bitflags", @@ -3162,7 +3161,7 @@ dependencies = [ [[package]] name = "polars-python" -version = "0.43.1" +version = "0.44.1" dependencies = [ "ahash", "arboard", @@ -3198,7 +3197,7 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.43.1" +version = "0.44.1" dependencies = [ "bytemuck", "polars-arrow", @@ -3208,7 +3207,7 @@ dependencies = [ [[package]] name = "polars-schema" -version = "0.43.1" +version = "0.44.1" dependencies = [ "indexmap", "polars-error", @@ -3219,7 +3218,7 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.43.1" +version = "0.44.1" dependencies = [ "hex", "once_cell", @@ -3239,7 +3238,7 @@ dependencies = [ [[package]] name = "polars-stream" -version = "0.43.1" +version = "0.44.1" dependencies = [ "atomic-waker", "crossbeam-deque", @@ -3266,7 +3265,7 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.43.1" +version = "0.44.1" dependencies = [ "atoi", "bytemuck", @@ -3286,7 +3285,7 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.43.1" +version = "0.44.1" dependencies = [ "ahash", "bytemuck", @@ -3332,9 +3331,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.86" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" dependencies = [ "unicode-ident", ] @@ -3389,7 +3388,7 @@ dependencies = [ [[package]] 
name = "py-polars" -version = "1.11.0" +version = "1.12.0" dependencies = [ "jemallocator", "libc", @@ -3448,7 +3447,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] @@ -3461,7 +3460,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] @@ -3502,7 +3501,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash 2.0.0", - "rustls 0.23.13", + "rustls 0.23.15", "socket2", "thiserror", "tokio", @@ -3519,7 +3518,7 @@ dependencies = [ "rand", "ring", "rustc-hash 2.0.0", - "rustls 0.23.13", + "rustls 0.23.15", "slab", "thiserror", "tinyvec", @@ -3659,7 +3658,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] @@ -3688,14 +3687,14 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] name = "regex" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -3747,7 +3746,7 @@ dependencies = [ "http 1.1.0", "http-body 1.0.1", "http-body-util", - "hyper 1.4.1", + "hyper 1.5.0", "hyper-rustls 0.27.3", "hyper-util", "ipnet", @@ -3758,7 +3757,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.13", + "rustls 0.23.15", "rustls-native-certs 0.8.0", "rustls-pemfile 2.2.0", "rustls-pki-types", @@ -3839,9 +3838,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.37" +version = "0.38.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811" +checksum = "aa260229e6538e52293eeb577aabd09945a09d6d9cc0fc550ed7529056c2e32a" dependencies = [ "bitflags", "errno", @@ -3864,9 +3863,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.13" +version = "0.23.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2dabaac7466917e566adb06783a81ca48944c6898a1b08b9374106dd671f4c8" +checksum = "5fbb44d7acc4e873d613422379f69f237a1b141928c02f6bc6ccfddddc2d7993" dependencies = [ "once_cell", "ring", @@ -3921,9 +3920,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e696e35370c65c9c541198af4543ccd580cf17fc25d8e05c5a242b202488c55" +checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" [[package]] name = "rustls-webpki" @@ -3948,9 +3947,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" +checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" [[package]] name = "ryu" @@ -4015,9 +4014,9 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.24" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9aaafd5a2b6e3d657ff009d82fbd630b6bd54dd4eb06f21693925cdf80f9b8b" +checksum = "01227be5826fa0690321a2ba6c5cd57a19cf3f6a09e76973b58e61de6ab9d1c1" dependencies = [ "windows-sys 0.59.0", ] @@ -4083,9 +4082,9 @@ checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" [[package]] name = "serde" -version = "1.0.210" +version = "1.0.213" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" +checksum = 
"3ea7893ff5e2466df8d720bb615088341b295f849602c6956047f8f80f0e9bc1" dependencies = [ "serde_derive", ] @@ -4101,20 +4100,20 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.210" +version = "1.0.213" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" +checksum = "7e85ad2009c50b58e87caa8cd6dac16bdf511bbfb7af6c33df902396aa480fa5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] name = "serde_json" -version = "1.0.128" +version = "1.0.132" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" +checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" dependencies = [ "indexmap", "itoa", @@ -4184,9 +4183,9 @@ dependencies = [ [[package]] name = "simd-json" -version = "0.14.0" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f0b376aada35f30a0012f5790e50aed62f91804a0682669aefdbe81c7fcb91" +checksum = "b1df0290e9bfe79ddd5ff8798ca887cd107b75353d2957efe9777296e17f26b5" dependencies = [ "ahash", "getrandom", @@ -4354,7 +4353,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] @@ -4376,9 +4375,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.79" +version = "2.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" +checksum = "5023162dfcd14ef8f32034d8bcd4cc5ddc61ef7a247c024a33e24e1f24d21b56" dependencies = [ "proc-macro2", "quote", @@ -4434,22 +4433,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.64" +version = "1.0.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" +checksum = 
"5d11abd9594d9b38965ef50805c5e469ca9cc6f197f883f717e0269a3057b3d5" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.64" +version = "1.0.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" +checksum = "ae71770322cbd277e69d762a16c444af02aa0575ac0d174f0b9562d3b37f8602" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] @@ -4509,9 +4508,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.40.0" +version = "1.41.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2b070231665d27ad9ec9b8df639893f46727666c6767db40317fbe920a5d998" +checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb" dependencies = [ "backtrace", "bytes", @@ -4532,7 +4531,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] @@ -4551,7 +4550,7 @@ version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ - "rustls 0.23.13", + "rustls 0.23.15", "rustls-pki-types", "tokio", ] @@ -4595,7 +4594,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] @@ -4640,7 +4639,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] @@ -4728,9 +4727,9 @@ checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" [[package]] name = "uuid" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" +checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" dependencies = [ "getrandom", "serde", @@ -4738,9 +4737,9 @@ dependencies = [ [[package]] name = "value-trait" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcaa56177466248ba59d693a048c0959ddb67f1151b963f904306312548cf392" +checksum = "9170e001f458781e92711d2ad666110f153e4e50bfd5cbd02db6547625714187" dependencies = [ "float-cmp", "halfbrown", @@ -4787,9 +4786,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" +checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" dependencies = [ "cfg-if", "once_cell", @@ -4798,24 +4797,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" +checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.85", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.43" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e9300f63a621e96ed275155c108eb6f843b6a26d053f122ab69724559dc8ed" +checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b" dependencies = [ "cfg-if", "js-sys", @@ -4825,9 +4824,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.93" +version = "0.2.95" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" +checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -4835,28 +4834,28 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" +checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.85", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" +checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" [[package]] name = "wasm-streams" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e072d4e72f700fb3443d8fe94a39315df013eef1104903cdb0a2abd322bbecd" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" dependencies = [ "futures-util", "js-sys", @@ -4867,9 +4866,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.70" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0" +checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" dependencies = [ "js-sys", "wasm-bindgen", @@ -4945,7 +4944,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] @@ -4956,7 +4955,7 @@ checksum = 
"29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] @@ -5184,7 +5183,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.85", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 2944e5a8587d..a57add1faf0a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ default-members = [ ] [workspace.package] -version = "0.43.1" +version = "0.44.1" authors = ["Ritchie Vink "] edition = "2021" homepage = "https://www.pola.rs/" @@ -91,27 +91,27 @@ version_check = "0.9.4" xxhash-rust = { version = "0.8.6", features = ["xxh3"] } zstd = "0.13" -polars = { version = "0.43.1", path = "crates/polars", default-features = false } -polars-compute = { version = "0.43.1", path = "crates/polars-compute", default-features = false } -polars-core = { version = "0.43.1", path = "crates/polars-core", default-features = false } -polars-error = { version = "0.43.1", path = "crates/polars-error", default-features = false } -polars-expr = { version = "0.43.1", path = "crates/polars-expr", default-features = false } -polars-ffi = { version = "0.43.1", path = "crates/polars-ffi", default-features = false } -polars-io = { version = "0.43.1", path = "crates/polars-io", default-features = false } -polars-json = { version = "0.43.1", path = "crates/polars-json", default-features = false } -polars-lazy = { version = "0.43.1", path = "crates/polars-lazy", default-features = false } -polars-mem-engine = { version = "0.43.1", path = "crates/polars-mem-engine", default-features = false } -polars-ops = { version = "0.43.1", path = "crates/polars-ops", default-features = false } -polars-parquet = { version = "0.43.1", path = "crates/polars-parquet", default-features = false } -polars-pipe = { version = "0.43.1", path = "crates/polars-pipe", default-features = false } -polars-plan = { version = "0.43.1", 
path = "crates/polars-plan", default-features = false } -polars-python = { version = "0.43.1", path = "crates/polars-python", default-features = false } -polars-row = { version = "0.43.1", path = "crates/polars-row", default-features = false } -polars-schema = { version = "0.43.1", path = "crates/polars-schema", default-features = false } -polars-sql = { version = "0.43.1", path = "crates/polars-sql", default-features = false } -polars-stream = { version = "0.43.1", path = "crates/polars-stream", default-features = false } -polars-time = { version = "0.43.1", path = "crates/polars-time", default-features = false } -polars-utils = { version = "0.43.1", path = "crates/polars-utils", default-features = false } +polars = { version = "0.44.1", path = "crates/polars", default-features = false } +polars-compute = { version = "0.44.1", path = "crates/polars-compute", default-features = false } +polars-core = { version = "0.44.1", path = "crates/polars-core", default-features = false } +polars-error = { version = "0.44.1", path = "crates/polars-error", default-features = false } +polars-expr = { version = "0.44.1", path = "crates/polars-expr", default-features = false } +polars-ffi = { version = "0.44.1", path = "crates/polars-ffi", default-features = false } +polars-io = { version = "0.44.1", path = "crates/polars-io", default-features = false } +polars-json = { version = "0.44.1", path = "crates/polars-json", default-features = false } +polars-lazy = { version = "0.44.1", path = "crates/polars-lazy", default-features = false } +polars-mem-engine = { version = "0.44.1", path = "crates/polars-mem-engine", default-features = false } +polars-ops = { version = "0.44.1", path = "crates/polars-ops", default-features = false } +polars-parquet = { version = "0.44.1", path = "crates/polars-parquet", default-features = false } +polars-pipe = { version = "0.44.1", path = "crates/polars-pipe", default-features = false } +polars-plan = { version = "0.44.1", path = "crates/polars-plan", 
default-features = false } +polars-python = { version = "0.44.1", path = "crates/polars-python", default-features = false } +polars-row = { version = "0.44.1", path = "crates/polars-row", default-features = false } +polars-schema = { version = "0.44.1", path = "crates/polars-schema", default-features = false } +polars-sql = { version = "0.44.1", path = "crates/polars-sql", default-features = false } +polars-stream = { version = "0.44.1", path = "crates/polars-stream", default-features = false } +polars-time = { version = "0.44.1", path = "crates/polars-time", default-features = false } +polars-utils = { version = "0.44.1", path = "crates/polars-utils", default-features = false } [workspace.dependencies.arrow-format] package = "polars-arrow-format" @@ -119,7 +119,7 @@ version = "0.1.0" [workspace.dependencies.arrow] package = "polars-arrow" -version = "0.43.1" +version = "0.44.1" path = "crates/polars-arrow" default-features = false features = [ @@ -136,17 +136,24 @@ features = [ # packed_simd_2 = { git = "https://github.com/rust-lang/packed_simd", rev = "e57c7ba11386147e6d2cbad7c88f376aab4bdc86" } # simd-json = { git = "https://github.com/ritchie46/simd-json", branch = "alignment" } -[profile.opt-dev] +[profile.mindebug-dev] inherits = "dev" -opt-level = 1 +debug = "line-tables-only" + +[profile.release] +lto = "thin" +debug = "line-tables-only" + +[profile.nodebug-release] +inherits = "release" +debug = false [profile.debug-release] inherits = "release" debug = true -incremental = true -codegen-units = 16 -lto = "thin" -[profile.release] +[profile.dist-release] +inherits = "release" codegen-units = 1 +debug = false lto = "fat" diff --git a/Makefile b/Makefile index 5dd746aa5b7a..534e14076b73 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,56 @@ else VENV_BIN=$(VENV)/bin endif +# Detect CPU architecture. 
+ifeq ($(OS),Windows_NT) + ifeq ($(PROCESSOR_ARCHITECTURE),AMD64) + ARCH := amd64 + else ifeq ($(PROCESSOR_ARCHITECTURE),x86) + ARCH := x86 + else ifeq ($(PROCESSOR_ARCHITECTURE),ARM64) + ARCH := arm64 + else + ARCH := unknown + endif +else + UNAME_P := $(shell uname -p) + ifeq ($(UNAME_P),x86_64) + ARCH := amd64 + else ifneq ($(filter %86,$(UNAME_P)),) + ARCH := x86 + else ifneq ($(filter arm%,$(UNAME_P)),) + ARCH := arm64 + else + ARCH := unknown + endif +endif + +# Ensure boolean arguments are normalized to 1/0 to prevent surprises. +ifdef LTS_CPU + ifeq ($(LTS_CPU),0) + else ifeq ($(LTS_CPU),1) + else +$(error LTS_CPU must be 0 or 1 (or undefined, default to 0)) + endif +endif + +# Define RUSTFLAGS and CFLAGS appropriate for the architecture. +# Keep synchronized with .github/workflows/release-python.yml. +ifeq ($(ARCH),amd64) + ifeq ($(LTS_CPU),1) + FEAT_RUSTFLAGS=-C target-feature=+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt,+cmpxchg16b + FEAT_CFLAGS=-msse3 -mssse3 -msse4.1 -msse4.2 -mpopcnt -mcx16 + else + FEAT_RUSTFLAGS=-C target-feature=+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt,+cmpxchg16b,+avx,+avx2,+fma,+bmi1,+bmi2,+lzcnt,+pclmulqdq,+movbe -Z tune-cpu=skylake + FEAT_CFLAGS=-msse3 -mssse3 -msse4.1 -msse4.2 -mpopcnt -mcx16 -mavx -mavx2 -mfma -mbmi -mbmi2 -mlzcnt -mpclmul -mmovbe -mtune=skylake + endif +endif + +override RUSTFLAGS+=$(FEAT_RUSTFLAGS) +override CFLAGS+=$(FEAT_CFLAGS) +export RUSTFLAGS +export CFLAGS + # Define command to filter pip warnings when running maturin FILTER_PIP_WARNINGS=| grep -v "don't match your environment"; test $${PIPESTATUS[0]} -eq 0 @@ -35,55 +85,37 @@ requirements-all: .venv ## Install/refresh all Python requirements (including t .PHONY: build build: .venv ## Compile and install Python Polars for development @unset CONDA_PREFIX \ - && $(VENV_BIN)/maturin develop -m py-polars/Cargo.toml \ + && $(VENV_BIN)/maturin develop -m py-polars/Cargo.toml $(ARGS) \ $(FILTER_PIP_WARNINGS) -.PHONY: build-debug-opt -build-debug-opt: .venv ## Compile 
and install Python Polars with minimal optimizations turned on +.PHONY: build-mindebug +build-mindebug: .venv ## Same as build, but don't include full debug information @unset CONDA_PREFIX \ - && $(VENV_BIN)/maturin develop -m py-polars/Cargo.toml --profile opt-dev \ + && $(VENV_BIN)/maturin develop -m py-polars/Cargo.toml --profile mindebug-dev $(ARGS) \ $(FILTER_PIP_WARNINGS) -.PHONY: build-debug-opt-subset -build-debug-opt-subset: .venv ## Compile and install Python Polars with minimal optimizations turned on and no default features +.PHONY: build-release +build-release: .venv ## Compile and install Python Polars binary with optimizations, with minimal debug symbols @unset CONDA_PREFIX \ - && $(VENV_BIN)/maturin develop -m py-polars/Cargo.toml --no-default-features --profile opt-dev \ + && $(VENV_BIN)/maturin develop -m py-polars/Cargo.toml --release $(ARGS) \ $(FILTER_PIP_WARNINGS) -.PHONY: build-opt -build-opt: .venv ## Compile and install Python Polars with nearly full optimization on and debug assertions turned off, but with debug symbols on +.PHONY: build-nodebug-release +build-nodebug-release: .venv ## Same as build-release, but without any debug symbols at all (a bit faster to build) @unset CONDA_PREFIX \ - && $(VENV_BIN)/maturin develop -m py-polars/Cargo.toml --profile debug-release \ + && $(VENV_BIN)/maturin develop -m py-polars/Cargo.toml --profile nodebug-release $(ARGS) \ $(FILTER_PIP_WARNINGS) -.PHONY: build-release -build-release: .venv ## Compile and install a faster Python Polars binary with full optimizations +.PHONY: build-debug-release +build-debug-release: .venv ## Same as build-release, but with full debug symbols turned on (a bit slower to build) @unset CONDA_PREFIX \ - && $(VENV_BIN)/maturin develop -m py-polars/Cargo.toml --release \ + && $(VENV_BIN)/maturin develop -m py-polars/Cargo.toml --profile debug-release $(ARGS) \ $(FILTER_PIP_WARNINGS) -.PHONY: build-native -build-native: .venv ## Same as build, except with native CPU 
optimizations turned on - @unset CONDA_PREFIX && RUSTFLAGS='-C target-cpu=native' \ - $(VENV_BIN)/maturin develop -m py-polars/Cargo.toml \ - $(FILTER_PIP_WARNINGS) - -.PHONY: build-debug-opt-native -build-debug-opt-native: .venv ## Same as build-debug-opt, except with native CPU optimizations turned on - @unset CONDA_PREFIX && RUSTFLAGS='-C target-cpu=native' \ - $(VENV_BIN)/maturin develop -m py-polars/Cargo.toml --profile opt-dev \ - $(FILTER_PIP_WARNINGS) - -.PHONY: build-opt-native -build-opt-native: .venv ## Same as build-opt, except with native CPU optimizations turned on - @unset CONDA_PREFIX && RUSTFLAGS='-C target-cpu=native' \ - $(VENV_BIN)/maturin develop -m py-polars/Cargo.toml --profile debug-release \ - $(FILTER_PIP_WARNINGS) - -.PHONY: build-release-native -build-release-native: .venv ## Same as build-release, except with native CPU optimizations turned on - @unset CONDA_PREFIX && RUSTFLAGS='-C target-cpu=native' \ - $(VENV_BIN)/maturin develop -m py-polars/Cargo.toml --release \ +.PHONY: build-dist-release +build-dist-release: .venv ## Compile and install Python Polars binary with super slow extra optimization turned on, for distribution + @unset CONDA_PREFIX \ + && $(VENV_BIN)/maturin develop -m py-polars/Cargo.toml --profile dist-release $(ARGS) \ $(FILTER_PIP_WARNINGS) .PHONY: check @@ -121,3 +153,6 @@ clean: ## Clean up caches, build artifacts, and the venv help: ## Display this help screen @echo -e "\033[1mAvailable commands:\033[0m" @grep -E '^[a-z.A-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-22s\033[0m %s\n", $$1, $$2}' | sort + @echo + @echo The build commands support LTS_CPU=1 for building for older CPUs, and ARGS which is passed through to maturin. + @echo 'For example to build without default features use: make build ARGS="--no-default-features".' 
diff --git a/README.md b/README.md index 2f7c5b290ad8..43ac43596813 100644 --- a/README.md +++ b/README.md @@ -233,13 +233,14 @@ This can be done by going through the following steps in sequence: 1. Install the latest [Rust compiler](https://www.rust-lang.org/tools/install) 2. Install [maturin](https://maturin.rs/): `pip install maturin` 3. `cd py-polars` and choose one of the following: - - `make build-release`, fastest binary, very long compile times - - `make build-opt`, fast binary with debug symbols, long compile times - - `make build-debug-opt`, medium-speed binary with debug assertions and symbols, medium compile times - `make build`, slow binary with debug assertions and symbols, fast compile times + - `make build-release`, fast binary without debug assertions, minimal debug symbols, long compile times + - `make build-nodebug-release`, same as build-release but without any debug symbols, slightly faster to compile + - `make build-debug-release`, same as build-release but with full debug symbols, slightly slower to compile + - `make build-dist-release`, fastest binary, extreme compile times - Append `-native` (e.g. `make build-release-native`) to enable further optimizations specific to - your CPU. This produces a non-portable binary/wheel however. +By default the binary is compiled with optimizations turned on for a modern CPU. Specify `LTS_CPU=1` +with the command if your CPU is older and does not support e.g. AVX2. Note that the Rust crate implementing the Python bindings is called `py-polars` to distinguish from the wrapped Rust crate `polars` itself. 
However, both the Python package and the Python module are named `polars`, so you diff --git a/crates/polars-arrow/Cargo.toml b/crates/polars-arrow/Cargo.toml index 2bca77858be1..5ce6d5deb9e7 100644 --- a/crates/polars-arrow/Cargo.toml +++ b/crates/polars-arrow/Cargo.toml @@ -154,7 +154,7 @@ timezones = [ ] dtype-array = [] dtype-decimal = ["atoi", "itoap"] -bigidx = [] +bigidx = ["polars-utils/bigidx"] nightly = [] performant = [] strings = [] diff --git a/crates/polars-arrow/src/array/dictionary/iterator.rs b/crates/polars-arrow/src/array/dictionary/iterator.rs index 68e95ca86fed..af6ef539572d 100644 --- a/crates/polars-arrow/src/array/dictionary/iterator.rs +++ b/crates/polars-arrow/src/array/dictionary/iterator.rs @@ -21,7 +21,7 @@ impl<'a, K: DictionaryKey> DictionaryValuesIter<'a, K> { } } -impl<'a, K: DictionaryKey> Iterator for DictionaryValuesIter<'a, K> { +impl Iterator for DictionaryValuesIter<'_, K> { type Item = Box; #[inline] @@ -40,9 +40,9 @@ impl<'a, K: DictionaryKey> Iterator for DictionaryValuesIter<'a, K> { } } -unsafe impl<'a, K: DictionaryKey> TrustedLen for DictionaryValuesIter<'a, K> {} +unsafe impl TrustedLen for DictionaryValuesIter<'_, K> {} -impl<'a, K: DictionaryKey> DoubleEndedIterator for DictionaryValuesIter<'a, K> { +impl DoubleEndedIterator for DictionaryValuesIter<'_, K> { #[inline] fn next_back(&mut self) -> Option { if self.index == self.end { diff --git a/crates/polars-arrow/src/array/dictionary/typed_iterator.rs b/crates/polars-arrow/src/array/dictionary/typed_iterator.rs index 5257bde2cae0..d7e7637bf28d 100644 --- a/crates/polars-arrow/src/array/dictionary/typed_iterator.rs +++ b/crates/polars-arrow/src/array/dictionary/typed_iterator.rs @@ -117,11 +117,9 @@ impl<'a, K: DictionaryKey, V: DictValue> Iterator for DictionaryValuesIterTyped< } } -unsafe impl<'a, K: DictionaryKey, V: DictValue> TrustedLen for DictionaryValuesIterTyped<'a, K, V> {} +unsafe impl TrustedLen for DictionaryValuesIterTyped<'_, K, V> {} -impl<'a, K: 
DictionaryKey, V: DictValue> DoubleEndedIterator - for DictionaryValuesIterTyped<'a, K, V> -{ +impl DoubleEndedIterator for DictionaryValuesIterTyped<'_, K, V> { #[inline] fn next_back(&mut self) -> Option { if self.index == self.end { @@ -181,9 +179,9 @@ impl<'a, K: DictionaryKey, V: DictValue> Iterator for DictionaryIterTyped<'a, K, } } -unsafe impl<'a, K: DictionaryKey, V: DictValue> TrustedLen for DictionaryIterTyped<'a, K, V> {} +unsafe impl TrustedLen for DictionaryIterTyped<'_, K, V> {} -impl<'a, K: DictionaryKey, V: DictValue> DoubleEndedIterator for DictionaryIterTyped<'a, K, V> { +impl DoubleEndedIterator for DictionaryIterTyped<'_, K, V> { #[inline] fn next_back(&mut self) -> Option { if self.index == self.end { diff --git a/crates/polars-arrow/src/array/growable/null.rs b/crates/polars-arrow/src/array/growable/null.rs index c0b92e132819..e663fc31b8b4 100644 --- a/crates/polars-arrow/src/array/growable/null.rs +++ b/crates/polars-arrow/src/array/growable/null.rs @@ -23,7 +23,7 @@ impl GrowableNull { } } -impl<'a> Growable<'a> for GrowableNull { +impl Growable<'_> for GrowableNull { unsafe fn extend(&mut self, _: usize, _: usize, len: usize) { self.length += len; } diff --git a/crates/polars-arrow/src/array/iterator.rs b/crates/polars-arrow/src/array/iterator.rs index 59e71968fde7..5009442d5718 100644 --- a/crates/polars-arrow/src/array/iterator.rs +++ b/crates/polars-arrow/src/array/iterator.rs @@ -118,7 +118,7 @@ impl<'a, A: ArrayAccessor<'a> + ?Sized> Iterator for NonNullValuesIter<'a, A> { unsafe impl<'a, A: ArrayAccessor<'a> + ?Sized> TrustedLen for NonNullValuesIter<'a, A> {} -impl<'a, A: ?Sized> Clone for NonNullValuesIter<'a, A> { +impl Clone for NonNullValuesIter<'_, A> { fn clone(&self) -> Self { Self { accessor: self.accessor, diff --git a/crates/polars-arrow/src/array/map/iterator.rs b/crates/polars-arrow/src/array/map/iterator.rs index 558405ddc8de..79fc630cc520 100644 --- a/crates/polars-arrow/src/array/map/iterator.rs +++ 
b/crates/polars-arrow/src/array/map/iterator.rs @@ -22,7 +22,7 @@ impl<'a> MapValuesIter<'a> { } } -impl<'a> Iterator for MapValuesIter<'a> { +impl Iterator for MapValuesIter<'_> { type Item = Box; #[inline] @@ -43,9 +43,9 @@ impl<'a> Iterator for MapValuesIter<'a> { } } -unsafe impl<'a> TrustedLen for MapValuesIter<'a> {} +unsafe impl TrustedLen for MapValuesIter<'_> {} -impl<'a> DoubleEndedIterator for MapValuesIter<'a> { +impl DoubleEndedIterator for MapValuesIter<'_> { #[inline] fn next_back(&mut self) -> Option { if self.index == self.end { diff --git a/crates/polars-arrow/src/array/mod.rs b/crates/polars-arrow/src/array/mod.rs index a8d0469d5f08..a2acd7164f6a 100644 --- a/crates/polars-arrow/src/array/mod.rs +++ b/crates/polars-arrow/src/array/mod.rs @@ -189,7 +189,7 @@ pub trait Array: Send + Sync + dyn_clone::DynClone + 'static { new } - /// Clones this [`Array`] with a new new assigned bitmap. + /// Clones this [`Array`] with a new assigned bitmap. /// # Panic /// This function panics iff `validity.len() != self.len()`. 
fn with_validity(&self, validity: Option) -> Box; diff --git a/crates/polars-arrow/src/array/static_array_collect.rs b/crates/polars-arrow/src/array/static_array_collect.rs index 296d93502abe..9ff5ceb39361 100644 --- a/crates/polars-arrow/src/array/static_array_collect.rs +++ b/crates/polars-arrow/src/array/static_array_collect.rs @@ -417,10 +417,10 @@ impl IntoBytes for T { } } impl TrivialIntoBytes for Vec {} -impl<'a> TrivialIntoBytes for Cow<'a, [u8]> {} -impl<'a> TrivialIntoBytes for &'a [u8] {} +impl TrivialIntoBytes for Cow<'_, [u8]> {} +impl TrivialIntoBytes for &[u8] {} impl TrivialIntoBytes for String {} -impl<'a> TrivialIntoBytes for &'a str {} +impl TrivialIntoBytes for &str {} impl<'a> IntoBytes for Cow<'a, str> { type AsRefT = Cow<'a, [u8]>; fn into_bytes(self) -> Cow<'a, [u8]> { @@ -590,8 +590,8 @@ unsafe fn into_utf8array(arr: BinaryArray) -> Utf8Array { trait StrIntoBytes: IntoBytes {} impl StrIntoBytes for String {} -impl<'a> StrIntoBytes for &'a str {} -impl<'a> StrIntoBytes for Cow<'a, str> {} +impl StrIntoBytes for &str {} +impl StrIntoBytes for Cow<'_, str> {} impl ArrayFromIter for Utf8ViewArray { #[inline] diff --git a/crates/polars-arrow/src/array/struct_/iterator.rs b/crates/polars-arrow/src/array/struct_/iterator.rs index 4e89af3a6a7f..38a49f274cde 100644 --- a/crates/polars-arrow/src/array/struct_/iterator.rs +++ b/crates/polars-arrow/src/array/struct_/iterator.rs @@ -20,7 +20,7 @@ impl<'a> StructValueIter<'a> { } } -impl<'a> Iterator for StructValueIter<'a> { +impl Iterator for StructValueIter<'_> { type Item = Vec>; #[inline] @@ -48,9 +48,9 @@ impl<'a> Iterator for StructValueIter<'a> { } } -unsafe impl<'a> TrustedLen for StructValueIter<'a> {} +unsafe impl TrustedLen for StructValueIter<'_> {} -impl<'a> DoubleEndedIterator for StructValueIter<'a> { +impl DoubleEndedIterator for StructValueIter<'_> { #[inline] fn next_back(&mut self) -> Option { if self.index == self.end { diff --git a/crates/polars-arrow/src/array/union/iterator.rs 
b/crates/polars-arrow/src/array/union/iterator.rs index bdcf5825af6c..e93223e46c43 100644 --- a/crates/polars-arrow/src/array/union/iterator.rs +++ b/crates/polars-arrow/src/array/union/iterator.rs @@ -15,7 +15,7 @@ impl<'a> UnionIter<'a> { } } -impl<'a> Iterator for UnionIter<'a> { +impl Iterator for UnionIter<'_> { type Item = Box; #[inline] @@ -54,6 +54,6 @@ impl<'a> UnionArray { } } -impl<'a> std::iter::ExactSizeIterator for UnionIter<'a> {} +impl std::iter::ExactSizeIterator for UnionIter<'_> {} -unsafe impl<'a> TrustedLen for UnionIter<'a> {} +unsafe impl TrustedLen for UnionIter<'_> {} diff --git a/crates/polars-arrow/src/bitmap/bitmap_ops.rs b/crates/polars-arrow/src/bitmap/bitmap_ops.rs index 032c76710f66..ec9814e4fad3 100644 --- a/crates/polars-arrow/src/bitmap/bitmap_ops.rs +++ b/crates/polars-arrow/src/bitmap/bitmap_ops.rs @@ -338,7 +338,7 @@ impl PartialEq for Bitmap { } } -impl<'a, 'b> BitOr<&'b Bitmap> for &'a Bitmap { +impl<'b> BitOr<&'b Bitmap> for &Bitmap { type Output = Bitmap; fn bitor(self, rhs: &'b Bitmap) -> Bitmap { @@ -346,7 +346,7 @@ impl<'a, 'b> BitOr<&'b Bitmap> for &'a Bitmap { } } -impl<'a, 'b> BitAnd<&'b Bitmap> for &'a Bitmap { +impl<'b> BitAnd<&'b Bitmap> for &Bitmap { type Output = Bitmap; fn bitand(self, rhs: &'b Bitmap) -> Bitmap { @@ -354,7 +354,7 @@ impl<'a, 'b> BitAnd<&'b Bitmap> for &'a Bitmap { } } -impl<'a, 'b> BitXor<&'b Bitmap> for &'a Bitmap { +impl<'b> BitXor<&'b Bitmap> for &Bitmap { type Output = Bitmap; fn bitxor(self, rhs: &'b Bitmap) -> Bitmap { diff --git a/crates/polars-arrow/src/bitmap/builder.rs b/crates/polars-arrow/src/bitmap/builder.rs new file mode 100644 index 000000000000..c507df97c5ba --- /dev/null +++ b/crates/polars-arrow/src/bitmap/builder.rs @@ -0,0 +1,100 @@ +use crate::bitmap::{Bitmap, MutableBitmap}; +use crate::storage::SharedStorage; + +/// Used to build bitmaps bool-by-bool in sequential order. 
+#[derive(Default, Clone)] +pub struct BitmapBuilder { + buf: u64, + len: usize, + cap: usize, + set_bits: usize, + bytes: Vec, +} + +impl BitmapBuilder { + pub fn new() -> Self { + Self::default() + } + + pub fn len(&self) -> usize { + self.len + } + + pub fn capacity(&self) -> usize { + self.cap + } + + pub fn with_capacity(bits: usize) -> Self { + let bytes = Vec::with_capacity(bits.div_ceil(64) * 8); + let words_available = bytes.capacity() / 8; + Self { + buf: 0, + len: 0, + cap: words_available * 64, + set_bits: 0, + bytes, + } + } + + #[inline(always)] + pub fn reserve(&mut self, additional: usize) { + if self.len + additional > self.cap { + self.reserve_slow(additional) + } + } + + #[cold] + #[inline(never)] + fn reserve_slow(&mut self, additional: usize) { + let bytes_needed = (self.len + additional).div_ceil(64) * 8; + self.bytes.reserve(bytes_needed - self.bytes.capacity()); + let words_available = self.bytes.capacity() / 8; + self.cap = words_available * 64; + } + + #[inline(always)] + pub fn push(&mut self, x: bool) { + self.reserve(1); + unsafe { self.push_unchecked(x) } + } + + /// # Safety + /// self.len() < self.capacity() must hold. + #[inline(always)] + pub unsafe fn push_unchecked(&mut self, x: bool) { + debug_assert!(self.len < self.cap); + self.buf |= (x as u64) << (self.len % 64); + self.len += 1; + if self.len % 64 == 0 { + let p = self.bytes.as_mut_ptr().add(self.bytes.len()).cast::(); + p.write_unaligned(self.buf.to_le()); + self.bytes.set_len(self.bytes.len() + 8); + self.set_bits += self.buf.count_ones() as usize; + self.buf = 0; + } + } + + /// # Safety + /// May only be called once at the end. 
+ unsafe fn finish(&mut self) { + if self.len % 64 != 0 { + self.bytes.extend_from_slice(&self.buf.to_le_bytes()); + self.set_bits += self.buf.count_ones() as usize; + } + } + + pub fn into_mut(mut self) -> MutableBitmap { + unsafe { + self.finish(); + MutableBitmap::from_vec(self.bytes, self.len) + } + } + + pub fn freeze(mut self) -> Bitmap { + unsafe { + self.finish(); + let storage = SharedStorage::from_vec(self.bytes); + Bitmap::from_inner_unchecked(storage, 0, self.len, Some(self.len - self.set_bits)) + } + } +} diff --git a/crates/polars-arrow/src/bitmap/iterator.rs b/crates/polars-arrow/src/bitmap/iterator.rs index bd48fb706c0a..84e0a2d7a985 100644 --- a/crates/polars-arrow/src/bitmap/iterator.rs +++ b/crates/polars-arrow/src/bitmap/iterator.rs @@ -58,7 +58,7 @@ impl<'a> TrueIdxIter<'a> { } } -impl<'a> Iterator for TrueIdxIter<'a> { +impl Iterator for TrueIdxIter<'_> { type Item = usize; #[inline] @@ -93,7 +93,7 @@ impl<'a> Iterator for TrueIdxIter<'a> { } } -unsafe impl<'a> TrustedLen for TrueIdxIter<'a> {} +unsafe impl TrustedLen for TrueIdxIter<'_> {} pub struct FastU32BitmapIter<'a> { bytes: &'a [u8], @@ -143,7 +143,7 @@ impl<'a> FastU32BitmapIter<'a> { } } -impl<'a> Iterator for FastU32BitmapIter<'a> { +impl Iterator for FastU32BitmapIter<'_> { type Item = u32; #[inline] @@ -171,7 +171,7 @@ impl<'a> Iterator for FastU32BitmapIter<'a> { } } -unsafe impl<'a> TrustedLen for FastU32BitmapIter<'a> {} +unsafe impl TrustedLen for FastU32BitmapIter<'_> {} pub struct FastU56BitmapIter<'a> { bytes: &'a [u8], @@ -222,7 +222,7 @@ impl<'a> FastU56BitmapIter<'a> { } } -impl<'a> Iterator for FastU56BitmapIter<'a> { +impl Iterator for FastU56BitmapIter<'_> { type Item = u64; #[inline] @@ -252,7 +252,7 @@ impl<'a> Iterator for FastU56BitmapIter<'a> { } } -unsafe impl<'a> TrustedLen for FastU56BitmapIter<'a> {} +unsafe impl TrustedLen for FastU56BitmapIter<'_> {} pub struct FastU64BitmapIter<'a> { bytes: &'a [u8], @@ -317,7 +317,7 @@ impl<'a> FastU64BitmapIter<'a> { } } 
-impl<'a> Iterator for FastU64BitmapIter<'a> { +impl Iterator for FastU64BitmapIter<'_> { type Item = u64; #[inline] @@ -349,7 +349,7 @@ impl<'a> Iterator for FastU64BitmapIter<'a> { } } -unsafe impl<'a> TrustedLen for FastU64BitmapIter<'a> {} +unsafe impl TrustedLen for FastU64BitmapIter<'_> {} /// This crates' equivalent of [`std::vec::IntoIter`] for [`Bitmap`]. #[derive(Debug, Clone)] diff --git a/crates/polars-arrow/src/bitmap/mod.rs b/crates/polars-arrow/src/bitmap/mod.rs index e7ed5fa363e8..6d518bf596b4 100644 --- a/crates/polars-arrow/src/bitmap/mod.rs +++ b/crates/polars-arrow/src/bitmap/mod.rs @@ -19,3 +19,6 @@ pub use assign_ops::*; pub mod utils; pub mod bitmask; + +mod builder; +pub use builder::*; diff --git a/crates/polars-arrow/src/bitmap/utils/iterator.rs b/crates/polars-arrow/src/bitmap/utils/iterator.rs index bba98ffbc704..243372599687 100644 --- a/crates/polars-arrow/src/bitmap/utils/iterator.rs +++ b/crates/polars-arrow/src/bitmap/utils/iterator.rs @@ -205,7 +205,7 @@ impl<'a> BitmapIter<'a> { } } -impl<'a> Iterator for BitmapIter<'a> { +impl Iterator for BitmapIter<'_> { type Item = bool; #[inline] @@ -238,7 +238,7 @@ impl<'a> Iterator for BitmapIter<'a> { } } -impl<'a> DoubleEndedIterator for BitmapIter<'a> { +impl DoubleEndedIterator for BitmapIter<'_> { #[inline] fn next_back(&mut self) -> Option { if self.rest_len > 0 { diff --git a/crates/polars-arrow/src/bitmap/utils/slice_iterator.rs b/crates/polars-arrow/src/bitmap/utils/slice_iterator.rs index f3083ad0b141..9f43a3dfe89a 100644 --- a/crates/polars-arrow/src/bitmap/utils/slice_iterator.rs +++ b/crates/polars-arrow/src/bitmap/utils/slice_iterator.rs @@ -74,7 +74,7 @@ impl<'a> SlicesIterator<'a> { } } -impl<'a> Iterator for SlicesIterator<'a> { +impl Iterator for SlicesIterator<'_> { type Item = (usize, usize); #[inline] diff --git a/crates/polars-arrow/src/compute/cast/binary_to.rs b/crates/polars-arrow/src/compute/cast/binary_to.rs index 6b8c1c59e470..5d2bd3e0b6d9 100644 --- 
a/crates/polars-arrow/src/compute/cast/binary_to.rs +++ b/crates/polars-arrow/src/compute/cast/binary_to.rs @@ -199,7 +199,7 @@ pub fn fixed_size_binary_to_binview(from: &FixedSizeBinaryArray) -> BinaryViewAr // This is NOT equal to MAX_BYTES_PER_BUFFER because of integer division let split_point = num_elements_per_buffer * size; - // This is zero-copy for the buffer since split just increases the the data since + // This is zero-copy for the buffer since split just increases the data since let mut buffer = from.values().clone(); let mut buffers = Vec::with_capacity(num_buffers); for _ in 0..num_buffers - 1 { diff --git a/crates/polars-arrow/src/compute/take/fixed_size_list.rs b/crates/polars-arrow/src/compute/take/fixed_size_list.rs index bf5bc64606d4..2a52a1ae3fd1 100644 --- a/crates/polars-arrow/src/compute/take/fixed_size_list.rs +++ b/crates/polars-arrow/src/compute/take/fixed_size_list.rs @@ -18,11 +18,14 @@ use std::mem::ManuallyDrop; use polars_utils::itertools::Itertools; +use polars_utils::IdxSize; use super::Index; use crate::array::growable::{Growable, GrowableFixedSizeList}; -use crate::array::{Array, ArrayRef, FixedSizeListArray, PrimitiveArray}; +use crate::array::{Array, ArrayRef, FixedSizeListArray, PrimitiveArray, StaticArray}; use crate::bitmap::MutableBitmap; +use crate::compute::take::bitmap::{take_bitmap_nulls_unchecked, take_bitmap_unchecked}; +use crate::compute::utils::combine_validities_and; use crate::datatypes::reshape::{Dimension, ReshapeDimension}; use crate::datatypes::{ArrowDataType, PhysicalType}; use crate::legacy::prelude::FromData; @@ -151,21 +154,23 @@ unsafe fn aligned_vec(dt: &ArrowDataType, n_bytes: usize) -> Vec { } } -fn no_inner_validities(values: &ArrayRef) -> bool { - if let Some(arr) = values.as_any().downcast_ref::() { - arr.validity().is_none() && no_inner_validities(arr.values()) - } else { - values.validity().is_none() - } +fn arr_no_validities_recursive(arr: &dyn Array) -> bool { + arr.validity().is_none() + && arr 
+ .as_any() + .downcast_ref::() + .map_or(true, |x| arr_no_validities_recursive(x.values().as_ref())) } /// `take` implementation for FixedSizeListArrays -pub(super) unsafe fn take_unchecked( +pub(super) unsafe fn take_unchecked( values: &FixedSizeListArray, - indices: &PrimitiveArray, + indices: &PrimitiveArray, ) -> ArrayRef { let (stride, leaf_type) = get_stride_and_leaf_type(values.dtype(), 1); - if leaf_type.to_physical_type().is_primitive() && no_inner_validities(values.values()) { + if leaf_type.to_physical_type().is_primitive() + && arr_no_validities_recursive(values.values().as_ref()) + { let leaves = get_leaves(values); let (leaves_buf, leave_size) = get_buffer_and_size(leaves); @@ -178,7 +183,7 @@ pub(super) unsafe fn take_unchecked( let dst = buf.spare_capacity_mut(); let mut count = 0; - let validity = if indices.null_count() == 0 { + let outer_validity = if indices.null_count() == 0 { for i in indices.values().iter() { let i = i.to_usize(); @@ -214,10 +219,24 @@ pub(super) unsafe fn take_unchecked( } Some(new_validity.freeze()) }; - assert_eq!(count * bytes_per_element, total_bytes); + assert_eq!(count * bytes_per_element, total_bytes); buf.set_len(total_bytes); + let outer_validity = combine_validities_and( + outer_validity.as_ref(), + values + .validity() + .map(|x| { + if indices.has_nulls() { + take_bitmap_nulls_unchecked(x, indices) + } else { + take_bitmap_unchecked(x, indices.as_slice().unwrap()) + } + }) + .as_ref(), + ); + let leaves = from_buffer(buf, leaves.dtype()); let mut shape = values.get_dims(); shape[0] = Dimension::new(indices.len() as _); @@ -228,8 +247,87 @@ pub(super) unsafe fn take_unchecked( FixedSizeListArray::from_shape(leaves.clone(), &shape) .unwrap() - .with_validity(validity) + .with_validity(outer_validity) } else { take_unchecked_slow(values, indices).boxed() } } + +#[cfg(test)] +mod tests { + use crate::array::StaticArray; + use crate::datatypes::ArrowDataType; + + /// Test gather for FixedSizeListArray with outer 
validity but no inner validities. + #[test] + fn test_arr_gather_nulls_outer_validity_19482() { + use polars_utils::IdxSize; + + use super::take_unchecked; + use crate::array::{FixedSizeListArray, Int64Array, PrimitiveArray}; + use crate::bitmap::Bitmap; + use crate::datatypes::reshape::{Dimension, ReshapeDimension}; + + unsafe { + let dyn_arr = FixedSizeListArray::from_shape( + Box::new(Int64Array::from_slice([1, 2, 3, 4])), + &[ + ReshapeDimension::Specified(Dimension::new(2)), + ReshapeDimension::Specified(Dimension::new(2)), + ], + ) + .unwrap() + .with_validity(Some(Bitmap::from_iter([true, false]))); // FixedSizeListArray[[1, 2], None] + + let arr = dyn_arr + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!( + [arr.validity().is_some(), arr.values().validity().is_some()], + [true, false] + ); + + assert_eq!( + take_unchecked(arr, &PrimitiveArray::::from_slice([0, 1])), + dyn_arr + ) + } + } + + #[test] + fn test_arr_gather_nulls_inner_validity() { + use polars_utils::IdxSize; + + use super::take_unchecked; + use crate::array::{FixedSizeListArray, Int64Array, PrimitiveArray}; + use crate::datatypes::reshape::{Dimension, ReshapeDimension}; + + unsafe { + let dyn_arr = FixedSizeListArray::from_shape( + Box::new(Int64Array::full_null(4, ArrowDataType::Int64)), + &[ + ReshapeDimension::Specified(Dimension::new(2)), + ReshapeDimension::Specified(Dimension::new(2)), + ], + ) + .unwrap(); // FixedSizeListArray[[None, None], [None, None]] + + let arr = dyn_arr + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!( + [arr.validity().is_some(), arr.values().validity().is_some()], + [false, true] + ); + + assert_eq!( + take_unchecked(arr, &PrimitiveArray::::from_slice([0, 1])), + dyn_arr + ) + } + } +} diff --git a/crates/polars-arrow/src/ffi/array.rs b/crates/polars-arrow/src/ffi/array.rs index 4adf1b1c613c..5e179be6cca7 100644 --- a/crates/polars-arrow/src/ffi/array.rs +++ b/crates/polars-arrow/src/ffi/array.rs @@ -620,7 +620,7 @@ pub struct 
ArrowArrayChild<'a> { parent: InternalArrowArray, } -impl<'a> ArrowArrayRef for ArrowArrayChild<'a> { +impl ArrowArrayRef for ArrowArrayChild<'_> { /// the dtype as declared in the schema fn dtype(&self) -> &ArrowDataType { &self.dtype diff --git a/crates/polars-arrow/src/io/ipc/read/common.rs b/crates/polars-arrow/src/io/ipc/read/common.rs index fbb9155149bd..6b893c0e8ce3 100644 --- a/crates/polars-arrow/src/io/ipc/read/common.rs +++ b/crates/polars-arrow/src/io/ipc/read/common.rs @@ -42,7 +42,7 @@ impl<'a, A, I: Iterator> ProjectionIter<'a, A, I> { } } -impl<'a, A, I: Iterator> Iterator for ProjectionIter<'a, A, I> { +impl> Iterator for ProjectionIter<'_, A, I> { type Item = ProjectionResult; fn next(&mut self) -> Option { diff --git a/crates/polars-arrow/src/io/ipc/read/flight.rs b/crates/polars-arrow/src/io/ipc/read/flight.rs index aecf816c2a9a..8c35fa1cfd60 100644 --- a/crates/polars-arrow/src/io/ipc/read/flight.rs +++ b/crates/polars-arrow/src/io/ipc/read/flight.rs @@ -176,7 +176,7 @@ pub struct FlightStreamProducer<'a, R: AsyncRead + AsyncSeek + Unpin + Send> { reader: &'a mut R, } -impl<'a, R: AsyncRead + AsyncSeek + Unpin + Send> Drop for FlightStreamProducer<'a, R> { +impl Drop for FlightStreamProducer<'_, R> { fn drop(&mut self) { if let Some(p) = self.footer { unsafe { @@ -186,7 +186,7 @@ impl<'a, R: AsyncRead + AsyncSeek + Unpin + Send> Drop for FlightStreamProducer< } } -unsafe impl<'a, R: AsyncRead + AsyncSeek + Unpin + Send> Send for FlightStreamProducer<'a, R> {} +unsafe impl Send for FlightStreamProducer<'_, R> {} impl<'a, R: AsyncRead + AsyncSeek + Unpin + Send> FlightStreamProducer<'a, R> { pub async fn new(reader: &'a mut R) -> PolarsResult>> { @@ -414,7 +414,6 @@ mod test { } #[tokio::test] - #[allow(clippy::needless_return)] async fn test_file_flight_simple() { let path = &get_file_path(); let mut file = tokio::fs::File::open(path).await.unwrap(); @@ -427,7 +426,6 @@ mod test { } #[tokio::test] - #[allow(clippy::needless_return)] async fn 
test_file_flight_amortized() { let path = &get_file_path(); let mut file = File::open(path).await.unwrap(); diff --git a/crates/polars-arrow/src/io/ipc/write/common.rs b/crates/polars-arrow/src/io/ipc/write/common.rs index a49c7fdcd790..6fa1b5f0d8c4 100644 --- a/crates/polars-arrow/src/io/ipc/write/common.rs +++ b/crates/polars-arrow/src/io/ipc/write/common.rs @@ -482,7 +482,7 @@ pub struct Record<'a> { fields: Option>, } -impl<'a> Record<'a> { +impl Record<'_> { /// Get the IPC fields for this record. pub fn fields(&self) -> Option<&[IpcField]> { self.fields.as_deref() diff --git a/crates/polars-arrow/src/legacy/kernels/mod.rs b/crates/polars-arrow/src/legacy/kernels/mod.rs index b8676430ec52..89b31684beed 100644 --- a/crates/polars-arrow/src/legacy/kernels/mod.rs +++ b/crates/polars-arrow/src/legacy/kernels/mod.rs @@ -137,7 +137,7 @@ impl<'a> MaskedSlicesIterator<'a> { } } -impl<'a> Iterator for MaskedSlicesIterator<'a> { +impl Iterator for MaskedSlicesIterator<'_> { type Item = (usize, usize); fn next(&mut self) -> Option { @@ -209,7 +209,7 @@ impl<'a> BinaryMaskedSliceIterator<'a> { } } -impl<'a> Iterator for BinaryMaskedSliceIterator<'a> { +impl Iterator for BinaryMaskedSliceIterator<'_> { type Item = (usize, usize, bool); fn next(&mut self) -> Option { diff --git a/crates/polars-arrow/src/legacy/kernels/rolling/nulls/min_max.rs b/crates/polars-arrow/src/legacy/kernels/rolling/nulls/min_max.rs index 52039fe77572..7f0c65d42bd7 100644 --- a/crates/polars-arrow/src/legacy/kernels/rolling/nulls/min_max.rs +++ b/crates/polars-arrow/src/legacy/kernels/rolling/nulls/min_max.rs @@ -25,7 +25,7 @@ pub struct SortedMinMax<'a, T: NativeType> { null_count: usize, } -impl<'a, T: NativeType> SortedMinMax<'a, T> { +impl SortedMinMax<'_, T> { fn count_nulls(&self, start: usize, end: usize) -> usize { let (bytes, offset, _) = self.validity.as_slice(); count_zeros(bytes, offset + start, end - start) diff --git a/crates/polars-arrow/src/legacy/kernels/rolling/nulls/sum.rs 
b/crates/polars-arrow/src/legacy/kernels/rolling/nulls/sum.rs index d1392ef7cb50..599bdb241b07 100644 --- a/crates/polars-arrow/src/legacy/kernels/rolling/nulls/sum.rs +++ b/crates/polars-arrow/src/legacy/kernels/rolling/nulls/sum.rs @@ -9,7 +9,7 @@ pub struct SumWindow<'a, T> { pub(super) null_count: usize, } -impl<'a, T: NativeType + IsFloat + Add + Sub> SumWindow<'a, T> { +impl + Sub> SumWindow<'_, T> { // compute sum from the entire window unsafe fn compute_sum_and_null_count(&mut self, start: usize, end: usize) -> Option { let mut sum = None; diff --git a/crates/polars-arrow/src/legacy/kernels/rolling/nulls/variance.rs b/crates/polars-arrow/src/legacy/kernels/rolling/nulls/variance.rs index ee97d4cb15a3..8252c8931c4f 100644 --- a/crates/polars-arrow/src/legacy/kernels/rolling/nulls/variance.rs +++ b/crates/polars-arrow/src/legacy/kernels/rolling/nulls/variance.rs @@ -9,8 +9,8 @@ pub(super) struct SumSquaredWindow<'a, T> { null_count: usize, } -impl<'a, T: NativeType + IsFloat + Add + Sub + Mul> - SumSquaredWindow<'a, T> +impl + Sub + Mul> + SumSquaredWindow<'_, T> { // compute sum from the entire window unsafe fn compute_sum_and_null_count(&mut self, start: usize, end: usize) -> Option { diff --git a/crates/polars-arrow/src/legacy/kernels/rolling/quantile_filter.rs b/crates/polars-arrow/src/legacy/kernels/rolling/quantile_filter.rs index 0b5fb4d97e86..c616310ee568 100644 --- a/crates/polars-arrow/src/legacy/kernels/rolling/quantile_filter.rs +++ b/crates/polars-arrow/src/legacy/kernels/rolling/quantile_filter.rs @@ -32,7 +32,7 @@ struct Block<'a, A> { nulls_in_window: usize, } -impl<'a, A> Debug for Block<'a, A> +impl Debug for Block<'_, A> where A: Indexable, A::Item: Debug + Copy, @@ -443,7 +443,7 @@ where } } -impl<'a, A> LenGet for BlockUnion<'a, A> +impl LenGet for BlockUnion<'_, A> where A: Indexable + Bounded + NullCount + Clone, ::Item: TotalOrd + Copy + Debug, diff --git a/crates/polars-arrow/src/storage.rs b/crates/polars-arrow/src/storage.rs index 
76116f917316..ddde815b5b10 100644 --- a/crates/polars-arrow/src/storage.rs +++ b/crates/polars-arrow/src/storage.rs @@ -147,7 +147,7 @@ pub struct SharedStorageAsVecMut<'a, T> { vec: ManuallyDrop>, } -impl<'a, T> Deref for SharedStorageAsVecMut<'a, T> { +impl Deref for SharedStorageAsVecMut<'_, T> { type Target = Vec; fn deref(&self) -> &Self::Target { @@ -155,13 +155,13 @@ impl<'a, T> Deref for SharedStorageAsVecMut<'a, T> { } } -impl<'a, T> DerefMut for SharedStorageAsVecMut<'a, T> { +impl DerefMut for SharedStorageAsVecMut<'_, T> { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.vec } } -impl<'a, T> Drop for SharedStorageAsVecMut<'a, T> { +impl Drop for SharedStorageAsVecMut<'_, T> { fn drop(&mut self) { unsafe { // Restore the SharedStorage. diff --git a/crates/polars-core/src/chunked_array/cast.rs b/crates/polars-core/src/chunked_array/cast.rs index 98ae4962b1ae..fc8f993400d1 100644 --- a/crates/polars-core/src/chunked_array/cast.rs +++ b/crates/polars-core/src/chunked_array/cast.rs @@ -206,10 +206,9 @@ where // - remain signed // - unsigned -> signed // this may still fail with overflow? - let dtype = self.dtype(); - let to_signed = dtype.is_signed_integer(); - let unsigned2unsigned = dtype.is_unsigned_integer() && dtype.is_unsigned_integer(); + let unsigned2unsigned = + self.dtype().is_unsigned_integer() && dtype.is_unsigned_integer(); let allowed = to_signed || unsigned2unsigned; if (allowed) diff --git a/crates/polars-core/src/chunked_array/from_iterator.rs b/crates/polars-core/src/chunked_array/from_iterator.rs index 5d784fb51fd9..ba9e8d1e6ccc 100644 --- a/crates/polars-core/src/chunked_array/from_iterator.rs +++ b/crates/polars-core/src/chunked_array/from_iterator.rs @@ -81,12 +81,12 @@ impl PolarsAsRef for &str {} // &["foo", "bar"] impl PolarsAsRef for &&str {} -impl<'a> PolarsAsRef for Cow<'a, str> {} +impl PolarsAsRef for Cow<'_, str> {} impl PolarsAsRef<[u8]> for Vec {} impl PolarsAsRef<[u8]> for &[u8] {} // TODO: remove! 
impl PolarsAsRef<[u8]> for &&[u8] {} -impl<'a> PolarsAsRef<[u8]> for Cow<'a, [u8]> {} +impl PolarsAsRef<[u8]> for Cow<'_, [u8]> {} impl FromIterator for StringChunked where diff --git a/crates/polars-core/src/chunked_array/iterator/mod.rs b/crates/polars-core/src/chunked_array/iterator/mod.rs index 728ffc5a8cff..a87d888968f3 100644 --- a/crates/polars-core/src/chunked_array/iterator/mod.rs +++ b/crates/polars-core/src/chunked_array/iterator/mod.rs @@ -25,7 +25,7 @@ pub trait PolarsIterator: ExactSizeIterator + DoubleEndedIterator + Send + Sync + TrustedLen { } -unsafe impl<'a, I> TrustedLen for Box + 'a> {} +unsafe impl TrustedLen for Box + '_> {} /// Implement [`PolarsIterator`] for every iterator that implements the needed traits. impl PolarsIterator for T where @@ -79,7 +79,7 @@ impl<'a> BoolIterNoNull<'a> { } } -impl<'a> Iterator for BoolIterNoNull<'a> { +impl Iterator for BoolIterNoNull<'_> { type Item = bool; fn next(&mut self) -> Option { @@ -100,7 +100,7 @@ impl<'a> Iterator for BoolIterNoNull<'a> { } } -impl<'a> DoubleEndedIterator for BoolIterNoNull<'a> { +impl DoubleEndedIterator for BoolIterNoNull<'_> { fn next_back(&mut self) -> Option { if self.current_end == self.current { None @@ -112,7 +112,7 @@ impl<'a> DoubleEndedIterator for BoolIterNoNull<'a> { } /// all arrays have known size. 
-impl<'a> ExactSizeIterator for BoolIterNoNull<'a> {} +impl ExactSizeIterator for BoolIterNoNull<'_> {} impl BooleanChunked { #[allow(clippy::wrong_self_convention)] @@ -339,7 +339,7 @@ impl<'a> FixedSizeListIterNoNull<'a> { } #[cfg(feature = "dtype-array")] -impl<'a> Iterator for FixedSizeListIterNoNull<'a> { +impl Iterator for FixedSizeListIterNoNull<'_> { type Item = Series; fn next(&mut self) -> Option { @@ -367,7 +367,7 @@ impl<'a> Iterator for FixedSizeListIterNoNull<'a> { } #[cfg(feature = "dtype-array")] -impl<'a> DoubleEndedIterator for FixedSizeListIterNoNull<'a> { +impl DoubleEndedIterator for FixedSizeListIterNoNull<'_> { fn next_back(&mut self) -> Option { if self.current_end == self.current { None @@ -388,7 +388,7 @@ impl<'a> DoubleEndedIterator for FixedSizeListIterNoNull<'a> { /// all arrays have known size. #[cfg(feature = "dtype-array")] -impl<'a> ExactSizeIterator for FixedSizeListIterNoNull<'a> {} +impl ExactSizeIterator for FixedSizeListIterNoNull<'_> {} #[cfg(feature = "dtype-array")] impl ArrayChunked { diff --git a/crates/polars-core/src/chunked_array/list/iterator.rs b/crates/polars-core/src/chunked_array/list/iterator.rs index b575dcdf5a65..2c48da805171 100644 --- a/crates/polars-core/src/chunked_array/list/iterator.rs +++ b/crates/polars-core/src/chunked_array/list/iterator.rs @@ -18,7 +18,7 @@ pub struct AmortizedListIter<'a, I: Iterator>> { inner_dtype: DataType, } -impl<'a, I: Iterator>> AmortizedListIter<'a, I> { +impl>> AmortizedListIter<'_, I> { pub(crate) unsafe fn new( len: usize, series_container: Series, @@ -37,7 +37,7 @@ impl<'a, I: Iterator>> AmortizedListIter<'a, I> { } } -impl<'a, I: Iterator>> Iterator for AmortizedListIter<'a, I> { +impl>> Iterator for AmortizedListIter<'_, I> { type Item = Option; fn next(&mut self) -> Option { @@ -106,8 +106,8 @@ impl<'a, I: Iterator>> Iterator for AmortizedListIter<'a // # Safety // we correctly implemented size_hint -unsafe impl<'a, I: Iterator>> TrustedLen for AmortizedListIter<'a, I> 
{} -impl<'a, I: Iterator>> ExactSizeIterator for AmortizedListIter<'a, I> {} +unsafe impl>> TrustedLen for AmortizedListIter<'_, I> {} +impl>> ExactSizeIterator for AmortizedListIter<'_, I> {} impl ListChunked { /// This is an iterator over a [`ListChunked`] that saves allocations. @@ -152,7 +152,7 @@ impl ListChunked { let (s, ptr) = unsafe { unstable_series_container_and_ptr(name, inner_values.clone(), &iter_dtype) }; - // SAFETY: ptr belongs the the Series.. + // SAFETY: ptr belongs the Series.. unsafe { AmortizedListIter::new( self.len(), diff --git a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs index d740455777c4..8ccd455e4bd0 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs @@ -443,7 +443,7 @@ pub struct CatIter<'a> { iter: Box> + 'a>, } -unsafe impl<'a> TrustedLen for CatIter<'a> {} +unsafe impl TrustedLen for CatIter<'_> {} impl<'a> Iterator for CatIter<'a> { type Item = Option<&'a str>; @@ -463,7 +463,7 @@ impl<'a> Iterator for CatIter<'a> { } } -impl<'a> ExactSizeIterator for CatIter<'a> {} +impl ExactSizeIterator for CatIter<'_> {} #[cfg(test)] mod test { diff --git a/crates/polars-core/src/chunked_array/object/iterator.rs b/crates/polars-core/src/chunked_array/object/iterator.rs index 7a5c6e00b590..7abb9c46f4ee 100644 --- a/crates/polars-core/src/chunked_array/object/iterator.rs +++ b/crates/polars-core/src/chunked_array/object/iterator.rs @@ -54,7 +54,7 @@ impl<'a, T: PolarsObject> std::iter::Iterator for ObjectIter<'a, T> { } } -impl<'a, T: PolarsObject> std::iter::DoubleEndedIterator for ObjectIter<'a, T> { +impl std::iter::DoubleEndedIterator for ObjectIter<'_, T> { fn next_back(&mut self) -> Option { if self.current_end == self.current { None @@ -75,7 +75,7 @@ impl<'a, T: PolarsObject> std::iter::DoubleEndedIterator for ObjectIter<'a, T> { } /// all arrays have known 
size. -impl<'a, T: PolarsObject> std::iter::ExactSizeIterator for ObjectIter<'a, T> {} +impl std::iter::ExactSizeIterator for ObjectIter<'_, T> {} impl<'a, T: PolarsObject> IntoIterator for &'a ObjectArray { type Item = Option<&'a T>; diff --git a/crates/polars-core/src/datatypes/any_value.rs b/crates/polars-core/src/datatypes/any_value.rs index edf76969e976..a1a87e3001bd 100644 --- a/crates/polars-core/src/datatypes/any_value.rs +++ b/crates/polars-core/src/datatypes/any_value.rs @@ -854,13 +854,13 @@ impl AnyValue<'_> { } } -impl<'a> Hash for AnyValue<'a> { +impl Hash for AnyValue<'_> { fn hash(&self, state: &mut H) { self.hash_impl(state, false) } } -impl<'a> Eq for AnyValue<'a> {} +impl Eq for AnyValue<'_> {} impl<'a, T> From> for AnyValue<'a> where diff --git a/crates/polars-core/src/frame/group_by/proxy.rs b/crates/polars-core/src/frame/group_by/proxy.rs index d1c04162b7b9..63b1a8022108 100644 --- a/crates/polars-core/src/frame/group_by/proxy.rs +++ b/crates/polars-core/src/frame/group_by/proxy.rs @@ -546,7 +546,7 @@ pub enum GroupsIndicator<'a> { Slice([IdxSize; 2]), } -impl<'a> GroupsIndicator<'a> { +impl GroupsIndicator<'_> { pub fn len(&self) -> usize { match self { GroupsIndicator::Idx(g) => g.1.len(), diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 8ce1525b2ed2..0d4230ff3f91 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -2851,7 +2851,8 @@ impl DataFrame { dtype.is_numeric() || matches!(dtype, DataType::Boolean) }) .cloned() - .collect(); + .collect::>(); + polars_ensure!(!columns.is_empty(), InvalidOperation: "'horizontal_mean' expected at least 1 numerical column"); let numeric_df = unsafe { DataFrame::_new_no_checks_impl(self.height(), columns) }; let sum = || numeric_df.sum_horizontal(null_strategy); @@ -3309,7 +3310,7 @@ pub struct RecordBatchIter<'a> { parallel: bool, } -impl<'a> Iterator for RecordBatchIter<'a> { +impl Iterator for RecordBatchIter<'_> { 
type Item = RecordBatch; fn next(&mut self) -> Option { diff --git a/crates/polars-core/src/frame/row/mod.rs b/crates/polars-core/src/frame/row/mod.rs index 2311194303db..ad8831ebda54 100644 --- a/crates/polars-core/src/frame/row/mod.rs +++ b/crates/polars-core/src/frame/row/mod.rs @@ -206,7 +206,7 @@ pub fn rows_to_schema_first_non_null( .iter_values() .enumerate() .filter_map(|(i, dtype)| { - // double check struct and list types types + // double check struct and list types // nested null values can be wrongly inferred by front ends match dtype { DataType::Null | DataType::List(_) => Some(i), diff --git a/crates/polars-core/src/hashing/vector_hasher.rs b/crates/polars-core/src/hashing/vector_hasher.rs index 7dfb07c64d58..e00e45f1ede8 100644 --- a/crates/polars-core/src/hashing/vector_hasher.rs +++ b/crates/polars-core/src/hashing/vector_hasher.rs @@ -1,4 +1,5 @@ use arrow::bitmap::utils::get_bit_unchecked; +use polars_utils::hashing::folded_multiply; use polars_utils::total_ord::{ToTotalOrd, TotalHash}; use rayon::prelude::*; use xxhash_rust::xxh3::xxh3_64_with_seed; @@ -30,11 +31,6 @@ pub trait VecHash { } } -pub(crate) const fn folded_multiply(s: u64, by: u64) -> u64 { - let result = (s as u128).wrapping_mul(by as u128); - ((result & 0xffff_ffff_ffff_ffff) as u64) ^ ((result >> 64) as u64) -} - pub(crate) fn get_null_hash_value(random_state: &PlRandomState) -> u64 { // we just start with a large prime number and hash that twice // to get a constant hash value for null/None diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 9508a72cc1d7..fadd2b4f570f 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -600,6 +600,7 @@ impl Series { /// * Time -> Int64 /// * Categorical -> UInt32 /// * List(inner) -> List(physical of inner) + /// * Array(inner) -> Array(physical of inner) /// * Struct -> Struct with physical repr of each struct column pub fn to_physical_repr(&self) -> Cow { 
use DataType::*; @@ -620,6 +621,11 @@ impl Series { Cow::Owned(ca.physical().clone().into_series()) }, List(inner) => Cow::Owned(self.cast(&List(Box::new(inner.to_physical()))).unwrap()), + #[cfg(feature = "dtype-array")] + Array(inner, size) => Cow::Owned( + self.cast(&Array(Box::new(inner.to_physical()), *size)) + .unwrap(), + ), #[cfg(feature = "dtype-struct")] Struct(_) => { let arr = self.struct_().unwrap(); @@ -997,7 +1003,7 @@ fn equal_outer_type(dtype: &DataType) -> bool { } } -impl<'a, T> AsRef> for dyn SeriesTrait + 'a +impl AsRef> for dyn SeriesTrait + '_ where T: 'static + PolarsDataType, { @@ -1014,7 +1020,7 @@ where } } -impl<'a, T> AsMut> for dyn SeriesTrait + 'a +impl AsMut> for dyn SeriesTrait + '_ where T: 'static + PolarsDataType, { diff --git a/crates/polars-core/src/series/series_trait.rs b/crates/polars-core/src/series/series_trait.rs index 14a0752eae1e..0352343baa82 100644 --- a/crates/polars-core/src/series/series_trait.rs +++ b/crates/polars-core/src/series/series_trait.rs @@ -613,7 +613,7 @@ pub trait SeriesTrait: } } -impl<'a> (dyn SeriesTrait + 'a) { +impl (dyn SeriesTrait + '_) { pub fn unpack(&self) -> PolarsResult<&ChunkedArray> where N: 'static + PolarsDataType, diff --git a/crates/polars-expr/Cargo.toml b/crates/polars-expr/Cargo.toml index 0911445617aa..29aa34652146 100644 --- a/crates/polars-expr/Cargo.toml +++ b/crates/polars-expr/Cargo.toml @@ -24,6 +24,7 @@ polars-plan = { workspace = true } polars-row = { workspace = true } polars-time = { workspace = true, optional = true } polars-utils = { workspace = true } +rand = { workspace = true } rayon = { workspace = true } [features] diff --git a/crates/polars-expr/src/expressions/group_iter.rs b/crates/polars-expr/src/expressions/group_iter.rs index 6b1d54d0ac13..b42851e49d2a 100644 --- a/crates/polars-expr/src/expressions/group_iter.rs +++ b/crates/polars-expr/src/expressions/group_iter.rs @@ -4,7 +4,7 @@ use polars_core::series::amortized_iter::AmortSeries; use super::*; 
-impl<'a> AggregationContext<'a> { +impl AggregationContext<'_> { pub(super) fn iter_groups( &mut self, keep_names: bool, diff --git a/crates/polars-expr/src/expressions/ternary.rs b/crates/polars-expr/src/expressions/ternary.rs index c776e4b951dd..37600c71f06a 100644 --- a/crates/polars-expr/src/expressions/ternary.rs +++ b/crates/polars-expr/src/expressions/ternary.rs @@ -230,7 +230,7 @@ impl PhysicalExpr for TernaryExpr { // * `zip_with` can be called directly with the series // * mix of unit literals and AggregatedList // * `zip_with` can be called with the flat values after the offsets - // have been been checked for alignment + // have been checked for alignment let ac_target = non_literal_acs.first().unwrap(); let agg_state_out = match ac_target.agg_state() { diff --git a/crates/polars-expr/src/expressions/window.rs b/crates/polars-expr/src/expressions/window.rs index d03467d01da9..f843c0e83d95 100644 --- a/crates/polars-expr/src/expressions/window.rs +++ b/crates/polars-expr/src/expressions/window.rs @@ -754,7 +754,7 @@ where unsafe { values.set_len(len) } ChunkedArray::new_vec(ca.name().clone(), values).into_series() } else { - // We don't use a mutable bitmap as bits will have have race conditions! + // We don't use a mutable bitmap as bits will have race conditions! // A single byte might alias if we write from single threads. let mut validity: Vec = vec![false; len]; let validity_ptr = validity.as_mut_ptr(); diff --git a/crates/polars-expr/src/groups/mod.rs b/crates/polars-expr/src/groups/mod.rs index 5eb32b34a052..43091244c661 100644 --- a/crates/polars-expr/src/groups/mod.rs +++ b/crates/polars-expr/src/groups/mod.rs @@ -23,21 +23,20 @@ pub trait Grouper: Any + Send { /// the ith group of other now has group index group_idxs[i] in self. fn combine(&mut self, other: &dyn Grouper, group_idxs: &mut Vec); - /// Partitions this Grouper into the given partitions. + /// Partitions this Grouper into the given number of partitions. 
/// - /// Updates partition_idxs and group_idxs such that the ith group of self - /// has group index group_idxs[i] in partition partition_idxs[i]. + /// Updates partition_idxs such that the ith group of self moves to partition + /// partition_idxs[i]. /// /// It is guaranteed that two equal keys in two independent partition_into /// calls map to the same partition index if the seed and the number of /// partitions is equal. - fn partition_into( + fn partition( &self, seed: u64, - partitions: &mut [Box], + num_partitions: usize, partition_idxs: &mut Vec, - group_idxs: &mut Vec, - ); + ) -> Vec>; /// Returns the keys in this Grouper in group order, that is the key for /// group i is returned in row i. diff --git a/crates/polars-expr/src/groups/row_encoded.rs b/crates/polars-expr/src/groups/row_encoded.rs index 46ec956106a5..1a2fd5209436 100644 --- a/crates/polars-expr/src/groups/row_encoded.rs +++ b/crates/polars-expr/src/groups/row_encoded.rs @@ -4,8 +4,10 @@ use hashbrown::hash_table::{Entry, HashTable}; use polars_core::chunked_array::ops::row_encode::_get_rows_encoded_unordered; use polars_row::EncodingField; use polars_utils::aliases::PlRandomState; +use polars_utils::hashing::{folded_multiply, hash_to_partition}; use polars_utils::itertools::Itertools; use polars_utils::vec::PushUnchecked; +use rand::Rng; use super::*; @@ -27,7 +29,13 @@ pub struct RowEncodedHashGrouper { key_schema: Arc, table: HashTable, key_data: Vec, + + // Used for computing canonical hashes. random_state: PlRandomState, + + // Internal random seed used to keep hash iteration order decorrelated. + // We simply store a random odd number and multiply the canonical hash by it. 
+ seed: u64, } impl RowEncodedHashGrouper { @@ -35,6 +43,7 @@ impl RowEncodedHashGrouper { Self { key_schema, random_state, + seed: rand::random::() | 1, ..Default::default() } } @@ -42,9 +51,9 @@ impl RowEncodedHashGrouper { fn insert_key(&mut self, hash: u64, key: &[u8]) -> IdxSize { let num_groups = self.table.len(); let entry = self.table.entry( - hash, + hash.wrapping_mul(self.seed), |g| unsafe { hash == g.key_hash && key == g.key(&self.key_data) }, - |g| g.key_hash, + |g| g.key_hash.wrapping_mul(self.seed), ); match entry { @@ -64,6 +73,23 @@ impl RowEncodedHashGrouper { } } + /// Insert a key, without checking that it is unique. + fn insert_key_unique(&mut self, hash: u64, key: &[u8]) -> IdxSize { + let group_idx = self.table.len().try_into().unwrap(); + let group = Group { + key_hash: hash, + key_offset: self.key_data.len(), + key_length: key.len().try_into().unwrap(), + group_idx, + }; + self.key_data.extend(key); + self.table + .insert_unique(hash.wrapping_mul(self.seed), group, |g| { + g.key_hash.wrapping_mul(self.seed) + }); + group_idx + } + fn finalize_keys(&self, mut key_rows: Vec<&[u8]>) -> DataFrame { let key_dtypes = self .key_schema @@ -125,7 +151,9 @@ impl Grouper for RowEncodedHashGrouper { fn combine(&mut self, other: &dyn Grouper, group_idxs: &mut Vec) { let other = other.as_any().downcast_ref::().unwrap(); - self.table.reserve(other.table.len(), |g| g.key_hash); // TODO: cardinality estimation. + // TODO: cardinality estimation. + self.table + .reserve(other.table.len(), |g| g.key_hash.wrapping_mul(self.seed)); unsafe { group_idxs.clear(); @@ -167,14 +195,54 @@ impl Grouper for RowEncodedHashGrouper { ) } - fn partition_into( + fn partition( &self, - _seed: u64, - _partitions: &mut [Box], - _partition_idxs: &mut Vec, - _group_idxs: &mut Vec, - ) { - unimplemented!() + seed: u64, + num_partitions: usize, + partition_idxs: &mut Vec, + ) -> Vec> { + assert!(num_partitions > 0); + + // Two-pass algorithm to prevent reallocations. 
+ let mut partition_size = vec![(0, 0); num_partitions]; // (keys, bytes) + unsafe { + for group in self.table.iter() { + let ph = folded_multiply(group.key_hash, seed | 1); + let p_idx = hash_to_partition(ph, num_partitions); + let (p_keys, p_bytes) = partition_size.get_unchecked_mut(p_idx as usize); + *p_keys += 1; + *p_bytes += group.key_length as usize; + } + } + + let mut rng = rand::thread_rng(); + let mut partitions = partition_size + .into_iter() + .map(|(keys, bytes)| Self { + key_schema: self.key_schema.clone(), + table: HashTable::with_capacity(keys), + key_data: Vec::with_capacity(bytes), + random_state: self.random_state.clone(), + seed: rng.gen::() | 1, + }) + .collect_vec(); + + unsafe { + partition_idxs.clear(); + partition_idxs.reserve(self.table.len()); + let partition_idxs_out = partition_idxs.spare_capacity_mut(); + for group in self.table.iter() { + let ph = folded_multiply(group.key_hash, seed | 1); + let p_idx = hash_to_partition(ph, num_partitions); + let p = partitions.get_unchecked_mut(p_idx); + p.insert_key_unique(group.key_hash, group.key(&self.key_data)); + *partition_idxs_out.get_unchecked_mut(group.group_idx as usize) = + MaybeUninit::new(p_idx as IdxSize); + } + partition_idxs.set_len(self.table.len()); + } + + partitions.into_iter().map(|p| Box::new(p) as _).collect() } fn as_any(&self) -> &dyn Any { diff --git a/crates/polars-expr/src/reduce/len.rs b/crates/polars-expr/src/reduce/len.rs index fa5aedb91f18..57641b1a02b6 100644 --- a/crates/polars-expr/src/reduce/len.rs +++ b/crates/polars-expr/src/reduce/len.rs @@ -1,6 +1,7 @@ use polars_core::error::constants::LENGTH_LIMIT_MSG; use super::*; +use crate::reduce::partition::partition_vec; #[derive(Default)] pub struct LenReduce { @@ -61,6 +62,17 @@ impl GroupedReduction for LenReduce { Ok(ca.into_series()) } + unsafe fn partition( + self: Box, + partition_sizes: &[IdxSize], + partition_idxs: &[IdxSize], + ) -> Vec> { + partition_vec(self.groups, partition_sizes, partition_idxs) + 
.into_iter() + .map(|groups| Box::new(Self { groups }) as _) + .collect() + } + fn as_any(&self) -> &dyn Any { self } diff --git a/crates/polars-expr/src/reduce/min_max.rs b/crates/polars-expr/src/reduce/min_max.rs index f4541d7a88a1..de25d3efc927 100644 --- a/crates/polars-expr/src/reduce/min_max.rs +++ b/crates/polars-expr/src/reduce/min_max.rs @@ -11,6 +11,7 @@ use polars_utils::float::IsFloat; use polars_utils::min_max::MinMax; use super::*; +use crate::reduce::partition::partition_mask; pub fn new_min_reduction(dtype: DataType, propagate_nans: bool) -> Box { use DataType::*; @@ -344,6 +345,25 @@ impl GroupedReduction for BoolMinGroupedReduction { Ok(()) } + unsafe fn partition( + self: Box, + partition_sizes: &[IdxSize], + partition_idxs: &[IdxSize], + ) -> Vec> { + let p_values = partition_mask(&self.values.freeze(), partition_sizes, partition_idxs); + let p_mask = partition_mask(&self.mask.freeze(), partition_sizes, partition_idxs); + p_values + .into_iter() + .zip(p_mask) + .map(|(values, mask)| { + Box::new(Self { + values: values.into_mut(), + mask: mask.into_mut(), + }) as _ + }) + .collect() + } + fn finalize(&mut self) -> PolarsResult { let v = core::mem::take(&mut self.values); let m = core::mem::take(&mut self.mask); @@ -450,6 +470,25 @@ impl GroupedReduction for BoolMaxGroupedReduction { }) } + unsafe fn partition( + self: Box, + partition_sizes: &[IdxSize], + partition_idxs: &[IdxSize], + ) -> Vec> { + let p_values = partition_mask(&self.values.freeze(), partition_sizes, partition_idxs); + let p_mask = partition_mask(&self.mask.freeze(), partition_sizes, partition_idxs); + p_values + .into_iter() + .zip(p_mask) + .map(|(values, mask)| { + Box::new(Self { + values: values.into_mut(), + mask: mask.into_mut(), + }) as _ + }) + .collect() + } + fn as_any(&self) -> &dyn Any { self } diff --git a/crates/polars-expr/src/reduce/mod.rs b/crates/polars-expr/src/reduce/mod.rs index 170ee6abff6c..bfe4cb56417b 100644 --- a/crates/polars-expr/src/reduce/mod.rs 
+++ b/crates/polars-expr/src/reduce/mod.rs @@ -2,6 +2,7 @@ mod convert; mod len; mod mean; mod min_max; +mod partition; mod sum; mod var_std; @@ -49,6 +50,22 @@ pub trait GroupedReduction: Any + Send { group_idxs: &[IdxSize], ) -> PolarsResult<()>; + /// Partitions this GroupedReduction into several partitions. + /// + /// The ith group of this GroupedReduction should becomes the group_idxs[i] + /// group in partition partition_idxs[i]. + /// + /// # Safety + /// partitions_idxs[i] < partition_sizes.len() for all i. + /// group_idxs[i] < partition_sizes[partition_idxs[i]] for all i. + /// Each partition p has an associated set of group_idxs, this set contains + /// 0..partition_size[p] exactly once. + unsafe fn partition( + self: Box, + partition_sizes: &[IdxSize], + partition_idxs: &[IdxSize], + ) -> Vec>; + /// Returns the finalized value per group as a Series. /// /// After this operation the number of groups is reset to 0. @@ -245,6 +262,23 @@ where Ok(()) } + unsafe fn partition( + self: Box, + partition_sizes: &[IdxSize], + partition_idxs: &[IdxSize], + ) -> Vec> { + partition::partition_vec(self.values, partition_sizes, partition_idxs) + .into_iter() + .map(|values| { + Box::new(Self { + values, + in_dtype: self.in_dtype.clone(), + reducer: self.reducer.clone(), + }) as _ + }) + .collect() + } + fn finalize(&mut self) -> PolarsResult { let v = core::mem::take(&mut self.values); self.reducer.finish(v, None, &self.in_dtype) @@ -353,6 +387,29 @@ where Ok(()) } + unsafe fn partition( + self: Box, + partition_sizes: &[IdxSize], + partition_idxs: &[IdxSize], + ) -> Vec> { + partition::partition_vec_mask( + self.values, + &self.mask.freeze(), + partition_sizes, + partition_idxs, + ) + .into_iter() + .map(|(values, mask)| { + Box::new(Self { + values, + mask: mask.into_mut(), + in_dtype: self.in_dtype.clone(), + reducer: self.reducer.clone(), + }) as _ + }) + .collect() + } + fn finalize(&mut self) -> PolarsResult { let v = core::mem::take(&mut self.values); let m = 
core::mem::take(&mut self.mask); diff --git a/crates/polars-expr/src/reduce/partition.rs b/crates/polars-expr/src/reduce/partition.rs new file mode 100644 index 000000000000..0152035879bd --- /dev/null +++ b/crates/polars-expr/src/reduce/partition.rs @@ -0,0 +1,105 @@ +use arrow::bitmap::{Bitmap, BitmapBuilder}; +use polars_utils::itertools::Itertools; +use polars_utils::vec::PushUnchecked; +use polars_utils::IdxSize; + +/// Partitions this Vec into multiple Vecs. +/// +/// # Safety +/// partitions_idxs[i] < partition_sizes.len() for all i. +/// idx_in_partition[i] < partition_sizes[partition_idxs[i]] for all i. +/// Each partition p has an associated set of idx_in_partition, this set +/// contains 0..partition_size[p] exactly once. +pub unsafe fn partition_vec( + v: Vec, + partition_sizes: &[IdxSize], + partition_idxs: &[IdxSize], +) -> Vec> { + assert!(partition_idxs.len() == v.len()); + + let mut partitions = partition_sizes + .iter() + .map(|sz| Vec::::with_capacity(*sz as usize)) + .collect_vec(); + + unsafe { + // Scatter into each partition. + for (i, val) in v.into_iter().enumerate() { + let p_idx = *partition_idxs.get_unchecked(i) as usize; + debug_assert!(p_idx < partitions.len()); + let p = partitions.get_unchecked_mut(p_idx); + p.push_unchecked(val); + } + + for (p, sz) in partitions.iter_mut().zip(partition_sizes) { + p.set_len(*sz as usize); + } + } + + partitions +} + +/// # Safety +/// Same as partition_vec. +pub unsafe fn partition_mask( + m: &Bitmap, + partition_sizes: &[IdxSize], + partition_idxs: &[IdxSize], +) -> Vec { + assert!(partition_idxs.len() == m.len()); + + let mut partitions = partition_sizes + .iter() + .map(|sz| BitmapBuilder::with_capacity(*sz as usize)) + .collect_vec(); + + unsafe { + // Scatter into each partition. 
+ for i in 0..m.len() { + let p_idx = *partition_idxs.get_unchecked(i) as usize; + let p = partitions.get_unchecked_mut(p_idx); + p.push_unchecked(m.get_bit_unchecked(i)); + } + } + + partitions +} + +/// A fused loop of partition_vec and partition_mask. +/// # Safety +/// Same as partition_vec. +pub unsafe fn partition_vec_mask( + v: Vec, + m: &Bitmap, + partition_sizes: &[IdxSize], + partition_idxs: &[IdxSize], +) -> Vec<(Vec, BitmapBuilder)> { + assert!(partition_idxs.len() == v.len()); + assert!(m.len() == v.len()); + + let mut partitions = partition_sizes + .iter() + .map(|sz| { + ( + Vec::::with_capacity(*sz as usize), + BitmapBuilder::with_capacity(*sz as usize), + ) + }) + .collect_vec(); + + unsafe { + // Scatter into each partition. + for (i, val) in v.into_iter().enumerate() { + let p_idx = *partition_idxs.get_unchecked(i) as usize; + let (pv, pm) = partitions.get_unchecked_mut(p_idx); + pv.push_unchecked(val); + pm.push_unchecked(m.get_bit_unchecked(i)); + } + + for (p, sz) in partitions.iter_mut().zip(partition_sizes) { + p.0.set_len(*sz as usize); + } + } + + partitions +} diff --git a/crates/polars-expr/src/reduce/sum.rs b/crates/polars-expr/src/reduce/sum.rs index 111d69eec4f2..466d5ffb9f9d 100644 --- a/crates/polars-expr/src/reduce/sum.rs +++ b/crates/polars-expr/src/reduce/sum.rs @@ -126,6 +126,22 @@ where Ok(()) } + unsafe fn partition( + self: Box, + partition_sizes: &[IdxSize], + partition_idxs: &[IdxSize], + ) -> Vec> { + partition::partition_vec(self.sums, partition_sizes, partition_idxs) + .into_iter() + .map(|sums| { + Box::new(Self { + sums, + in_dtype: self.in_dtype.clone(), + }) as _ + }) + .collect() + } + fn finalize(&mut self) -> PolarsResult { let v = core::mem::take(&mut self.sums); let arr = Box::new(PrimitiveArray::::from_vec(v)); diff --git a/crates/polars-ffi/src/version_0.rs b/crates/polars-ffi/src/version_0.rs index 3cffd4425045..504f6cc126d1 100644 --- a/crates/polars-ffi/src/version_0.rs +++ 
b/crates/polars-ffi/src/version_0.rs @@ -132,7 +132,7 @@ impl CallerContext { self.bitflags |= 1 << k } - /// Parallelism is done by polars' main engine, the plugin should not run run its own parallelism. + /// Parallelism is done by polars' main engine, the plugin should not run its own parallelism. /// If this is `false`, the plugin could use parallelism without (much) contention with polars /// parallelism strategies. pub fn parallel(&self) -> bool { diff --git a/crates/polars-io/src/cloud/credential_provider.rs b/crates/polars-io/src/cloud/credential_provider.rs index 989e0318120c..e6de837488c1 100644 --- a/crates/polars-io/src/cloud/credential_provider.rs +++ b/crates/polars-io/src/cloud/credential_provider.rs @@ -343,9 +343,6 @@ impl serde::Serialize for PlCredentialProvider { { use serde::ser::Error; - // TODO: - // * Add magic bytes here to indicate a python function - // * Check the Python version on deserialize #[cfg(feature = "python")] if let PlCredentialProvider::Python(v) = self { return v.serialize(serializer); diff --git a/crates/polars-io/src/csv/read/read_impl.rs b/crates/polars-io/src/csv/read/read_impl.rs index 520f6bee729d..52f29ee0a128 100644 --- a/crates/polars-io/src/csv/read/read_impl.rs +++ b/crates/polars-io/src/csv/read/read_impl.rs @@ -126,7 +126,7 @@ pub(crate) struct CoreReader<'a> { truncate_ragged_lines: bool, } -impl<'a> fmt::Debug for CoreReader<'a> { +impl fmt::Debug for CoreReader<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Reader") .field("schema", &self.schema) diff --git a/crates/polars-io/src/csv/read/read_impl/batched.rs b/crates/polars-io/src/csv/read/read_impl/batched.rs index 3bf6e2dd4e32..90e0b4e4e37c 100644 --- a/crates/polars-io/src/csv/read/read_impl/batched.rs +++ b/crates/polars-io/src/csv/read/read_impl/batched.rs @@ -66,7 +66,7 @@ struct ChunkOffsetIter<'a> { eol_char: u8, } -impl<'a> Iterator for ChunkOffsetIter<'a> { +impl Iterator for ChunkOffsetIter<'_> { type Item = 
(usize, usize); fn next(&mut self) -> Option { @@ -209,7 +209,7 @@ pub struct BatchedCsvReader<'a> { decimal_comma: bool, } -impl<'a> BatchedCsvReader<'a> { +impl BatchedCsvReader<'_> { pub fn next_batches(&mut self, n: usize) -> PolarsResult>> { if n == 0 || self.remaining == 0 { return Ok(None); diff --git a/crates/polars-io/src/csv/write/write_impl/serializer.rs b/crates/polars-io/src/csv/write/write_impl/serializer.rs index db973524c69c..6a4f964d88b3 100644 --- a/crates/polars-io/src/csv/write/write_impl/serializer.rs +++ b/crates/polars-io/src/csv/write/write_impl/serializer.rs @@ -689,7 +689,7 @@ pub(super) fn serializer_for<'a>( quote_if_always!(decimal_serializer, scale.unwrap_or(0)) }, _ => { - polars_bail!(ComputeError: "datatype {dtype} cannot be written to CSV\n\nConsider using JSON or or a binary format.") + polars_bail!(ComputeError: "datatype {dtype} cannot be written to CSV\n\nConsider using JSON or a binary format.") }, }; Ok(serializer) diff --git a/crates/polars-io/src/ipc/ipc_file.rs b/crates/polars-io/src/ipc/ipc_file.rs index ab6805d18967..100d37b2c941 100644 --- a/crates/polars-io/src/ipc/ipc_file.rs +++ b/crates/polars-io/src/ipc/ipc_file.rs @@ -1,6 +1,6 @@ //! # (De)serializing Arrows IPC format. //! -//! Arrow IPC is a [binary format format](https://arrow.apache.org/docs/python/ipc.html). +//! Arrow IPC is a [binary format](https://arrow.apache.org/docs/python/ipc.html). //! It is the recommended way to serialize and deserialize Polars DataFrames as this is most true //! to the data schema. //! diff --git a/crates/polars-io/src/ipc/ipc_stream.rs b/crates/polars-io/src/ipc/ipc_stream.rs index 6b16579ac93d..6393c639cf35 100644 --- a/crates/polars-io/src/ipc/ipc_stream.rs +++ b/crates/polars-io/src/ipc/ipc_stream.rs @@ -1,6 +1,6 @@ //! # (De)serializing Arrows Streaming IPC format. //! -//! Arrow Streaming IPC is a [binary format format](https://arrow.apache.org/docs/python/ipc.html). +//! 
Arrow Streaming IPC is a [binary format](https://arrow.apache.org/docs/python/ipc.html). //! It used for sending an arbitrary length sequence of record batches. //! The format must be processed from start to end, and does not support random access. //! It is different than IPC, if you can't deserialize a file with `IpcReader::new`, it's probably an IPC Stream File. diff --git a/crates/polars-io/src/json/mod.rs b/crates/polars-io/src/json/mod.rs index 432efca38360..df2ff8bb9c28 100644 --- a/crates/polars-io/src/json/mod.rs +++ b/crates/polars-io/src/json/mod.rs @@ -236,7 +236,7 @@ pub fn remove_bom(bytes: &[u8]) -> PolarsResult<&[u8]> { Ok(bytes) } } -impl<'a, R> SerReader for JsonReader<'a, R> +impl SerReader for JsonReader<'_, R> where R: MmapBytesReader, { diff --git a/crates/polars-io/src/ndjson/buffer.rs b/crates/polars-io/src/ndjson/buffer.rs index 2bb2a028f1ca..1c9938979af5 100644 --- a/crates/polars-io/src/ndjson/buffer.rs +++ b/crates/polars-io/src/ndjson/buffer.rs @@ -12,9 +12,9 @@ use simd_json::{BorrowedValue as Value, KnownKey, StaticNode}; #[derive(Debug, Clone, PartialEq)] pub(crate) struct BufferKey<'a>(pub(crate) KnownKey<'a>); -impl<'a> Eq for BufferKey<'a> {} +impl Eq for BufferKey<'_> {} -impl<'a> Hash for BufferKey<'a> { +impl Hash for BufferKey<'_> { fn hash(&self, state: &mut H) { self.0.key().hash(state) } diff --git a/crates/polars-io/src/ndjson/core.rs b/crates/polars-io/src/ndjson/core.rs index a72b4ccf7038..2fabd7dcd589 100644 --- a/crates/polars-io/src/ndjson/core.rs +++ b/crates/polars-io/src/ndjson/core.rs @@ -133,7 +133,7 @@ where } } -impl<'a> JsonLineReader<'a, File> { +impl JsonLineReader<'_, File> { /// This is the recommended way to create a json reader as this allows for fastest parsing. 
pub fn from_path>(path: P) -> PolarsResult { let path = crate::resolve_homedir(&path.into()); @@ -141,7 +141,7 @@ impl<'a> JsonLineReader<'a, File> { Ok(Self::new(f).with_path(Some(path))) } } -impl<'a, R> SerReader for JsonLineReader<'a, R> +impl SerReader for JsonLineReader<'_, R> where R: MmapBytesReader, { diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs index aa86bfccce51..de22b639bf8b 100644 --- a/crates/polars-io/src/parquet/read/read_impl.rs +++ b/crates/polars-io/src/parquet/read/read_impl.rs @@ -43,7 +43,7 @@ fn assert_dtypes(dtype: &ArrowDataType) { // These should all be casted to the BinaryView / Utf8View variants D::Utf8 | D::Binary | D::LargeUtf8 | D::LargeBinary => unreachable!(), - // These should be casted to to Float32 + // These should be casted to Float32 D::Float16 => unreachable!(), // This should have been converted to a LargeList diff --git a/crates/polars-io/src/path_utils/mod.rs b/crates/polars-io/src/path_utils/mod.rs index 1795cda6ebd0..71c59fecb31d 100644 --- a/crates/polars-io/src/path_utils/mod.rs +++ b/crates/polars-io/src/path_utils/mod.rs @@ -99,7 +99,7 @@ struct HiveIdxTracker<'a> { check_directory_level: bool, } -impl<'a> HiveIdxTracker<'a> { +impl HiveIdxTracker<'_> { fn update(&mut self, i: usize, path_idx: usize) -> PolarsResult<()> { let check_directory_level = self.check_directory_level; let paths = self.paths; diff --git a/crates/polars-json/src/json/write/mod.rs b/crates/polars-json/src/json/write/mod.rs index a23b245b68b2..6796ef7436bb 100644 --- a/crates/polars-json/src/json/write/mod.rs +++ b/crates/polars-json/src/json/write/mod.rs @@ -101,7 +101,7 @@ impl<'a> RecordSerializer<'a> { } } -impl<'a> FallibleStreamingIterator for RecordSerializer<'a> { +impl FallibleStreamingIterator for RecordSerializer<'_> { type Item = [u8]; type Error = PolarsError; diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index 25986e512381..78f8274fb079 
100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -71,7 +71,7 @@ temporal = [ ] # debugging purposes fmt = ["polars-core/fmt", "polars-plan/fmt"] -strings = ["polars-plan/strings"] +strings = ["polars-plan/strings", "polars-stream?/strings"] future = [] dtype-full = [ @@ -163,7 +163,7 @@ bitwise = [ "polars-plan/bitwise", "polars-expr/bitwise", "polars-core/bitwise", - "polars-stream/bitwise", + "polars-stream?/bitwise", "polars-ops/bitwise", ] approx_unique = ["polars-plan/approx_unique"] @@ -203,7 +203,7 @@ dynamic_group_by = [ "temporal", "polars-expr/dynamic_group_by", "polars-mem-engine/dynamic_group_by", - "polars-stream/dynamic_group_by", + "polars-stream?/dynamic_group_by", ] ewma = ["polars-plan/ewma"] ewma_by = ["polars-plan/ewma_by"] @@ -258,7 +258,7 @@ replace = ["polars-plan/replace"] binary_encoding = ["polars-plan/binary_encoding"] string_encoding = ["polars-plan/string_encoding"] -bigidx = ["polars-plan/bigidx"] +bigidx = ["polars-plan/bigidx", "polars-utils/bigidx"] polars_cloud = ["polars-plan/polars_cloud"] panic_on_schema = ["polars-plan/panic_on_schema", "polars-expr/panic_on_schema"] diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index 503d952e017f..cf90c5232450 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -1349,7 +1349,7 @@ impl LazyFrame { right_on: E, args: JoinArgs, ) -> LazyFrame { - // if any of the nodes reads from files we must activate this this plan as well. + // if any of the nodes reads from files we must activate this plan as well. 
if other.opt_state.contains(OptFlags::FILE_CACHING) { self.opt_state |= OptFlags::FILE_CACHING; } diff --git a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs index 57bb99c0fad9..ad41378b3086 100644 --- a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs +++ b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs @@ -144,7 +144,7 @@ fn estimate_unique_count(keys: &[Column], mut sample_size: usize) -> PolarsResul if keys.len() == 1 { // we sample as that will work also with sorted data. - // not that sampling without replacement is very very expensive. don't do that. + // not that sampling without replacement is *very* expensive. don't do that. let s = keys[0].sample_n(sample_size, true, false, None).unwrap(); // fast multi-threaded way to get unique. let groups = s.as_materialized_series().group_tuples(true, false)?; diff --git a/crates/polars-ops/src/chunked_array/list/to_struct.rs b/crates/polars-ops/src/chunked_array/list/to_struct.rs index 6676de3983db..fad1bcebb9a1 100644 --- a/crates/polars-ops/src/chunked_array/list/to_struct.rs +++ b/crates/polars-ops/src/chunked_array/list/to_struct.rs @@ -5,82 +5,220 @@ use polars_utils::pl_str::PlSmallStr; use super::*; -#[derive(Copy, Clone, Debug)] +#[derive(Clone, Eq, PartialEq, Hash, Debug)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum ListToStructArgs { + FixedWidth(Arc<[PlSmallStr]>), + InferWidth { + infer_field_strategy: ListToStructWidthStrategy, + get_index_name: Option, + /// If this is 0, it means unbounded. 
+ max_fields: usize, + }, +} + +#[derive(Clone, Eq, PartialEq, Hash, Debug)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum ListToStructWidthStrategy { FirstNonNull, MaxWidth, } -fn det_n_fields(ca: &ListChunked, n_fields: ListToStructWidthStrategy) -> usize { - match n_fields { - ListToStructWidthStrategy::MaxWidth => { - let mut max = 0; - - ca.downcast_iter().for_each(|arr| { - let offsets = arr.offsets().as_slice(); - let mut last = offsets[0]; - for o in &offsets[1..] { - let len = (*o - last) as usize; - max = std::cmp::max(max, len); - last = *o; +impl ListToStructArgs { + pub fn get_output_dtype(&self, input_dtype: &DataType) -> PolarsResult { + let DataType::List(inner_dtype) = input_dtype else { + polars_bail!( + InvalidOperation: + "attempted list to_struct on non-list dtype: {}", + input_dtype + ); + }; + let inner_dtype = inner_dtype.as_ref(); + + match self { + Self::FixedWidth(names) => Ok(DataType::Struct( + names + .iter() + .map(|x| Field::new(x.clone(), inner_dtype.clone())) + .collect::>(), + )), + Self::InferWidth { + get_index_name, + max_fields, + .. + } if *max_fields > 0 => { + let get_index_name_func = get_index_name.as_ref().map_or( + &_default_struct_name_gen as &dyn Fn(usize) -> PlSmallStr, + |x| x.0.as_ref(), + ); + Ok(DataType::Struct( + (0..*max_fields) + .map(|i| Field::new(get_index_name_func(i), inner_dtype.clone())) + .collect::>(), + )) + }, + Self::InferWidth { .. } => Ok(DataType::Unknown(UnknownKind::Any)), + } + } + + fn det_n_fields(&self, ca: &ListChunked) -> usize { + match self { + Self::FixedWidth(v) => v.len(), + Self::InferWidth { + infer_field_strategy, + max_fields, + .. + } => { + let inferred = match infer_field_strategy { + ListToStructWidthStrategy::MaxWidth => { + let mut max = 0; + + ca.downcast_iter().for_each(|arr| { + let offsets = arr.offsets().as_slice(); + let mut last = offsets[0]; + for o in &offsets[1..] 
{ + let len = (*o - last) as usize; + max = std::cmp::max(max, len); + last = *o; + } + }); + max + }, + ListToStructWidthStrategy::FirstNonNull => { + let mut len = 0; + for arr in ca.downcast_iter() { + let offsets = arr.offsets().as_slice(); + let mut last = offsets[0]; + for o in &offsets[1..] { + len = (*o - last) as usize; + if len > 0 { + break; + } + last = *o; + } + if len > 0 { + break; + } + } + len + }, + }; + + if *max_fields > 0 { + inferred.min(*max_fields) + } else { + inferred } - }); - max - }, - ListToStructWidthStrategy::FirstNonNull => { - let mut len = 0; - for arr in ca.downcast_iter() { - let offsets = arr.offsets().as_slice(); - let mut last = offsets[0]; - for o in &offsets[1..] { - len = (*o - last) as usize; - if len > 0 { - break; - } - last = *o; + }, + } + } + + fn set_output_names(&self, columns: &mut [Series]) { + match self { + Self::FixedWidth(v) => { + assert_eq!(columns.len(), v.len()); + + for (c, name) in columns.iter_mut().zip(v.iter()) { + c.rename(name.clone()); } - if len > 0 { - break; + }, + Self::InferWidth { get_index_name, .. 
} => { + let get_index_name_func = get_index_name.as_ref().map_or( + &_default_struct_name_gen as &dyn Fn(usize) -> PlSmallStr, + |x| x.0.as_ref(), + ); + + for (i, c) in columns.iter_mut().enumerate() { + c.rename(get_index_name_func(i)); } - } - len - }, + }, + } + } +} + +#[derive(Clone)] +pub struct NameGenerator(pub Arc PlSmallStr + Send + Sync>); + +impl NameGenerator { + pub fn from_func(func: impl Fn(usize) -> PlSmallStr + Send + Sync + 'static) -> Self { + Self(Arc::new(func)) + } +} + +impl std::fmt::Debug for NameGenerator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "list::to_struct::NameGenerator function at 0x{:016x}", + self.0.as_ref() as *const _ as *const () as usize + ) + } +} + +impl Eq for NameGenerator {} + +impl PartialEq for NameGenerator { + fn eq(&self, other: &Self) -> bool { + Arc::ptr_eq(&self.0, &other.0) } } -pub type NameGenerator = Arc PlSmallStr + Send + Sync>; +impl std::hash::Hash for NameGenerator { + fn hash(&self, state: &mut H) { + state.write_usize(Arc::as_ptr(&self.0) as *const () as usize) + } +} pub fn _default_struct_name_gen(idx: usize) -> PlSmallStr { format_pl_smallstr!("field_{idx}") } pub trait ToStruct: AsList { - fn to_struct( - &self, - n_fields: ListToStructWidthStrategy, - name_generator: Option, - ) -> PolarsResult { + fn to_struct(&self, args: &ListToStructArgs) -> PolarsResult { let ca = self.as_list(); - let n_fields = det_n_fields(ca, n_fields); + let n_fields = args.det_n_fields(ca); - let name_generator = name_generator - .as_deref() - .unwrap_or(&_default_struct_name_gen); - - let fields = POOL.install(|| { + let mut fields = POOL.install(|| { (0..n_fields) .into_par_iter() - .map(|i| { - ca.lst_get(i as i64, true).map(|mut s| { - s.rename(name_generator(i)); - s - }) - }) + .map(|i| ca.lst_get(i as i64, true)) .collect::>>() })?; + args.set_output_names(&mut fields); + StructChunked::from_series(ca.name().clone(), ca.len(), fields.iter()) } } impl ToStruct 
for ListChunked {} + +#[cfg(feature = "serde")] +mod _serde_impl { + use super::*; + + impl serde::Serialize for NameGenerator { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + use serde::ser::Error; + Err(S::Error::custom( + "cannot serialize name generator function for to_struct, \ + consider passing a list of field names instead.", + )) + } + } + + impl<'de> serde::Deserialize<'de> for NameGenerator { + fn deserialize(_deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + use serde::de::Error; + Err(D::Error::custom( + "invalid data: attempted to deserialize list::to_struct::NameGenerator", + )) + } + } +} diff --git a/crates/polars-ops/src/frame/join/general.rs b/crates/polars-ops/src/frame/join/general.rs index 1420d7b66062..0bf0a86cd972 100644 --- a/crates/polars-ops/src/frame/join/general.rs +++ b/crates/polars-ops/src/frame/join/general.rs @@ -56,7 +56,7 @@ pub fn _coalesce_full_join( df_left: &DataFrame, ) -> DataFrame { // No need to allocate the schema because we already - // know for certain that the column name for left left is `name` + // know for certain that the column name for left is `name` // and for right is `name + suffix` let schema_left = if keys_left == keys_right { Schema::default() diff --git a/crates/polars-ops/src/frame/pivot/positioning.rs b/crates/polars-ops/src/frame/pivot/positioning.rs index f91d537fb9d4..b7058de0a05e 100644 --- a/crates/polars-ops/src/frame/pivot/positioning.rs +++ b/crates/polars-ops/src/frame/pivot/positioning.rs @@ -240,13 +240,13 @@ pub(super) fn compute_col_idx( let col_locations = match column_agg_physical.dtype() { T::Int32 | T::UInt32 => { let Some(BitRepr::Small(ca)) = column_agg_physical.bit_repr() else { - polars_bail!(ComputeError: "Expected 32-bit bit representation to be available. 
This should never happen"); + polars_bail!(ComputeError: "Expected 32-bit representation to be available; this should never happen"); }; compute_col_idx_numeric(&ca) }, T::Int64 | T::UInt64 => { let Some(BitRepr::Large(ca)) = column_agg_physical.bit_repr() else { - polars_bail!(ComputeError: "Expected 64-bit bit representation to be available. This should never happen"); + polars_bail!(ComputeError: "Expected 64-bit representation to be available; this should never happen"); }; compute_col_idx_numeric(&ca) }, @@ -413,13 +413,13 @@ pub(super) fn compute_row_idx( match index_agg_physical.dtype() { T::Int32 | T::UInt32 => { let Some(BitRepr::Small(ca)) = index_agg_physical.bit_repr() else { - polars_bail!(ComputeError: "Expected 32-bit bit representation to be available. This should never happen"); + polars_bail!(ComputeError: "Expected 32-bit representation to be available; this should never happen"); }; compute_row_index(index, &ca, count, index_s.dtype()) }, T::Int64 | T::UInt64 => { let Some(BitRepr::Large(ca)) = index_agg_physical.bit_repr() else { - polars_bail!(ComputeError: "Expected 64-bit bit representation to be available. 
This should never happen"); + polars_bail!(ComputeError: "Expected 64-bit representation to be available; this should never happen"); }; compute_row_index(index, &ca, count, index_s.dtype()) }, diff --git a/crates/polars-parquet/Cargo.toml b/crates/polars-parquet/Cargo.toml index 544e52388e58..8ae9108a1fa7 100644 --- a/crates/polars-parquet/Cargo.toml +++ b/crates/polars-parquet/Cargo.toml @@ -24,7 +24,7 @@ hashbrown = { workspace = true } num-traits = { workspace = true } polars-compute = { workspace = true, features = ["approx_unique"] } polars-error = { workspace = true } -polars-parquet-format = { git = "https://github.com/pola-rs/parquet-format" } +polars-parquet-format = "0.1" polars-utils = { workspace = true, features = ["mmap"] } simdutf8 = { workspace = true } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binview.rs b/crates/polars-parquet/src/arrow/read/deserialize/binview.rs index 6777f7e639c9..86e46756788a 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binview.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binview.rs @@ -362,7 +362,7 @@ impl DeltaGatherer for StatGatherer { } } -impl<'a, 'b> BatchableCollector<(), MutableBinaryViewArray<[u8]>> for &mut DeltaCollector<'a, 'b> { +impl BatchableCollector<(), MutableBinaryViewArray<[u8]>> for &mut DeltaCollector<'_, '_> { fn reserve(target: &mut MutableBinaryViewArray<[u8]>, n: usize) { target.reserve(n); } @@ -394,7 +394,7 @@ impl<'a, 'b> BatchableCollector<(), MutableBinaryViewArray<[u8]>> for &mut Delta } } -impl<'a, 'b> DeltaCollector<'a, 'b> { +impl DeltaCollector<'_, '_> { pub fn flush(&mut self, target: &mut MutableBinaryViewArray<[u8]>) { if !self.pushed_lengths.is_empty() { let start_bytes_len = target.total_bytes_len(); @@ -428,7 +428,7 @@ impl<'a, 'b> DeltaCollector<'a, 'b> { } } -impl<'a, 'b> BatchableCollector<(), MutableBinaryViewArray<[u8]>> for DeltaBytesCollector<'a, 'b> { +impl BatchableCollector<(), MutableBinaryViewArray<[u8]>> for 
DeltaBytesCollector<'_, '_> { fn reserve(target: &mut MutableBinaryViewArray<[u8]>, n: usize) { target.reserve(n); } @@ -621,7 +621,7 @@ impl utils::Decoder for BinViewDecoder { max_length: &'b mut usize, } - impl<'a, 'b> BatchableCollector<(), MutableBinaryViewArray<[u8]>> for Collector<'a, 'b> { + impl BatchableCollector<(), MutableBinaryViewArray<[u8]>> for Collector<'_, '_> { fn reserve(target: &mut MutableBinaryViewArray<[u8]>, n: usize) { target.reserve(n); } @@ -709,7 +709,7 @@ impl utils::Decoder for BinViewDecoder { ) -> ParquetResult<()> { struct DictionaryTranslator<'a>(&'a [View]); - impl<'a> HybridRleGatherer for DictionaryTranslator<'a> { + impl HybridRleGatherer for DictionaryTranslator<'_> { type Target = MutableBinaryViewArray<[u8]>; fn target_reserve(&self, target: &mut Self::Target, n: usize) { @@ -803,7 +803,7 @@ impl utils::Decoder for BinViewDecoder { translator: DictionaryTranslator<'b>, } - impl<'a, 'b> BatchableCollector<(), MutableBinaryViewArray<[u8]>> for Collector<'a, 'b> { + impl BatchableCollector<(), MutableBinaryViewArray<[u8]>> for Collector<'_, '_> { fn reserve(target: &mut MutableBinaryViewArray<[u8]>, n: usize) { target.reserve(n); } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/boolean.rs b/crates/polars-parquet/src/arrow/read/deserialize/boolean.rs index af2e504d2646..51026f483bd7 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/boolean.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/boolean.rs @@ -161,7 +161,7 @@ impl HybridRleGatherer for BitmapGatherer { // @TODO: The slice impl here can speed some stuff up } struct BitmapCollector<'a, 'b>(&'b mut HybridRleDecoder<'a>); -impl<'a, 'b> BatchableCollector for BitmapCollector<'a, 'b> { +impl BatchableCollector for BitmapCollector<'_, '_> { fn reserve(target: &mut MutableBitmap, n: usize) { target.reserve(n); } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/dictionary.rs 
b/crates/polars-parquet/src/arrow/read/deserialize/dictionary.rs index de2bfe2e47f3..478c7cca0f2e 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/dictionary.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/dictionary.rs @@ -184,7 +184,7 @@ pub(crate) struct DictArrayTranslator { dict_size: usize, } -impl<'a, 'b, K: DictionaryKey> BatchableCollector<(), Vec> for DictArrayCollector<'a, 'b> { +impl BatchableCollector<(), Vec> for DictArrayCollector<'_, '_> { fn reserve(target: &mut Vec, n: usize) { target.reserve(n); } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary.rs b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary.rs index 3825d528c8f5..5657a20dd151 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary.rs @@ -154,7 +154,7 @@ impl Decoder for BinaryDecoder { size: usize, } - impl<'a, 'b> BatchableCollector<(), Vec> for FixedSizeBinaryCollector<'a, 'b> { + impl BatchableCollector<(), Vec> for FixedSizeBinaryCollector<'_, '_> { fn reserve(target: &mut Vec, n: usize) { target.reserve(n); } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs b/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs index a622153dfca8..d37a6d4bf3b1 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs @@ -215,8 +215,8 @@ pub struct BatchedNestedDecoder<'a, 'b, 'c, D: utils::NestedDecoder> { decoder: &'c mut D, } -impl<'a, 'b, 'c, D: utils::NestedDecoder> BatchableCollector<(), D::DecodedState> - for BatchedNestedDecoder<'a, 'b, 'c, D> +impl BatchableCollector<(), D::DecodedState> + for BatchedNestedDecoder<'_, '_, '_, D> { fn reserve(_target: &mut D::DecodedState, _n: usize) { unreachable!() diff --git a/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs 
b/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs index 54ed87c6b42b..5539595fda48 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs @@ -125,8 +125,8 @@ where pub(crate) _pd: std::marker::PhantomData, } -impl<'a, 'b, P, T, D: DecoderFunction> BatchableCollector<(), Vec> - for PlainDecoderFnCollector<'a, 'b, P, T, D> +impl> BatchableCollector<(), Vec> + for PlainDecoderFnCollector<'_, '_, P, T, D> where T: NativeType, P: ParquetNativeType, @@ -239,7 +239,7 @@ where } } -impl<'a, 'b, P, T, D> BatchableCollector<(), Vec> for DeltaCollector<'a, 'b, P, T, D> +impl BatchableCollector<(), Vec> for DeltaCollector<'_, '_, P, T, D> where T: NativeType, P: ParquetNativeType, diff --git a/crates/polars-parquet/src/arrow/read/deserialize/utils/array_chunks.rs b/crates/polars-parquet/src/arrow/read/deserialize/utils/array_chunks.rs index 0882cbed5cb0..f66544ee3183 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/utils/array_chunks.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/utils/array_chunks.rs @@ -47,4 +47,4 @@ impl<'a, P: ParquetNativeType> Iterator for ArrayChunks<'a, P> { } } -impl<'a, P: ParquetNativeType> ExactSizeIterator for ArrayChunks<'a, P> {} +impl ExactSizeIterator for ArrayChunks<'_, P> {} diff --git a/crates/polars-parquet/src/arrow/read/deserialize/utils/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/utils/mod.rs index dba00fc97930..7c6cf840bdce 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/utils/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/utils/mod.rs @@ -433,7 +433,7 @@ where } } -impl<'a, 'b, 'c, O, T> BatchableCollector> for TranslatedHybridRle<'a, 'b, 'c, O, T> +impl BatchableCollector> for TranslatedHybridRle<'_, '_, '_, O, T> where O: Clone + Default, T: Translator, @@ -487,7 +487,7 @@ where } } -impl<'a, 'b, 'c, O, G> BatchableCollector> for GatheredHybridRle<'a, 'b, 
'c, O, G> +impl BatchableCollector> for GatheredHybridRle<'_, '_, '_, O, G> where O: Clone, G: HybridRleGatherer>, @@ -516,8 +516,8 @@ where } } -impl<'a, 'b, 'c, T> BatchableCollector> - for TranslatedHybridRle<'a, 'b, 'c, View, T> +impl BatchableCollector> + for TranslatedHybridRle<'_, '_, '_, View, T> where T: Translator, { diff --git a/crates/polars-parquet/src/arrow/read/schema/mod.rs b/crates/polars-parquet/src/arrow/read/schema/mod.rs index 347cd49faefd..ea27aa03d46d 100644 --- a/crates/polars-parquet/src/arrow/read/schema/mod.rs +++ b/crates/polars-parquet/src/arrow/read/schema/mod.rs @@ -40,7 +40,7 @@ impl Default for SchemaInferenceOptions { /// /// # Error /// This function errors iff the key `"ARROW:schema"` exists but is not correctly encoded, -/// indicating that that the file's arrow metadata was incorrectly written. +/// indicating that the file's arrow metadata was incorrectly written. pub fn infer_schema(file_metadata: &FileMetadata) -> PolarsResult { infer_schema_with_options(file_metadata, &None) } diff --git a/crates/polars-parquet/src/arrow/write/dictionary.rs b/crates/polars-parquet/src/arrow/write/dictionary.rs index 17527fc488f7..fc97c268c0fd 100644 --- a/crates/polars-parquet/src/arrow/write/dictionary.rs +++ b/crates/polars-parquet/src/arrow/write/dictionary.rs @@ -13,7 +13,7 @@ use polars_error::{polars_bail, PolarsResult}; use super::binary::{ build_statistics as binary_build_statistics, encode_plain as binary_encode_plain, }; -use super::fixed_len_bytes::{ +use super::fixed_size_binary::{ build_statistics as fixed_binary_build_statistics, encode_plain as fixed_binary_encode_plain, }; use super::pages::PrimitiveNested; diff --git a/crates/polars-parquet/src/arrow/write/fixed_size_binary/basic.rs b/crates/polars-parquet/src/arrow/write/fixed_size_binary/basic.rs new file mode 100644 index 000000000000..27151ce51f70 --- /dev/null +++ b/crates/polars-parquet/src/arrow/write/fixed_size_binary/basic.rs @@ -0,0 +1,47 @@ +use 
arrow::array::{Array, FixedSizeBinaryArray}; +use polars_error::PolarsResult; + +use super::encode_plain; +use crate::parquet::page::DataPage; +use crate::parquet::schema::types::PrimitiveType; +use crate::parquet::statistics::FixedLenStatistics; +use crate::read::schema::is_nullable; +use crate::write::{utils, EncodeNullability, Encoding, WriteOptions}; + +pub fn array_to_page( + array: &FixedSizeBinaryArray, + options: WriteOptions, + type_: PrimitiveType, + statistics: Option, +) -> PolarsResult { + let is_optional = is_nullable(&type_.field_info); + let encode_options = EncodeNullability::new(is_optional); + + let validity = array.validity(); + + let mut buffer = vec![]; + utils::write_def_levels( + &mut buffer, + is_optional, + validity, + array.len(), + options.version, + )?; + + let definition_levels_byte_length = buffer.len(); + + encode_plain(array, encode_options, &mut buffer); + + utils::build_plain_page( + buffer, + array.len(), + array.len(), + array.null_count(), + 0, + definition_levels_byte_length, + statistics.map(|x| x.serialize()), + type_, + options, + Encoding::Plain, + ) +} diff --git a/crates/polars-parquet/src/arrow/write/fixed_len_bytes.rs b/crates/polars-parquet/src/arrow/write/fixed_size_binary/mod.rs similarity index 79% rename from crates/polars-parquet/src/arrow/write/fixed_len_bytes.rs rename to crates/polars-parquet/src/arrow/write/fixed_size_binary/mod.rs index 9277b9c78a98..58f11adfa491 100644 --- a/crates/polars-parquet/src/arrow/write/fixed_len_bytes.rs +++ b/crates/polars-parquet/src/arrow/write/fixed_size_binary/mod.rs @@ -1,12 +1,13 @@ +mod basic; +mod nested; + use arrow::array::{Array, FixedSizeBinaryArray, PrimitiveArray}; use arrow::types::i256; -use polars_error::PolarsResult; +pub use basic::array_to_page; +pub use nested::array_to_page as nested_array_to_page; use super::binary::ord_binary; -use super::{utils, EncodeNullability, StatisticsOptions, WriteOptions}; -use crate::arrow::read::schema::is_nullable; -use 
crate::parquet::encoding::Encoding; -use crate::parquet::page::DataPage; +use super::{EncodeNullability, StatisticsOptions}; use crate::parquet::schema::types::PrimitiveType; use crate::parquet::statistics::FixedLenStatistics; @@ -27,44 +28,6 @@ pub(crate) fn encode_plain( } } -pub fn array_to_page( - array: &FixedSizeBinaryArray, - options: WriteOptions, - type_: PrimitiveType, - statistics: Option, -) -> PolarsResult { - let is_optional = is_nullable(&type_.field_info); - let encode_options = EncodeNullability::new(is_optional); - - let validity = array.validity(); - - let mut buffer = vec![]; - utils::write_def_levels( - &mut buffer, - is_optional, - validity, - array.len(), - options.version, - )?; - - let definition_levels_byte_length = buffer.len(); - - encode_plain(array, encode_options, &mut buffer); - - utils::build_plain_page( - buffer, - array.len(), - array.len(), - array.null_count(), - 0, - definition_levels_byte_length, - statistics.map(|x| x.serialize()), - type_, - options, - Encoding::Plain, - ) -} - pub(super) fn build_statistics( array: &FixedSizeBinaryArray, primitive_type: PrimitiveType, diff --git a/crates/polars-parquet/src/arrow/write/fixed_size_binary/nested.rs b/crates/polars-parquet/src/arrow/write/fixed_size_binary/nested.rs new file mode 100644 index 000000000000..81175cf5db18 --- /dev/null +++ b/crates/polars-parquet/src/arrow/write/fixed_size_binary/nested.rs @@ -0,0 +1,39 @@ +use arrow::array::{Array, FixedSizeBinaryArray}; +use polars_error::PolarsResult; + +use super::encode_plain; +use crate::parquet::page::DataPage; +use crate::parquet::schema::types::PrimitiveType; +use crate::parquet::statistics::FixedLenStatistics; +use crate::read::schema::is_nullable; +use crate::write::{nested, utils, EncodeNullability, Encoding, Nested, WriteOptions}; + +pub fn array_to_page( + array: &FixedSizeBinaryArray, + options: WriteOptions, + type_: PrimitiveType, + nested: &[Nested], + statistics: Option, +) -> PolarsResult { + let is_optional = 
is_nullable(&type_.field_info); + let encode_options = EncodeNullability::new(is_optional); + + let mut buffer = vec![]; + let (repetition_levels_byte_length, definition_levels_byte_length) = + nested::write_rep_and_def(options.version, nested, &mut buffer)?; + + encode_plain(array, encode_options, &mut buffer); + + utils::build_plain_page( + buffer, + nested::num_values(nested), + nested[0].len(), + array.null_count(), + repetition_levels_byte_length, + definition_levels_byte_length, + statistics.map(|x| x.serialize()), + type_, + options, + Encoding::Plain, + ) +} diff --git a/crates/polars-parquet/src/arrow/write/mod.rs b/crates/polars-parquet/src/arrow/write/mod.rs index 02f0165d04c7..17a342ac9d67 100644 --- a/crates/polars-parquet/src/arrow/write/mod.rs +++ b/crates/polars-parquet/src/arrow/write/mod.rs @@ -17,7 +17,7 @@ mod binview; mod boolean; mod dictionary; mod file; -mod fixed_len_bytes; +mod fixed_size_binary; mod nested; mod pages; mod primitive; @@ -528,7 +528,7 @@ pub fn array_to_page_simple( array.validity().cloned(), ); let statistics = if options.has_statistics() { - Some(fixed_len_bytes::build_statistics( + Some(fixed_size_binary::build_statistics( &array, type_.clone(), &options.statistics, @@ -536,7 +536,7 @@ pub fn array_to_page_simple( } else { None }; - fixed_len_bytes::array_to_page(&array, options, type_, statistics) + fixed_size_binary::array_to_page(&array, options, type_, statistics) }, ArrowDataType::Interval(IntervalUnit::DayTime) => { let array = array @@ -555,7 +555,7 @@ pub fn array_to_page_simple( array.validity().cloned(), ); let statistics = if options.has_statistics() { - Some(fixed_len_bytes::build_statistics( + Some(fixed_size_binary::build_statistics( &array, type_.clone(), &options.statistics, @@ -563,12 +563,12 @@ pub fn array_to_page_simple( } else { None }; - fixed_len_bytes::array_to_page(&array, options, type_, statistics) + fixed_size_binary::array_to_page(&array, options, type_, statistics) }, 
ArrowDataType::FixedSizeBinary(_) => { let array = array.as_any().downcast_ref().unwrap(); let statistics = if options.has_statistics() { - Some(fixed_len_bytes::build_statistics( + Some(fixed_size_binary::build_statistics( array, type_.clone(), &options.statistics, @@ -577,7 +577,7 @@ pub fn array_to_page_simple( None }; - fixed_len_bytes::array_to_page(array, options, type_, statistics) + fixed_size_binary::array_to_page(array, options, type_, statistics) }, ArrowDataType::Decimal256(precision, _) => { let precision = *precision; @@ -620,7 +620,7 @@ pub fn array_to_page_simple( } else if precision <= 38 { let size = decimal_length_from_precision(precision); let statistics = if options.has_statistics() { - let stats = fixed_len_bytes::build_statistics_decimal256_with_i128( + let stats = fixed_size_binary::build_statistics_decimal256_with_i128( array, type_.clone(), size, @@ -641,7 +641,7 @@ pub fn array_to_page_simple( values.into(), array.validity().cloned(), ); - fixed_len_bytes::array_to_page(&array, options, type_, statistics) + fixed_size_binary::array_to_page(&array, options, type_, statistics) } else { let size = 32; let array = array @@ -649,7 +649,7 @@ pub fn array_to_page_simple( .downcast_ref::>() .unwrap(); let statistics = if options.has_statistics() { - let stats = fixed_len_bytes::build_statistics_decimal256( + let stats = fixed_size_binary::build_statistics_decimal256( array, type_.clone(), size, @@ -670,7 +670,7 @@ pub fn array_to_page_simple( array.validity().cloned(), ); - fixed_len_bytes::array_to_page(&array, options, type_, statistics) + fixed_size_binary::array_to_page(&array, options, type_, statistics) } }, ArrowDataType::Decimal(precision, _) => { @@ -715,7 +715,7 @@ pub fn array_to_page_simple( let size = decimal_length_from_precision(precision); let statistics = if options.has_statistics() { - let stats = fixed_len_bytes::build_statistics_decimal( + let stats = fixed_size_binary::build_statistics_decimal( array, type_.clone(), size, @@ 
-736,7 +736,7 @@ pub fn array_to_page_simple( values.into(), array.validity().cloned(), ); - fixed_len_bytes::array_to_page(&array, options, type_, statistics) + fixed_size_binary::array_to_page(&array, options, type_, statistics) } }, other => polars_bail!(nyi = "Writing parquet pages for data type {other:?}"), @@ -858,7 +858,7 @@ fn array_to_page_nested( let size = decimal_length_from_precision(precision); let statistics = if options.has_statistics() { - let stats = fixed_len_bytes::build_statistics_decimal( + let stats = fixed_size_binary::build_statistics_decimal( array, type_.clone(), size, @@ -879,7 +879,7 @@ fn array_to_page_nested( values.into(), array.validity().cloned(), ); - fixed_len_bytes::array_to_page(&array, options, type_, statistics) + fixed_size_binary::nested_array_to_page(&array, options, type_, nested, statistics) } }, Decimal256(precision, _) => { @@ -919,7 +919,7 @@ fn array_to_page_nested( } else if precision <= 38 { let size = decimal_length_from_precision(precision); let statistics = if options.has_statistics() { - let stats = fixed_len_bytes::build_statistics_decimal256_with_i128( + let stats = fixed_size_binary::build_statistics_decimal256_with_i128( array, type_.clone(), size, @@ -940,7 +940,7 @@ fn array_to_page_nested( values.into(), array.validity().cloned(), ); - fixed_len_bytes::array_to_page(&array, options, type_, statistics) + fixed_size_binary::nested_array_to_page(&array, options, type_, nested, statistics) } else { let size = 32; let array = array @@ -948,7 +948,7 @@ fn array_to_page_nested( .downcast_ref::>() .unwrap(); let statistics = if options.has_statistics() { - let stats = fixed_len_bytes::build_statistics_decimal256( + let stats = fixed_size_binary::build_statistics_decimal256( array, type_.clone(), size, @@ -969,7 +969,7 @@ fn array_to_page_nested( array.validity().cloned(), ); - fixed_len_bytes::array_to_page(&array, options, type_, statistics) + fixed_size_binary::nested_array_to_page(&array, options, type_, 
nested, statistics) } }, other => polars_bail!(nyi = "Writing nested parquet pages for data type {other:?}"), diff --git a/crates/polars-parquet/src/arrow/write/nested/dremel/mod.rs b/crates/polars-parquet/src/arrow/write/nested/dremel/mod.rs index 546efd034a9c..961393bf4ed2 100644 --- a/crates/polars-parquet/src/arrow/write/nested/dremel/mod.rs +++ b/crates/polars-parquet/src/arrow/write/nested/dremel/mod.rs @@ -79,7 +79,7 @@ pub fn num_values(nested: &[Nested]) -> usize { BufferedDremelIter::new(nested).count() } -impl<'a> Level<'a> { +impl Level<'_> { /// Fetch the number of elements given on the next level at `offset` on this level fn next_level_length(&self, offset: usize, is_valid: bool) -> usize { match self.lengths { @@ -407,7 +407,7 @@ impl<'a> BufferedDremelIter<'a> { } } -impl<'a> Iterator for BufferedDremelIter<'a> { +impl Iterator for BufferedDremelIter<'_> { type Item = DremelValue; fn next(&mut self) -> Option { diff --git a/crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs b/crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs index b453c7b6e0f6..b5ea9b815dc1 100644 --- a/crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs +++ b/crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs @@ -13,7 +13,7 @@ pub struct Decoder<'a, T: Unpackable> { _pd: std::marker::PhantomData, } -impl<'a, T: Unpackable> Default for Decoder<'a, T> { +impl Default for Decoder<'_, T> { fn default() -> Self { Self { packed: [].chunks(1), @@ -114,7 +114,7 @@ pub struct ChunkedDecoder<'a, 'b, T: Unpackable> { pub(crate) decoder: &'b mut Decoder<'a, T>, } -impl<'a, 'b, T: Unpackable> Iterator for ChunkedDecoder<'a, 'b, T> { +impl Iterator for ChunkedDecoder<'_, '_, T> { type Item = T::Unpacked; #[inline] @@ -136,9 +136,9 @@ impl<'a, 'b, T: Unpackable> Iterator for ChunkedDecoder<'a, 'b, T> { } } -impl<'a, 'b, T: Unpackable> ExactSizeIterator for ChunkedDecoder<'a, 'b, T> {} +impl ExactSizeIterator for ChunkedDecoder<'_, '_, T> {} 
-impl<'a, 'b, T: Unpackable> ChunkedDecoder<'a, 'b, T> { +impl ChunkedDecoder<'_, '_, T> { /// Get and consume the remainder chunk if it exists pub fn remainder(&mut self) -> Option<(T::Unpacked, usize)> { let remainder_len = self.decoder.len() % T::Unpacked::LENGTH; @@ -262,7 +262,7 @@ mod tests { use super::super::tests::case1; use super::*; - impl<'a, T: Unpackable> Decoder<'a, T> { + impl Decoder<'_, T> { pub fn collect(self) -> Vec { let mut vec = Vec::new(); self.collect_into(&mut vec); diff --git a/crates/polars-parquet/src/parquet/encoding/byte_stream_split/decoder.rs b/crates/polars-parquet/src/parquet/encoding/byte_stream_split/decoder.rs index 793fa6f111d7..1b383e9522f1 100644 --- a/crates/polars-parquet/src/parquet/encoding/byte_stream_split/decoder.rs +++ b/crates/polars-parquet/src/parquet/encoding/byte_stream_split/decoder.rs @@ -96,7 +96,7 @@ where converter: F, } -impl<'a, 'b, T, F> Iterator for DecoderIterator<'a, 'b, T, F> +impl Iterator for DecoderIterator<'_, '_, T, F> where F: Copy + Fn(&[u8]) -> T, { diff --git a/crates/polars-parquet/src/parquet/encoding/delta_byte_array/decoder.rs b/crates/polars-parquet/src/parquet/encoding/delta_byte_array/decoder.rs index 03889e0aa5d3..deb95f1dd3a2 100644 --- a/crates/polars-parquet/src/parquet/encoding/delta_byte_array/decoder.rs +++ b/crates/polars-parquet/src/parquet/encoding/delta_byte_array/decoder.rs @@ -56,7 +56,7 @@ impl<'a> Decoder<'a> { mod tests { use super::*; - impl<'a> Iterator for Decoder<'a> { + impl Iterator for Decoder<'_> { type Item = ParquetResult>; fn next(&mut self) -> Option { diff --git a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/bitmap.rs b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/bitmap.rs index f46f22f84adb..0d67dc935857 100644 --- a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/bitmap.rs +++ b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/bitmap.rs @@ -39,7 +39,7 @@ impl<'a> BitmapIter<'a> { } } -impl<'a> Iterator for BitmapIter<'a> { 
+impl Iterator for BitmapIter<'_> { type Item = bool; #[inline] diff --git a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/buffered.rs b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/buffered.rs index 824638d253ad..95d53b2769e4 100644 --- a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/buffered.rs +++ b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/buffered.rs @@ -44,7 +44,7 @@ impl Iterator for BufferedRle { impl ExactSizeIterator for BufferedRle {} -impl<'a> Iterator for BufferedBitpacked<'a> { +impl Iterator for BufferedBitpacked<'_> { type Item = u32; fn next(&mut self) -> Option { @@ -74,9 +74,9 @@ impl<'a> Iterator for BufferedBitpacked<'a> { } } -impl<'a> ExactSizeIterator for BufferedBitpacked<'a> {} +impl ExactSizeIterator for BufferedBitpacked<'_> {} -impl<'a> Iterator for HybridRleBuffered<'a> { +impl Iterator for HybridRleBuffered<'_> { type Item = u32; fn next(&mut self) -> Option { @@ -94,9 +94,9 @@ impl<'a> Iterator for HybridRleBuffered<'a> { } } -impl<'a> ExactSizeIterator for HybridRleBuffered<'a> {} +impl ExactSizeIterator for HybridRleBuffered<'_> {} -impl<'a> BufferedBitpacked<'a> { +impl BufferedBitpacked<'_> { fn gather_limited_into>( &mut self, target: &mut G::Target, @@ -212,7 +212,7 @@ impl BufferedRle { } } -impl<'a> HybridRleBuffered<'a> { +impl HybridRleBuffered<'_> { pub fn gather_limited_into>( &mut self, target: &mut G::Target, diff --git a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/gatherer.rs b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/gatherer.rs index 1548f6e50a02..c66ef5873439 100644 --- a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/gatherer.rs +++ b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/gatherer.rs @@ -432,7 +432,7 @@ impl Translator for UnitTranslator { /// [`HybridRleDecoder`]: super::HybridRleDecoder pub struct DictionaryTranslator<'a, T>(pub &'a [T]); -impl<'a, T: Copy> Translator for DictionaryTranslator<'a, T> { +impl Translator for 
DictionaryTranslator<'_, T> { fn translate(&self, value: u32) -> ParquetResult { self.0 .get(value as usize) diff --git a/crates/polars-parquet/src/parquet/read/page/reader.rs b/crates/polars-parquet/src/parquet/read/page/reader.rs index 811557aa6ccb..7dfa2e144d8d 100644 --- a/crates/polars-parquet/src/parquet/read/page/reader.rs +++ b/crates/polars-parquet/src/parquet/read/page/reader.rs @@ -97,7 +97,7 @@ impl PageReader { Self::new_with_page_meta(reader, column.into(), scratch, max_page_size) } - /// Create a a new [`PageReader`] with [`PageMetaData`]. + /// Create a new [`PageReader`] with [`PageMetaData`]. /// /// It assumes that the reader has been `sought` (`seek`) to the beginning of `column`. pub fn new_with_page_meta( diff --git a/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs b/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs index ae98fd694e64..ccf293e48b08 100644 --- a/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs +++ b/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs @@ -303,7 +303,7 @@ fn parse_timeunit( }) } -impl<'a> Parser<'a> { +impl Parser<'_> { // Entry function to parse message type, uses internal tokenizer. fn parse_message_type(&mut self) -> ParquetResult { // Check that message type starts with "message". 
diff --git a/crates/polars-parquet/src/parquet/write/dyn_iter.rs b/crates/polars-parquet/src/parquet/write/dyn_iter.rs index f47710b56b22..a232c06375e8 100644 --- a/crates/polars-parquet/src/parquet/write/dyn_iter.rs +++ b/crates/polars-parquet/src/parquet/write/dyn_iter.rs @@ -7,7 +7,7 @@ pub struct DynIter<'a, V> { iter: Box + 'a + Send + Sync>, } -impl<'a, V> Iterator for DynIter<'a, V> { +impl Iterator for DynIter<'_, V> { type Item = V; fn next(&mut self) -> Option { self.iter.next() @@ -35,7 +35,7 @@ pub struct DynStreamingIterator<'a, V, E> { iter: Box + 'a + Send + Sync>, } -impl<'a, V, E> FallibleStreamingIterator for DynStreamingIterator<'a, V, E> { +impl FallibleStreamingIterator for DynStreamingIterator<'_, V, E> { type Item = V; type Error = E; diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/ooc_state.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/ooc_state.rs index 77a939c64290..280cd236afa6 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/ooc_state.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/ooc_state.rs @@ -8,7 +8,7 @@ use crate::pipeline::{morsels_per_sink, FORCE_OOC}; pub(super) struct OocState { // OOC // Stores available memory in the system at the start of this sink. - // and stores the memory used by this this sink. + // and stores the memory used by this sink. mem_track: MemTracker, // sort in-memory or out-of-core pub(super) ooc: bool, diff --git a/crates/polars-pipe/src/executors/sinks/group_by/ooc_state.rs b/crates/polars-pipe/src/executors/sinks/group_by/ooc_state.rs index 1f79e20bcdab..f2c664087daf 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/ooc_state.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/ooc_state.rs @@ -13,7 +13,7 @@ use crate::pipeline::morsels_per_sink; pub(super) struct OocState { // OOC // Stores available memory in the system at the start of this sink. - // and stores the memory used by this this sink. 
+ // and stores the memory used by this sink. _mem_track: MemTracker, // sort in-memory or out-of-core pub(super) ooc: bool, diff --git a/crates/polars-pipe/src/executors/sinks/joins/cross.rs b/crates/polars-pipe/src/executors/sinks/joins/cross.rs index d6014c344978..77466578d3e9 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/cross.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/cross.rs @@ -111,7 +111,7 @@ impl Operator for CrossJoinProbe { _context: &PExecutionContext, chunk: &DataChunk, ) -> PolarsResult { - // Expected output is size**2, so this needs to be a a small number. + // Expected output is size**2, so this needs to be a small number. // However, if one of the DataFrames is much smaller than 250, we want // to take rather more from the other DataFrame so we don't end up with // overly small chunks. diff --git a/crates/polars-pipe/src/executors/sinks/sort/sink.rs b/crates/polars-pipe/src/executors/sinks/sort/sink.rs index 43589c9783a1..49d51cc2e2fb 100644 --- a/crates/polars-pipe/src/executors/sinks/sort/sink.rs +++ b/crates/polars-pipe/src/executors/sinks/sort/sink.rs @@ -20,7 +20,7 @@ pub struct SortSink { schema: SchemaRef, chunks: Vec, // Stores available memory in the system at the start of this sink. - // and stores the memory used by this this sink. + // and stores the memory used by this sink. mem_track: MemTracker, // sort in-memory or out-of-core ooc: bool, diff --git a/crates/polars-pipe/src/operators/chunks.rs b/crates/polars-pipe/src/operators/chunks.rs index c1f63019a611..d237510b4636 100644 --- a/crates/polars-pipe/src/operators/chunks.rs +++ b/crates/polars-pipe/src/operators/chunks.rs @@ -39,7 +39,7 @@ pub(crate) fn chunks_to_df_unchecked(chunks: Vec) -> DataFrame { /// /// The benefit of having a series of `DataFrame` that are e.g. 
4MB each that /// are then made contiguous is that you're not using a lot of memory (an extra -/// 4MB), but you're still doing better than if you had a series of of 2KB +/// 4MB), but you're still doing better than if you had a series of 2KB /// `DataFrame`s. /// /// Changing the `DataFrame` into contiguous chunks is the caller's diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index 3d6d0009ea4d..9c5a3fe32913 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -186,7 +186,7 @@ month_start = ["polars-time/month_start"] month_end = ["polars-time/month_end"] offset_by = ["polars-time/offset_by"] -bigidx = ["polars-core/bigidx"] +bigidx = ["polars-core/bigidx", "polars-utils/bigidx"] polars_cloud = ["serde", "ciborium"] ir_serde = ["serde", "polars-utils/ir_serde"] diff --git a/crates/polars-plan/src/client/check.rs b/crates/polars-plan/src/client/check.rs index f76f508643f0..97b1ed23da2e 100644 --- a/crates/polars-plan/src/client/check.rs +++ b/crates/polars-plan/src/client/check.rs @@ -6,6 +6,9 @@ use crate::plans::{DslPlan, FileScan, ScanSources}; /// Assert that the given [`DslPlan`] is eligible to be executed on Polars Cloud. 
pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { + if std::env::var("POLARS_SKIP_CLIENT_CHECK").as_deref() == Ok("1") { + return Ok(()); + } for plan_node in dsl.into_iter() { match plan_node { #[cfg(feature = "python")] diff --git a/crates/polars-plan/src/dsl/function_expr/list.rs b/crates/polars-plan/src/dsl/function_expr/list.rs index b9a07d97341a..ddf8fb1fff20 100644 --- a/crates/polars-plan/src/dsl/function_expr/list.rs +++ b/crates/polars-plan/src/dsl/function_expr/list.rs @@ -4,7 +4,7 @@ use polars_ops::chunked_array::list::*; use super::*; use crate::{map, map_as_slice, wrap}; -#[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)] +#[derive(Clone, Eq, PartialEq, Hash, Debug)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum ListFunction { Concat, @@ -56,6 +56,8 @@ pub enum ListFunction { Join(bool), #[cfg(feature = "dtype-array")] ToArray(usize), + #[cfg(feature = "list_to_struct")] + ToStruct(ListToStructArgs), } impl ListFunction { @@ -103,6 +105,8 @@ impl ListFunction { #[cfg(feature = "dtype-array")] ToArray(width) => mapper.try_map_dtype(|dt| map_list_dtype_to_array_dtype(dt, *width)), NUnique => mapper.with_dtype(IDX_DTYPE), + #[cfg(feature = "list_to_struct")] + ToStruct(args) => mapper.try_map_dtype(|x| args.get_output_dtype(x)), } } } @@ -174,6 +178,8 @@ impl Display for ListFunction { Join(_) => "join", #[cfg(feature = "dtype-array")] ToArray(_) => "to_array", + #[cfg(feature = "list_to_struct")] + ToStruct(_) => "to_struct", }; write!(f, "list.{name}") } @@ -235,6 +241,8 @@ impl From for SpecialEq> { #[cfg(feature = "dtype-array")] ToArray(width) => map!(to_array, width), NUnique => map!(n_unique), + #[cfg(feature = "list_to_struct")] + ToStruct(args) => map!(to_struct, &args), } } } @@ -650,6 +658,11 @@ pub(super) fn to_array(s: &Column, width: usize) -> PolarsResult { s.cast(&array_dtype) } +#[cfg(feature = "list_to_struct")] +pub(super) fn to_struct(s: &Column, args: &ListToStructArgs) -> 
PolarsResult { + Ok(s.list()?.to_struct(args)?.into_series().into()) +} + pub(super) fn n_unique(s: &Column) -> PolarsResult { Ok(s.list()?.lst_n_unique()?.into_column()) } diff --git a/crates/polars-plan/src/dsl/function_expr/range/datetime_range.rs b/crates/polars-plan/src/dsl/function_expr/range/datetime_range.rs index a61264ce7aca..e220a7107435 100644 --- a/crates/polars-plan/src/dsl/function_expr/range/datetime_range.rs +++ b/crates/polars-plan/src/dsl/function_expr/range/datetime_range.rs @@ -223,7 +223,7 @@ pub(super) fn datetime_ranges( out.cast(&to_type).map(Column::from) } -impl<'a> FieldsMapper<'a> { +impl FieldsMapper<'_> { pub(super) fn map_to_datetime_range_dtype( &self, time_unit: Option<&TimeUnit>, diff --git a/crates/polars-plan/src/dsl/functions/syntactic_sugar.rs b/crates/polars-plan/src/dsl/functions/syntactic_sugar.rs index 8363f6baa2fa..4d0e4c105014 100644 --- a/crates/polars-plan/src/dsl/functions/syntactic_sugar.rs +++ b/crates/polars-plan/src/dsl/functions/syntactic_sugar.rs @@ -55,7 +55,7 @@ pub fn is_not_null(expr: Expr) -> Expr { /// Casts the column given by `Expr` to a different type. /// /// Follows the rules of Rust casting, with the exception that integers and floats can be cast to `DataType::Date` and -/// `DataType::DateTime(_, _)`. A column consisting entirely of of `Null` can be cast to any type, regardless of the +/// `DataType::DateTime(_, _)`. A column consisting entirely of `Null` can be cast to any type, regardless of the /// nominal type of the column. 
pub fn cast(expr: Expr, dtype: DataType) -> Expr { Expr::Cast { diff --git a/crates/polars-plan/src/dsl/list.rs b/crates/polars-plan/src/dsl/list.rs index ceec55123fcd..3a1a37c9f393 100644 --- a/crates/polars-plan/src/dsl/list.rs +++ b/crates/polars-plan/src/dsl/list.rs @@ -1,6 +1,3 @@ -#[cfg(feature = "list_to_struct")] -use std::sync::RwLock; - use polars_core::prelude::*; #[cfg(feature = "diff")] use polars_core::series::ops::NullBehavior; @@ -281,50 +278,9 @@ impl ListNameSpace { /// an `upper_bound` of struct fields that will be set. /// If this is incorrectly downstream operation may fail. For instance an `all().sum()` expression /// will look in the current schema to determine which columns to select. - pub fn to_struct( - self, - n_fields: ListToStructWidthStrategy, - name_generator: Option, - upper_bound: usize, - ) -> Expr { - // heap allocate the output type and fill it later - let out_dtype = Arc::new(RwLock::new(None::)); - + pub fn to_struct(self, args: ListToStructArgs) -> Expr { self.0 - .map( - move |s| { - s.list()? 
- .to_struct(n_fields, name_generator.clone()) - .map(|s| Some(s.into_column())) - }, - // we don't yet know the fields - GetOutput::map_dtype(move |dt: &DataType| { - polars_ensure!(matches!(dt, DataType::List(_)), SchemaMismatch: "expected 'List' as input to 'list.to_struct' got {}", dt); - let out = out_dtype.read().unwrap(); - match out.as_ref() { - // dtype already set - Some(dt) => Ok(dt.clone()), - // dtype still unknown, set it - None => { - drop(out); - let mut lock = out_dtype.write().unwrap(); - - let inner = dt.inner_dtype().unwrap(); - let fields = (0..upper_bound) - .map(|i| { - let name = _default_struct_name_gen(i); - Field::new(name, inner.clone()) - }) - .collect(); - let dt = DataType::Struct(fields); - - *lock = Some(dt.clone()); - Ok(dt) - }, - } - }), - ) - .with_fmt("list.to_struct") + .map_private(FunctionExpr::ListExpr(ListFunction::ToStruct(args))) } #[cfg(feature = "is_in")] diff --git a/crates/polars-plan/src/dsl/selector.rs b/crates/polars-plan/src/dsl/selector.rs index 16e7d7b374e0..7877edb152df 100644 --- a/crates/polars-plan/src/dsl/selector.rs +++ b/crates/polars-plan/src/dsl/selector.rs @@ -11,7 +11,7 @@ pub enum Selector { Add(Box, Box), Sub(Box, Box), ExclusiveOr(Box, Box), - InterSect(Box, Box), + Intersect(Box, Box), Root(Box), } @@ -34,7 +34,7 @@ impl BitAnd for Selector { #[allow(clippy::suspicious_arithmetic_impl)] fn bitand(self, rhs: Self) -> Self::Output { - Selector::InterSect(Box::new(self), Box::new(rhs)) + Selector::Intersect(Box::new(self), Box::new(rhs)) } } diff --git a/crates/polars-plan/src/dsl/string.rs b/crates/polars-plan/src/dsl/string.rs index efa34f59c04c..2514d1a5f6a4 100644 --- a/crates/polars-plan/src/dsl/string.rs +++ b/crates/polars-plan/src/dsl/string.rs @@ -593,7 +593,7 @@ impl StringNameSpace { ) } - #[cfg(feature = "strings")] + #[cfg(feature = "regex")] pub fn escape_regex(self) -> Expr { self.0.map_many_private( FunctionExpr::StringExpr(StringFunction::EscapeRegex), diff --git 
a/crates/polars-plan/src/plans/conversion/expr_expansion.rs b/crates/polars-plan/src/plans/conversion/expr_expansion.rs index bec3fbe852cd..4709641662f9 100644 --- a/crates/polars-plan/src/plans/conversion/expr_expansion.rs +++ b/crates/polars-plan/src/plans/conversion/expr_expansion.rs @@ -1,5 +1,4 @@ //! this contains code used for rewriting projections, expanding wildcards, regex selection etc. -use std::ops::BitXor; use super::*; @@ -176,26 +175,28 @@ fn expand_columns( schema: &Schema, exclude: &PlHashSet, ) -> PolarsResult<()> { - let mut is_valid = true; + if !expr.into_iter().all(|e| match e { + // check for invalid expansions such as `col([a, b]) + col([c, d])` + Expr::Columns(ref members) => members.as_ref() == names, + _ => true, + }) { + polars_bail!(ComputeError: "expanding more than one `col` is not allowed"); + } for name in names { if !exclude.contains(name) { - let new_expr = expr.clone(); - let (new_expr, new_expr_valid) = replace_columns_with_column(new_expr, names, name); - is_valid &= new_expr_valid; - // we may have regex col in columns. - #[allow(clippy::collapsible_else_if)] + let new_expr = expr.clone().map_expr(|e| match e { + Expr::Columns(_) => Expr::Column((*name).clone()), + Expr::Exclude(input, _) => Arc::unwrap_or_clone(input), + e => e, + }); + #[cfg(feature = "regex")] - { - replace_regex(&new_expr, result, schema, exclude)?; - } + replace_regex(&new_expr, result, schema, exclude)?; + #[cfg(not(feature = "regex"))] - { - let new_expr = rewrite_special_aliases(new_expr)?; - result.push(new_expr) - } + result.push(rewrite_special_aliases(new_expr)?); } } - polars_ensure!(is_valid, ComputeError: "expanding more than one `col` is not allowed"); Ok(()) } @@ -246,30 +247,6 @@ fn replace_dtype_or_index_with_column( }) } -/// This replaces the columns Expr with a Column Expr. It also removes the Exclude Expr from the -/// expression chain. 
-pub(super) fn replace_columns_with_column( - mut expr: Expr, - names: &[PlSmallStr], - column_name: &PlSmallStr, -) -> (Expr, bool) { - let mut is_valid = true; - expr = expr.map_expr(|e| match e { - Expr::Columns(members) => { - // `col([a, b]) + col([c, d])` - if members.as_ref() == names { - Expr::Column(column_name.clone()) - } else { - is_valid = false; - Expr::Columns(members) - } - }, - Expr::Exclude(input, _) => Arc::unwrap_or_clone(input), - e => e, - }); - (expr, is_valid) -} - fn dtypes_match(d1: &DataType, d2: &DataType) -> bool { match (d1, d2) { // note: allow Datetime "*" wildcard for timezones... @@ -562,7 +539,7 @@ fn expand_function_inputs( }) } -#[derive(Copy, Clone)] +#[derive(Copy, Clone, Debug)] struct ExpansionFlags { multiple_columns: bool, has_nth: bool, @@ -819,42 +796,31 @@ fn replace_selector_inner( members.extend(scratch.drain(..)) }, Selector::Add(lhs, rhs) => { + let mut tmp_members: PlIndexSet = Default::default(); replace_selector_inner(*lhs, members, scratch, schema, keys)?; - let mut rhs_members: PlIndexSet = Default::default(); - replace_selector_inner(*rhs, &mut rhs_members, scratch, schema, keys)?; - members.extend(rhs_members) + replace_selector_inner(*rhs, &mut tmp_members, scratch, schema, keys)?; + members.extend(tmp_members) }, Selector::ExclusiveOr(lhs, rhs) => { - let mut lhs_members = Default::default(); - replace_selector_inner(*lhs, &mut lhs_members, scratch, schema, keys)?; + let mut tmp_members = Default::default(); + replace_selector_inner(*lhs, &mut tmp_members, scratch, schema, keys)?; + replace_selector_inner(*rhs, members, scratch, schema, keys)?; - let mut rhs_members = Default::default(); - replace_selector_inner(*rhs, &mut rhs_members, scratch, schema, keys)?; - - let xor_members = lhs_members.bitxor(&rhs_members); - *members = xor_members; + *members = tmp_members.symmetric_difference(members).cloned().collect(); }, - Selector::InterSect(lhs, rhs) => { - replace_selector_inner(*lhs, members, scratch, 
schema, keys)?; + Selector::Intersect(lhs, rhs) => { + let mut tmp_members = Default::default(); + replace_selector_inner(*lhs, &mut tmp_members, scratch, schema, keys)?; + replace_selector_inner(*rhs, members, scratch, schema, keys)?; - let mut rhs_members = Default::default(); - replace_selector_inner(*rhs, &mut rhs_members, scratch, schema, keys)?; - - *members = members.intersection(&rhs_members).cloned().collect() + *members = tmp_members.intersection(members).cloned().collect(); }, Selector::Sub(lhs, rhs) => { - replace_selector_inner(*lhs, members, scratch, schema, keys)?; + let mut tmp_members = Default::default(); + replace_selector_inner(*lhs, &mut tmp_members, scratch, schema, keys)?; + replace_selector_inner(*rhs, members, scratch, schema, keys)?; - let mut rhs_members = Default::default(); - replace_selector_inner(*rhs, &mut rhs_members, scratch, schema, keys)?; - - let mut new_members = PlIndexSet::with_capacity(members.len()); - for e in members.drain(..) { - if !rhs_members.contains(&e) { - new_members.insert(e); - } - } - *members = new_members; + *members = tmp_members.difference(members).cloned().collect(); }, } Ok(()) diff --git a/crates/polars-plan/src/plans/ir/dot.rs b/crates/polars-plan/src/plans/ir/dot.rs index 51050f2fa877..76d8559a052d 100644 --- a/crates/polars-plan/src/plans/ir/dot.rs +++ b/crates/polars-plan/src/plans/ir/dot.rs @@ -420,7 +420,7 @@ impl fmt::Display for OptionExprIRDisplay<'_> { /// Utility structure to write to a [`fmt::Formatter`] whilst escaping the output as a label name pub struct EscapeLabel<'a>(pub &'a mut dyn fmt::Write); -impl<'a> fmt::Write for EscapeLabel<'a> { +impl fmt::Write for EscapeLabel<'_> { fn write_str(&mut self, mut s: &str) -> fmt::Result { loop { let mut char_indices = s.char_indices(); diff --git a/crates/polars-plan/src/plans/ir/format.rs b/crates/polars-plan/src/plans/ir/format.rs index 4ccb74f66238..c4ff7dfffb45 100644 --- a/crates/polars-plan/src/plans/ir/format.rs +++ 
b/crates/polars-plan/src/plans/ir/format.rs @@ -413,13 +413,13 @@ impl<'a> ExprIRDisplay<'a> { } } -impl<'a> Display for IRDisplay<'a> { +impl Display for IRDisplay<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { self._format(f, 0) } } -impl<'a, T: AsExpr> Display for ExprIRSliceDisplay<'a, T> { +impl Display for ExprIRSliceDisplay<'_, T> { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { // Display items in slice delimited by a comma @@ -452,13 +452,13 @@ impl<'a, T: AsExpr> Display for ExprIRSliceDisplay<'a, T> { } } -impl<'a, T: AsExpr> fmt::Debug for ExprIRSliceDisplay<'a, T> { +impl fmt::Debug for ExprIRSliceDisplay<'_, T> { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { Display::fmt(self, f) } } -impl<'a> Display for ExprIRDisplay<'a> { +impl Display for ExprIRDisplay<'_> { #[recursive] fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { let root = self.expr_arena.get(self.node); @@ -673,7 +673,7 @@ impl<'a> Display for ExprIRDisplay<'a> { } } -impl<'a> fmt::Debug for ExprIRDisplay<'a> { +impl fmt::Debug for ExprIRDisplay<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { Display::fmt(self, f) } diff --git a/crates/polars-plan/src/plans/ir/scan_sources.rs b/crates/polars-plan/src/plans/ir/scan_sources.rs index 789a5c4f4811..bcb80bd6140d 100644 --- a/crates/polars-plan/src/plans/ir/scan_sources.rs +++ b/crates/polars-plan/src/plans/ir/scan_sources.rs @@ -330,4 +330,4 @@ impl<'a> Iterator for ScanSourceIter<'a> { } } -impl<'a> ExactSizeIterator for ScanSourceIter<'a> {} +impl ExactSizeIterator for ScanSourceIter<'_> {} diff --git a/crates/polars-plan/src/plans/lit.rs b/crates/polars-plan/src/plans/lit.rs index 51dd550ee9e4..74feffd60da0 100644 --- a/crates/polars-plan/src/plans/lit.rs +++ b/crates/polars-plan/src/plans/lit.rs @@ -266,7 +266,7 @@ impl Literal for String { } } -impl<'a> Literal for &'a str { +impl Literal for &str { fn lit(self) -> Expr { Expr::Literal(LiteralValue::String(PlSmallStr::from_str(self))) } @@ 
-278,7 +278,7 @@ impl Literal for Vec { } } -impl<'a> Literal for &'a [u8] { +impl Literal for &[u8] { fn lit(self) -> Expr { Expr::Literal(LiteralValue::Binary(self.to_vec())) } diff --git a/crates/polars-plan/src/plans/optimizer/cluster_with_columns.rs b/crates/polars-plan/src/plans/optimizer/cluster_with_columns.rs index b3f52c6e30a9..4e109903fdce 100644 --- a/crates/polars-plan/src/plans/optimizer/cluster_with_columns.rs +++ b/crates/polars-plan/src/plans/optimizer/cluster_with_columns.rs @@ -141,7 +141,7 @@ pub fn optimize(root: Node, lp_arena: &mut Arena, expr_arena: &Arena) // @NOTE: Pruning of re-assigned columns // // We checked if this expression output is also assigned by the input and - // that that assignment is not used in the current WITH_COLUMNS. + // that this assignment is not used in the current WITH_COLUMNS. // Consequently, we are free to prune the input's assignment to the output. // // We immediately prune here to simplify the later code. diff --git a/crates/polars-plan/src/plans/optimizer/collapse_joins.rs b/crates/polars-plan/src/plans/optimizer/collapse_joins.rs index 608c7122f2ec..778efee6aa9b 100644 --- a/crates/polars-plan/src/plans/optimizer/collapse_joins.rs +++ b/crates/polars-plan/src/plans/optimizer/collapse_joins.rs @@ -123,7 +123,7 @@ struct MintermIter<'a> { expr_arena: &'a Arena, } -impl<'a> Iterator for MintermIter<'a> { +impl Iterator for MintermIter<'_> { type Item = Node; fn next(&mut self) -> Option { diff --git a/crates/polars-plan/src/plans/optimizer/cse/cse_lp.rs b/crates/polars-plan/src/plans/optimizer/cse/cse_lp.rs index 075414597edf..f4522ee3a3ca 100644 --- a/crates/polars-plan/src/plans/optimizer/cse/cse_lp.rs +++ b/crates/polars-plan/src/plans/optimizer/cse/cse_lp.rs @@ -184,7 +184,7 @@ fn skip_children(lp: &IR) -> bool { } } -impl<'a> Visitor for LpIdentifierVisitor<'a> { +impl Visitor for LpIdentifierVisitor<'_> { type Node = IRNode; type Arena = IRNodeArena; @@ -265,7 +265,7 @@ impl<'a> CommonSubPlanRewriter<'a> 
{ } } -impl<'a> RewritingVisitor for CommonSubPlanRewriter<'a> { +impl RewritingVisitor for CommonSubPlanRewriter<'_> { type Node = IRNode; type Arena = IRNodeArena; diff --git a/crates/polars-plan/src/plans/optimizer/simplify_expr/mod.rs b/crates/polars-plan/src/plans/optimizer/simplify_expr/mod.rs index 1df68a0adcfa..db123a5bd09d 100644 --- a/crates/polars-plan/src/plans/optimizer/simplify_expr/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/simplify_expr/mod.rs @@ -152,7 +152,7 @@ impl OptimizationRule for SimplifyBooleanRule { AExpr::Literal(LiteralValue::Boolean(true)) ) && in_filter => { - // Only in filter as we we might change the name from "literal" + // Only in filter as we might change the name from "literal" // to whatever lhs columns is. return Ok(Some(expr_arena.get(*right).clone())); }, @@ -210,7 +210,7 @@ impl OptimizationRule for SimplifyBooleanRule { AExpr::Literal(LiteralValue::Boolean(false)) ) && in_filter => { - // Only in filter as we we might change the name from "literal" + // Only in filter as we might change the name from "literal" // to whatever lhs columns is. 
return Ok(Some(expr_arena.get(*right).clone())); }, diff --git a/crates/polars-python/Cargo.toml b/crates/polars-python/Cargo.toml index 8078e919c38e..16af7a3071df 100644 --- a/crates/polars-python/Cargo.toml +++ b/crates/polars-python/Cargo.toml @@ -255,7 +255,7 @@ all = [ "binary_encoding", "ffi_plugin", "polars_cloud", - # "new_streaming", + "new_streaming", ] # we cannot conditionally activate simd diff --git a/crates/polars-python/src/dataframe/general.rs b/crates/polars-python/src/dataframe/general.rs index d7bbab29177f..ac4febced0f6 100644 --- a/crates/polars-python/src/dataframe/general.rs +++ b/crates/polars-python/src/dataframe/general.rs @@ -591,11 +591,11 @@ impl PyDataFrame { every: &str, stable: bool, ) -> PyResult { + let every = Duration::try_parse(every).map_err(PyPolarsErr::from)?; let out = if stable { - self.df - .upsample_stable(by, index_column, Duration::parse(every)) + self.df.upsample_stable(by, index_column, every) } else { - self.df.upsample(by, index_column, Duration::parse(every)) + self.df.upsample(by, index_column, every) }; let out = out.map_err(PyPolarsErr::from)?; Ok(out.into()) diff --git a/crates/polars-python/src/expr/general.rs b/crates/polars-python/src/expr/general.rs index 604049f62b66..7125388e88cd 100644 --- a/crates/polars-python/src/expr/general.rs +++ b/crates/polars-python/src/expr/general.rs @@ -9,6 +9,7 @@ use pyo3::class::basic::CompareOp; use pyo3::prelude::*; use crate::conversion::{parse_fill_null_strategy, vec_extract_wrapped, Wrap}; +use crate::error::PyPolarsErr; use crate::map::lazy::map_single; use crate::PyExpr; @@ -614,15 +615,15 @@ impl PyExpr { period: &str, offset: &str, closed: Wrap, - ) -> Self { + ) -> PyResult { let options = RollingGroupOptions { index_column: index_column.into(), - period: Duration::parse(period), - offset: Duration::parse(offset), + period: Duration::try_parse(period).map_err(PyPolarsErr::from)?, + offset: Duration::try_parse(offset).map_err(PyPolarsErr::from)?, closed_window: 
closed.0, }; - self.inner.clone().rolling(options).into() + Ok(self.inner.clone().rolling(options).into()) } fn and_(&self, expr: Self) -> Self { @@ -812,12 +813,13 @@ impl PyExpr { }; self.inner.clone().ewm_mean(options).into() } - fn ewm_mean_by(&self, times: PyExpr, half_life: &str) -> Self { - let half_life = Duration::parse(half_life); - self.inner + fn ewm_mean_by(&self, times: PyExpr, half_life: &str) -> PyResult { + let half_life = Duration::try_parse(half_life).map_err(PyPolarsErr::from)?; + Ok(self + .inner .clone() .ewm_mean_by(times.inner, half_life) - .into() + .into()) } fn ewm_std( diff --git a/crates/polars-python/src/expr/list.rs b/crates/polars-python/src/expr/list.rs index 1bd087144634..af3be10449b1 100644 --- a/crates/polars-python/src/expr/list.rs +++ b/crates/polars-python/src/expr/list.rs @@ -4,6 +4,7 @@ use polars::prelude::*; use polars::series::ops::NullBehavior; use polars_utils::pl_str::PlSmallStr; use pyo3::prelude::*; +use pyo3::types::PySequence; use crate::conversion::Wrap; use crate::PyExpr; @@ -214,20 +215,39 @@ impl PyExpr { upper_bound: usize, ) -> PyResult { let name_gen = name_gen.map(|lambda| { - Arc::new(move |idx: usize| { + NameGenerator::from_func(move |idx: usize| { Python::with_gil(|py| { let out = lambda.call1(py, (idx,)).unwrap(); let out: PlSmallStr = out.extract::>(py).unwrap().as_ref().into(); out }) - }) as NameGenerator + }) }); Ok(self .inner .clone() .list() - .to_struct(width_strat.0, name_gen, upper_bound) + .to_struct(ListToStructArgs::InferWidth { + infer_field_strategy: width_strat.0, + get_index_name: name_gen, + max_fields: upper_bound, + }) + .into()) + } + + #[pyo3(signature = (names))] + fn list_to_struct_fixed_width(&self, names: Bound<'_, PySequence>) -> PyResult { + Ok(self + .inner + .clone() + .list() + .to_struct(ListToStructArgs::FixedWidth( + names + .iter()? 
+ .map(|x| Ok(x?.extract::>()?.0)) + .collect::>>()?, + )) .into()) } diff --git a/crates/polars-python/src/expr/rolling.rs b/crates/polars-python/src/expr/rolling.rs index a5ef9213128f..5ef511902613 100644 --- a/crates/polars-python/src/expr/rolling.rs +++ b/crates/polars-python/src/expr/rolling.rs @@ -3,6 +3,7 @@ use pyo3::prelude::*; use pyo3::types::PyFloat; use crate::conversion::Wrap; +use crate::error::PyPolarsErr; use crate::map::lazy::call_lambda_with_series; use crate::{PyExpr, PySeries}; @@ -34,14 +35,14 @@ impl PyExpr { window_size: &str, min_periods: usize, closed: Wrap, - ) -> Self { + ) -> PyResult { let options = RollingOptionsDynamicWindow { - window_size: Duration::parse(window_size), + window_size: Duration::try_parse(window_size).map_err(PyPolarsErr::from)?, min_periods, closed_window: closed.0, fn_params: None, }; - self.inner.clone().rolling_sum_by(by.inner, options).into() + Ok(self.inner.clone().rolling_sum_by(by.inner, options).into()) } #[pyo3(signature = (window_size, weights, min_periods, center))] @@ -70,14 +71,14 @@ impl PyExpr { window_size: &str, min_periods: usize, closed: Wrap, - ) -> Self { + ) -> PyResult { let options = RollingOptionsDynamicWindow { - window_size: Duration::parse(window_size), + window_size: Duration::try_parse(window_size).map_err(PyPolarsErr::from)?, min_periods, closed_window: closed.0, fn_params: None, }; - self.inner.clone().rolling_min_by(by.inner, options).into() + Ok(self.inner.clone().rolling_min_by(by.inner, options).into()) } #[pyo3(signature = (window_size, weights, min_periods, center))] @@ -105,14 +106,14 @@ impl PyExpr { window_size: &str, min_periods: usize, closed: Wrap, - ) -> Self { + ) -> PyResult { let options = RollingOptionsDynamicWindow { - window_size: Duration::parse(window_size), + window_size: Duration::try_parse(window_size).map_err(PyPolarsErr::from)?, min_periods, closed_window: closed.0, fn_params: None, }; - self.inner.clone().rolling_max_by(by.inner, options).into() + 
Ok(self.inner.clone().rolling_max_by(by.inner, options).into()) } #[pyo3(signature = (window_size, weights, min_periods, center))] @@ -142,15 +143,15 @@ impl PyExpr { window_size: &str, min_periods: usize, closed: Wrap, - ) -> Self { + ) -> PyResult { let options = RollingOptionsDynamicWindow { - window_size: Duration::parse(window_size), + window_size: Duration::try_parse(window_size).map_err(PyPolarsErr::from)?, min_periods, closed_window: closed.0, fn_params: None, }; - self.inner.clone().rolling_mean_by(by.inner, options).into() + Ok(self.inner.clone().rolling_mean_by(by.inner, options).into()) } #[pyo3(signature = (window_size, weights, min_periods, center, ddof))] @@ -182,15 +183,15 @@ impl PyExpr { min_periods: usize, closed: Wrap, ddof: u8, - ) -> Self { + ) -> PyResult { let options = RollingOptionsDynamicWindow { - window_size: Duration::parse(window_size), + window_size: Duration::try_parse(window_size).map_err(PyPolarsErr::from)?, min_periods, closed_window: closed.0, fn_params: Some(RollingFnParams::Var(RollingVarParams { ddof })), }; - self.inner.clone().rolling_std_by(by.inner, options).into() + Ok(self.inner.clone().rolling_std_by(by.inner, options).into()) } #[pyo3(signature = (window_size, weights, min_periods, center, ddof))] @@ -222,15 +223,15 @@ impl PyExpr { min_periods: usize, closed: Wrap, ddof: u8, - ) -> Self { + ) -> PyResult { let options = RollingOptionsDynamicWindow { - window_size: Duration::parse(window_size), + window_size: Duration::try_parse(window_size).map_err(PyPolarsErr::from)?, min_periods, closed_window: closed.0, fn_params: Some(RollingFnParams::Var(RollingVarParams { ddof })), }; - self.inner.clone().rolling_var_by(by.inner, options).into() + Ok(self.inner.clone().rolling_var_by(by.inner, options).into()) } #[pyo3(signature = (window_size, weights, min_periods, center))] @@ -259,17 +260,18 @@ impl PyExpr { window_size: &str, min_periods: usize, closed: Wrap, - ) -> Self { + ) -> PyResult { let options = 
RollingOptionsDynamicWindow { - window_size: Duration::parse(window_size), + window_size: Duration::try_parse(window_size).map_err(PyPolarsErr::from)?, min_periods, closed_window: closed.0, fn_params: None, }; - self.inner + Ok(self + .inner .clone() .rolling_median_by(by.inner, options) - .into() + .into()) } #[pyo3(signature = (quantile, interpolation, window_size, weights, min_periods, center))] @@ -306,18 +308,19 @@ impl PyExpr { window_size: &str, min_periods: usize, closed: Wrap, - ) -> Self { + ) -> PyResult { let options = RollingOptionsDynamicWindow { - window_size: Duration::parse(window_size), + window_size: Duration::try_parse(window_size).map_err(PyPolarsErr::from)?, min_periods, closed_window: closed.0, fn_params: None, }; - self.inner + Ok(self + .inner .clone() .rolling_quantile_by(by.inner, interpolation.0, quantile, options) - .into() + .into()) } fn rolling_skew(&self, window_size: usize, bias: bool) -> Self { diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index 1aab5d9c5829..efbcbff3fc18 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -25,7 +25,7 @@ pub struct PyFileLikeObject { /// Wraps a `PyObject`, and implements read, seek, and write for it. impl PyFileLikeObject { /// Creates an instance of a `PyFileLikeObject` from a `PyObject`. 
- /// To assert the object has the required methods methods, + /// To assert the object has the required methods, /// instantiate it with `PyFileLikeObject::require` pub fn new(object: PyObject) -> Self { PyFileLikeObject { inner: object } diff --git a/crates/polars-python/src/functions/range.rs b/crates/polars-python/src/functions/range.rs index e6e421dac84c..b6eae4400dd8 100644 --- a/crates/polars-python/src/functions/range.rs +++ b/crates/polars-python/src/functions/range.rs @@ -71,12 +71,12 @@ pub fn date_range( end: PyExpr, interval: &str, closed: Wrap, -) -> PyExpr { +) -> PyResult { let start = start.inner; let end = end.inner; - let interval = Duration::parse(interval); + let interval = Duration::try_parse(interval).map_err(PyPolarsErr::from)?; let closed = closed.0; - dsl::date_range(start, end, interval, closed).into() + Ok(dsl::date_range(start, end, interval, closed).into()) } #[pyfunction] @@ -85,12 +85,12 @@ pub fn date_ranges( end: PyExpr, interval: &str, closed: Wrap, -) -> PyExpr { +) -> PyResult { let start = start.inner; let end = end.inner; - let interval = Duration::parse(interval); + let interval = Duration::try_parse(interval).map_err(PyPolarsErr::from)?; let closed = closed.0; - dsl::date_ranges(start, end, interval, closed).into() + Ok(dsl::date_ranges(start, end, interval, closed).into()) } #[pyfunction] @@ -102,14 +102,14 @@ pub fn datetime_range( closed: Wrap, time_unit: Option>, time_zone: Option>, -) -> PyExpr { +) -> PyResult { let start = start.inner; let end = end.inner; - let every = Duration::parse(every); + let every = Duration::try_parse(every).map_err(PyPolarsErr::from)?; let closed = closed.0; let time_unit = time_unit.map(|x| x.0); let time_zone = time_zone.map(|x| x.0); - dsl::datetime_range(start, end, every, closed, time_unit, time_zone).into() + Ok(dsl::datetime_range(start, end, every, closed, time_unit, time_zone).into()) } #[pyfunction] @@ -121,30 +121,40 @@ pub fn datetime_ranges( closed: Wrap, time_unit: Option>, 
time_zone: Option>, -) -> PyExpr { +) -> PyResult { let start = start.inner; let end = end.inner; - let every = Duration::parse(every); + let every = Duration::try_parse(every).map_err(PyPolarsErr::from)?; let closed = closed.0; let time_unit = time_unit.map(|x| x.0); let time_zone = time_zone.map(|x| x.0); - dsl::datetime_ranges(start, end, every, closed, time_unit, time_zone).into() + Ok(dsl::datetime_ranges(start, end, every, closed, time_unit, time_zone).into()) } #[pyfunction] -pub fn time_range(start: PyExpr, end: PyExpr, every: &str, closed: Wrap) -> PyExpr { +pub fn time_range( + start: PyExpr, + end: PyExpr, + every: &str, + closed: Wrap, +) -> PyResult { let start = start.inner; let end = end.inner; - let every = Duration::parse(every); + let every = Duration::try_parse(every).map_err(PyPolarsErr::from)?; let closed = closed.0; - dsl::time_range(start, end, every, closed).into() + Ok(dsl::time_range(start, end, every, closed).into()) } #[pyfunction] -pub fn time_ranges(start: PyExpr, end: PyExpr, every: &str, closed: Wrap) -> PyExpr { +pub fn time_ranges( + start: PyExpr, + end: PyExpr, + every: &str, + closed: Wrap, +) -> PyResult { let start = start.inner; let end = end.inner; - let every = Duration::parse(every); + let every = Duration::try_parse(every).map_err(PyPolarsErr::from)?; let closed = closed.0; - dsl::time_ranges(start, end, every, closed).into() + Ok(dsl::time_ranges(start, end, every, closed).into()) } diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index c08b838404ba..da0b597418eb 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -842,7 +842,7 @@ impl PyLazyFrame { offset: &str, closed: Wrap, by: Vec, - ) -> PyLazyGroupBy { + ) -> PyResult { let closed_window = closed.0; let ldf = self.ldf.clone(); let by = by @@ -854,13 +854,13 @@ impl PyLazyFrame { by, RollingGroupOptions { index_column: "".into(), - period: 
Duration::parse(period), - offset: Duration::parse(offset), + period: Duration::try_parse(period).map_err(PyPolarsErr::from)?, + offset: Duration::try_parse(offset).map_err(PyPolarsErr::from)?, closed_window, }, ); - PyLazyGroupBy { lgb: Some(lazy_gb) } + Ok(PyLazyGroupBy { lgb: Some(lazy_gb) }) } fn group_by_dynamic( @@ -874,7 +874,7 @@ impl PyLazyFrame { closed: Wrap, group_by: Vec, start_by: Wrap, - ) -> PyLazyGroupBy { + ) -> PyResult { let closed_window = closed.0; let group_by = group_by .into_iter() @@ -885,9 +885,9 @@ impl PyLazyFrame { index_column.inner, group_by, DynamicGroupOptions { - every: Duration::parse(every), - period: Duration::parse(period), - offset: Duration::parse(offset), + every: Duration::try_parse(every).map_err(PyPolarsErr::from)?, + period: Duration::try_parse(period).map_err(PyPolarsErr::from)?, + offset: Duration::try_parse(offset).map_err(PyPolarsErr::from)?, label: label.0, include_boundaries, closed_window, @@ -896,7 +896,7 @@ impl PyLazyFrame { }, ); - PyLazyGroupBy { lgb: Some(lazy_gb) } + Ok(PyLazyGroupBy { lgb: Some(lazy_gb) }) } fn with_context(&self, contexts: Vec) -> Self { diff --git a/crates/polars-sql/src/sql_expr.rs b/crates/polars-sql/src/sql_expr.rs index a6fe495d1ba5..5eb2bdd843b4 100644 --- a/crates/polars-sql/src/sql_expr.rs +++ b/crates/polars-sql/src/sql_expr.rs @@ -374,10 +374,9 @@ impl SQLExprVisitor<'_> { }, // identify "CAST(expr AS type) string" and/or "expr::type string" expressions (Expr::Cast { expr, dtype, .. 
}, Expr::Literal(LiteralValue::String(s))) => { - if let Expr::Column(name) = &**expr { - (Some(name.clone()), Some(s), Some(dtype)) - } else { - (None, Some(s), Some(dtype)) + match &**expr { + Expr::Column(name) => (Some(name.clone()), Some(s), Some(dtype)), + _ => (None, Some(s), Some(dtype)), } }, _ => (None, None, None), @@ -385,23 +384,25 @@ impl SQLExprVisitor<'_> { if expr_dtype.is_none() && self.active_schema.is_none() { right.clone() } else { - let left_dtype = expr_dtype - .unwrap_or_else(|| self.active_schema.as_ref().unwrap().get(&name).unwrap()); - + let left_dtype = expr_dtype.or_else(|| { + self.active_schema + .as_ref() + .and_then(|schema| schema.get(&name)) + }); match left_dtype { - DataType::Time if is_iso_time(s) => { + Some(DataType::Time) if is_iso_time(s) => { right.clone().str().to_time(StrptimeOptions { strict: true, ..Default::default() }) }, - DataType::Date if is_iso_date(s) => { + Some(DataType::Date) if is_iso_date(s) => { right.clone().str().to_date(StrptimeOptions { strict: true, ..Default::default() }) }, - DataType::Datetime(tu, tz) if is_iso_datetime(s) || is_iso_date(s) => { + Some(DataType::Datetime(tu, tz)) if is_iso_datetime(s) || is_iso_date(s) => { if s.len() == 10 { // handle upcast from ISO date string (10 chars) to datetime lit(format!("{}T00:00:00", s)) diff --git a/crates/polars-stream/Cargo.toml b/crates/polars-stream/Cargo.toml index ddf4e7be4f18..fc130a035140 100644 --- a/crates/polars-stream/Cargo.toml +++ b/crates/polars-stream/Cargo.toml @@ -37,6 +37,11 @@ version_check = { workspace = true } [features] nightly = [] -bitwise = ["polars-core/bitwise", "polars-plan/bitwise"] +bitwise = ["polars-core/bitwise", "polars-plan/bitwise", "polars-expr/bitwise"] merge_sorted = ["polars-plan/merge_sorted"] dynamic_group_by = [] +strings = [] + +# We need to specify default features here to match workspace defaults. +# Otherwise we get warnings with cargo check/clippy. 
+default = ["bitwise"] diff --git a/crates/polars-stream/src/async_executor/mod.rs b/crates/polars-stream/src/async_executor/mod.rs index 243109e5facb..23789e5a20df 100644 --- a/crates/polars-stream/src/async_executor/mod.rs +++ b/crates/polars-stream/src/async_executor/mod.rs @@ -308,7 +308,7 @@ pub struct TaskScope<'scope, 'env: 'scope> { env: PhantomData<&'env mut &'env ()>, } -impl<'scope, 'env> TaskScope<'scope, 'env> { +impl<'scope> TaskScope<'scope, '_> { // Not Drop because that extends lifetimes. fn destroy(&self) { // Make sure all tasks are cancelled. diff --git a/crates/polars-stream/src/async_executor/park_group.rs b/crates/polars-stream/src/async_executor/park_group.rs index d9da30ce7f3e..d72a474da1e4 100644 --- a/crates/polars-stream/src/async_executor/park_group.rs +++ b/crates/polars-stream/src/async_executor/park_group.rs @@ -149,7 +149,7 @@ pub struct ParkAttempt<'a> { worker: &'a mut ParkGroupWorker, } -impl<'a> ParkAttempt<'a> { +impl ParkAttempt<'_> { /// Actually park this worker. 
/// /// If there were calls to unpark between calling prepare_park() and park(), diff --git a/crates/polars-stream/src/async_executor/task.rs b/crates/polars-stream/src/async_executor/task.rs index 9991377eb718..1383da2edde8 100644 --- a/crates/polars-stream/src/async_executor/task.rs +++ b/crates/polars-stream/src/async_executor/task.rs @@ -118,9 +118,9 @@ where } } -impl<'a, F, S, M> Wake for Task +impl Wake for Task where - F: Future + Send + 'a, + F: Future + Send, F::Output: Send + 'static, S: Fn(Runnable) + Send + Sync + Copy + 'static, M: Send + Sync + 'static, @@ -143,9 +143,9 @@ pub trait DynTask: Send + Sync { fn schedule(self: Arc); } -impl<'a, F, S, M> DynTask for Task +impl DynTask for Task where - F: Future + Send + 'a, + F: Future + Send, F::Output: Send + 'static, S: Fn(Runnable) + Send + Sync + Copy + 'static, M: Send + Sync + 'static, @@ -202,9 +202,9 @@ trait Joinable: Send + Sync { fn poll_join(&self, ctx: &mut Context<'_>) -> Poll; } -impl<'a, F, S, M> Joinable for Task +impl Joinable for Task where - F: Future + Send + 'a, + F: Future + Send, F::Output: Send + 'static, S: Fn(Runnable) + Send + Sync + Copy + 'static, M: Send + Sync + 'static, @@ -233,9 +233,9 @@ trait Cancellable: Send + Sync { fn cancel(&self); } -impl<'a, F, S, M> Cancellable for Task +impl Cancellable for Task where - F: Future + Send + 'a, + F: Future + Send, F::Output: Send + 'static, S: Send + Sync + 'static, M: Send + Sync + 'static, diff --git a/crates/polars-stream/src/async_primitives/connector.rs b/crates/polars-stream/src/async_primitives/connector.rs index 94999fff4e7a..8b53193b95f1 100644 --- a/crates/polars-stream/src/async_primitives/connector.rs +++ b/crates/polars-stream/src/async_primitives/connector.rs @@ -217,7 +217,7 @@ pin_project! { } } -unsafe impl<'a, T: Send> Send for SendFuture<'a, T> {} +unsafe impl Send for SendFuture<'_, T> {} impl Sender { /// Returns a future that when awaited will send the value to the [`Receiver`]. 
@@ -255,7 +255,7 @@ pin_project! { } } -unsafe impl<'a, T: Send> Send for RecvFuture<'a, T> {} +unsafe impl Send for RecvFuture<'_, T> {} impl Receiver { /// Returns a future that when awaited will return `Ok(value)` once the diff --git a/crates/polars-stream/src/async_primitives/task_parker.rs b/crates/polars-stream/src/async_primitives/task_parker.rs index 9e48b79e468b..d6cde679980b 100644 --- a/crates/polars-stream/src/async_primitives/task_parker.rs +++ b/crates/polars-stream/src/async_primitives/task_parker.rs @@ -43,7 +43,7 @@ pub struct TaskParkFuture<'a> { parker: &'a TaskParker, } -impl<'a> Future for TaskParkFuture<'a> { +impl Future for TaskParkFuture<'_> { type Output = (); fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { diff --git a/crates/polars-stream/src/async_primitives/wait_group.rs b/crates/polars-stream/src/async_primitives/wait_group.rs index 716363528505..e08f556d3b95 100644 --- a/crates/polars-stream/src/async_primitives/wait_group.rs +++ b/crates/polars-stream/src/async_primitives/wait_group.rs @@ -62,7 +62,7 @@ impl Future for WaitGroupFuture<'_> { } } -impl<'a> Drop for WaitGroupFuture<'a> { +impl Drop for WaitGroupFuture<'_> { fn drop(&mut self) { self.inner.is_waiting.store(false, Ordering::Relaxed); } diff --git a/crates/polars-stream/src/nodes/filter.rs b/crates/polars-stream/src/nodes/filter.rs index 9f0b0301ef91..f89a53adbf23 100644 --- a/crates/polars-stream/src/nodes/filter.rs +++ b/crates/polars-stream/src/nodes/filter.rs @@ -27,14 +27,14 @@ impl ComputeNode for FilterNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], state: &'s ExecutionState, join_handles: &mut Vec>>, ) { - assert!(recv.len() == 1 && send.len() == 1); - let receivers = recv[0].take().unwrap().parallel(); - let senders = send[0].take().unwrap().parallel(); + assert!(recv_ports.len() == 1 && send_ports.len() == 1); + let 
receivers = recv_ports[0].take().unwrap().parallel(); + let senders = send_ports[0].take().unwrap().parallel(); for (mut recv, mut send) in receivers.into_iter().zip(senders) { let slf = &*self; diff --git a/crates/polars-stream/src/nodes/group_by.rs b/crates/polars-stream/src/nodes/group_by.rs index 6954263bb99b..c534924d1433 100644 --- a/crates/polars-stream/src/nodes/group_by.rs +++ b/crates/polars-stream/src/nodes/group_by.rs @@ -1,9 +1,14 @@ +use std::mem::ManuallyDrop; use std::sync::Arc; use polars_core::prelude::IntoColumn; use polars_core::schema::Schema; +use polars_core::utils::accumulate_dataframes_vertical_unchecked; use polars_expr::groups::Grouper; use polars_expr::reduce::GroupedReduction; +use polars_utils::itertools::Itertools; +use polars_utils::sync::SyncPtr; +use rayon::prelude::*; use super::compute_node_prelude::*; use crate::async_primitives::connector::Receiver; @@ -77,12 +82,13 @@ impl GroupBySinkState { } } - fn into_source(mut self, output_schema: &Schema) -> PolarsResult { - // TODO: parallelize this with partitions. 
+ fn combine_locals( + output_schema: &Schema, + mut locals: Vec, + ) -> PolarsResult { let mut group_idxs = Vec::new(); - let num_pipelines = self.local.len(); - let mut combined = self.local.pop().unwrap(); - for local in self.local { + let mut combined = locals.pop().unwrap(); + for local in locals { combined.grouper.combine(&*local.grouper, &mut group_idxs); for (l, r) in combined .grouped_reductions @@ -102,10 +108,73 @@ impl GroupBySinkState { out.with_column_unchecked(r.finalize()?.with_name(name.clone()).into_column()); } } - let mut source_node = InMemorySourceNode::new(Arc::new(out)); - source_node.initialize(num_pipelines); + Ok(out) + } + + fn into_source_parallel(self, output_schema: &Schema) -> PolarsResult { + let num_partitions = self.local.len(); + let seed = 0xdeadbeef; + let partitioned_locals: Vec<_> = self + .local + .into_par_iter() + .with_max_len(1) + .map(|local| { + let mut partition_idxs = Vec::new(); + let p_groupers = local + .grouper + .partition(seed, num_partitions, &mut partition_idxs); + let partition_sizes = p_groupers.iter().map(|g| g.num_groups()).collect_vec(); + let grouped_reductions_p = local + .grouped_reductions + .into_iter() + .map(|r| unsafe { r.partition(&partition_sizes, &partition_idxs) }) + .collect_vec(); + (p_groupers, grouped_reductions_p) + }) + .collect(); + + let frames = unsafe { + let mut partitioned_locals = ManuallyDrop::new(partitioned_locals); + let partitioned_locals_ptr = SyncPtr::new(partitioned_locals.as_mut_ptr()); + (0..num_partitions) + .into_par_iter() + .with_max_len(1) + .map(|p| { + let locals_in_p = (0..num_partitions) + .map(|l| { + let partitioned_local = &*partitioned_locals_ptr.get().add(l); + let (p_groupers, grouped_reductions_p) = partitioned_local; + LocalGroupBySinkState { + grouper: p_groupers.as_ptr().add(p).read(), + grouped_reductions: grouped_reductions_p + .iter() + .map(|r| r.as_ptr().add(p).read()) + .collect(), + } + }) + .collect(); + Self::combine_locals(output_schema, 
locals_in_p) + }) + .collect::>>() + }; + + let df = accumulate_dataframes_vertical_unchecked(frames?); + let mut source_node = InMemorySourceNode::new(Arc::new(df)); + source_node.initialize(num_partitions); Ok(source_node) } + + fn into_source(self, output_schema: &Schema) -> PolarsResult { + if std::env::var("POLARS_PARALLEL_GROUPBY_FINALIZE").as_deref() == Ok("1") { + self.into_source_parallel(output_schema) + } else { + let num_pipelines = self.local.len(); + let df = Self::combine_locals(output_schema, self.local); + let mut source_node = InMemorySourceNode::new(Arc::new(df?)); + source_node.initialize(num_pipelines); + Ok(source_node) + } + } } enum GroupByState { @@ -195,25 +264,25 @@ impl ComputeNode for GroupByNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], state: &'s ExecutionState, join_handles: &mut Vec>>, ) { - assert!(send.len() == 1 && recv.len() == 1); + assert!(send_ports.len() == 1 && recv_ports.len() == 1); match &mut self.state { GroupByState::Sink(sink) => { - assert!(send[0].is_none()); + assert!(send_ports[0].is_none()); sink.spawn( scope, - recv[0].take().unwrap().parallel(), + recv_ports[0].take().unwrap().parallel(), state, join_handles, ) }, GroupByState::Source(source) => { - assert!(recv[0].is_none()); - source.spawn(scope, &mut [], send, state, join_handles); + assert!(recv_ports[0].is_none()); + source.spawn(scope, &mut [], send_ports, state, join_handles); }, GroupByState::Done => unreachable!(), } diff --git a/crates/polars-stream/src/nodes/in_memory_map.rs b/crates/polars-stream/src/nodes/in_memory_map.rs index 3a8bff496a18..27af6be9aa87 100644 --- a/crates/polars-stream/src/nodes/in_memory_map.rs +++ b/crates/polars-stream/src/nodes/in_memory_map.rs @@ -86,16 +86,16 @@ impl ComputeNode for InMemoryMapNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], 
- send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], state: &'s ExecutionState, join_handles: &mut Vec>>, ) { match self { Self::Sink { sink_node, .. } => { - sink_node.spawn(scope, recv, &mut [], state, join_handles) + sink_node.spawn(scope, recv_ports, &mut [], state, join_handles) }, - Self::Source(source) => source.spawn(scope, &mut [], send, state, join_handles), + Self::Source(source) => source.spawn(scope, &mut [], send_ports, state, join_handles), Self::Done => unreachable!(), } } diff --git a/crates/polars-stream/src/nodes/in_memory_sink.rs b/crates/polars-stream/src/nodes/in_memory_sink.rs index afd6ccfd95cc..58d2f9e8ffe6 100644 --- a/crates/polars-stream/src/nodes/in_memory_sink.rs +++ b/crates/polars-stream/src/nodes/in_memory_sink.rs @@ -45,13 +45,13 @@ impl ComputeNode for InMemorySinkNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], _state: &'s ExecutionState, join_handles: &mut Vec>>, ) { - assert!(recv.len() == 1 && send.is_empty()); - let receivers = recv[0].take().unwrap().parallel(); + assert!(recv_ports.len() == 1 && send_ports.is_empty()); + let receivers = recv_ports[0].take().unwrap().parallel(); for mut recv in receivers { let slf = &*self; diff --git a/crates/polars-stream/src/nodes/in_memory_source.rs b/crates/polars-stream/src/nodes/in_memory_source.rs index 5ab6b0f75d50..c8dfec9d0032 100644 --- a/crates/polars-stream/src/nodes/in_memory_source.rs +++ b/crates/polars-stream/src/nodes/in_memory_source.rs @@ -60,13 +60,13 @@ impl ComputeNode for InMemorySourceNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], _state: &'s ExecutionState, join_handles: &mut Vec>>, ) { - assert!(recv.is_empty() && send.len() == 1); - let senders = 
send[0].take().unwrap().parallel(); + assert!(recv_ports.is_empty() && send_ports.len() == 1); + let senders = send_ports[0].take().unwrap().parallel(); let source = self.source.as_ref().unwrap(); // TODO: can this just be serial, using the work distributor? diff --git a/crates/polars-stream/src/nodes/input_independent_select.rs b/crates/polars-stream/src/nodes/input_independent_select.rs index f1a9113d05d4..9df4c1ab5281 100644 --- a/crates/polars-stream/src/nodes/input_independent_select.rs +++ b/crates/polars-stream/src/nodes/input_independent_select.rs @@ -36,13 +36,13 @@ impl ComputeNode for InputIndependentSelectNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], state: &'s ExecutionState, join_handles: &mut Vec>>, ) { - assert!(recv.is_empty() && send.len() == 1); - let mut sender = send[0].take().unwrap().serial(); + assert!(recv_ports.is_empty() && send_ports.len() == 1); + let mut sender = send_ports[0].take().unwrap().serial(); join_handles.push(scope.spawn_task(TaskPriority::Low, async move { let empty_df = DataFrame::empty(); diff --git a/crates/polars-stream/src/nodes/io_sinks/ipc.rs b/crates/polars-stream/src/nodes/io_sinks/ipc.rs index 6bc009480aa3..5587221d894c 100644 --- a/crates/polars-stream/src/nodes/io_sinks/ipc.rs +++ b/crates/polars-stream/src/nodes/io_sinks/ipc.rs @@ -61,14 +61,14 @@ impl ComputeNode for IpcSinkNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], _state: &'s ExecutionState, join_handles: &mut Vec>>, ) { - assert!(send.is_empty()); - assert!(recv.len() == 1); - let mut receiver = recv[0].take().unwrap().serial(); + assert!(send_ports.is_empty()); + assert!(recv_ports.len() == 1); + let mut receiver = recv_ports[0].take().unwrap().serial(); 
join_handles.push(scope.spawn_task(TaskPriority::High, async move { while let Ok(morsel) = receiver.recv().await { diff --git a/crates/polars-stream/src/nodes/map.rs b/crates/polars-stream/src/nodes/map.rs index 007dfa921672..c1994d1e4a9a 100644 --- a/crates/polars-stream/src/nodes/map.rs +++ b/crates/polars-stream/src/nodes/map.rs @@ -29,14 +29,14 @@ impl ComputeNode for MapNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], _state: &'s ExecutionState, join_handles: &mut Vec>>, ) { - assert!(recv.len() == 1 && send.len() == 1); - let receivers = recv[0].take().unwrap().parallel(); - let senders = send[0].take().unwrap().parallel(); + assert!(recv_ports.len() == 1 && send_ports.len() == 1); + let receivers = recv_ports[0].take().unwrap().parallel(); + let senders = send_ports[0].take().unwrap().parallel(); for (mut recv, mut send) in receivers.into_iter().zip(senders) { let slf = &*self; diff --git a/crates/polars-stream/src/nodes/mod.rs b/crates/polars-stream/src/nodes/mod.rs index e63c73f4939d..559e4717c4e9 100644 --- a/crates/polars-stream/src/nodes/mod.rs +++ b/crates/polars-stream/src/nodes/mod.rs @@ -63,8 +63,8 @@ pub trait ComputeNode: Send { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], state: &'s ExecutionState, join_handles: &mut Vec>>, ); diff --git a/crates/polars-stream/src/nodes/multiplexer.rs b/crates/polars-stream/src/nodes/multiplexer.rs index 65f2e752d28d..d4e4ac62cf01 100644 --- a/crates/polars-stream/src/nodes/multiplexer.rs +++ b/crates/polars-stream/src/nodes/multiplexer.rs @@ -92,13 +92,13 @@ impl ComputeNode for MultiplexerNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + 
send_ports: &mut [Option>], _state: &'s ExecutionState, join_handles: &mut Vec>>, ) { - assert!(recv.len() == 1 && !send.is_empty()); - assert!(self.buffers.len() == send.len()); + assert!(recv_ports.len() == 1 && !send_ports.is_empty()); + assert!(self.buffers.len() == send_ports.len()); enum Listener<'a> { Active(UnboundedSender), @@ -114,7 +114,7 @@ impl ComputeNode for MultiplexerNode { .enumerate() .map(|(port_idx, buffer)| { if let BufferedStream::Open(buf) = buffer { - if send[port_idx].is_some() { + if send_ports[port_idx].is_some() { // TODO: replace with a bounded channel and store data // out-of-core beyond a certain size. let (rx, tx) = unbounded_channel(); @@ -129,7 +129,7 @@ impl ComputeNode for MultiplexerNode { .unzip(); // TODO: parallel multiplexing. - if let Some(mut receiver) = recv[0].take().map(|r| r.serial()) { + if let Some(mut receiver) = recv_ports[0].take().map(|r| r.serial()) { let buffered_source_token = buffered_source_token.clone(); join_handles.push(scope.spawn_task(TaskPriority::High, async move { loop { @@ -176,7 +176,7 @@ impl ComputeNode for MultiplexerNode { })); } - for (send_port, opt_buf_recv) in send.iter_mut().zip(buf_receivers) { + for (send_port, opt_buf_recv) in send_ports.iter_mut().zip(buf_receivers) { if let Some((buf, mut rx)) = opt_buf_recv { let mut sender = send_port.take().unwrap().serial(); diff --git a/crates/polars-stream/src/nodes/ordered_union.rs b/crates/polars-stream/src/nodes/ordered_union.rs index 3c72d9cc6e15..cb65175292e2 100644 --- a/crates/polars-stream/src/nodes/ordered_union.rs +++ b/crates/polars-stream/src/nodes/ordered_union.rs @@ -52,15 +52,15 @@ impl ComputeNode for OrderedUnionNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], _state: &'s ExecutionState, join_handles: &mut Vec>>, ) { - let ready_count = recv.iter().filter(|r| r.is_some()).count(); - 
assert!(ready_count == 1 && send.len() == 1); - let receivers = recv[self.cur_input_idx].take().unwrap().parallel(); - let senders = send[0].take().unwrap().parallel(); + let ready_count = recv_ports.iter().filter(|r| r.is_some()).count(); + assert!(ready_count == 1 && send_ports.len() == 1); + let receivers = recv_ports[self.cur_input_idx].take().unwrap().parallel(); + let senders = send_ports[0].take().unwrap().parallel(); let mut inner_handles = Vec::new(); for (mut recv, mut send) in receivers.into_iter().zip(senders) { diff --git a/crates/polars-stream/src/nodes/parquet_source/init.rs b/crates/polars-stream/src/nodes/parquet_source/init.rs index 3187bbe797e4..a722186ff497 100644 --- a/crates/polars-stream/src/nodes/parquet_source/init.rs +++ b/crates/polars-stream/src/nodes/parquet_source/init.rs @@ -1,3 +1,4 @@ +use std::collections::VecDeque; use std::future::Future; use std::sync::Arc; @@ -14,7 +15,6 @@ use super::{AsyncTaskData, ParquetSourceNode}; use crate::async_executor; use crate::async_primitives::connector::connector; use crate::async_primitives::wait_group::{WaitGroup, WaitToken}; -use crate::morsel::get_ideal_morsel_size; use crate::nodes::{MorselSeq, TaskPriority}; use crate::utils::task_handles_ext; @@ -118,6 +118,8 @@ impl ParquetSourceNode { let row_group_decoder = self.init_row_group_decoder(); let row_group_decoder = Arc::new(row_group_decoder); + let ideal_morsel_size = self.config.ideal_morsel_size; + // Distributes morsels across pipelines. This does not perform any CPU or I/O bound work - // it is purely a dispatch loop. 
let raw_morsel_distributor_task_handle = io_runtime.spawn(async move { @@ -191,25 +193,31 @@ impl ParquetSourceNode { ); let morsel_seq_ref = &mut MorselSeq::default(); - let mut dfs = vec![].into_iter(); + let mut dfs = VecDeque::with_capacity(1); 'main: loop { let Some(mut indexed_wait_group) = wait_groups.next().await else { break; }; - if dfs.len() == 0 { + while dfs.is_empty() { let Some(v) = df_stream.next().await else { - break; + break 'main; }; - let v = v?; - assert!(!v.is_empty()); + let df = v?; + + if df.is_empty() { + continue; + } - dfs = v.into_iter(); + let (iter, n) = split_to_morsels(&df, ideal_morsel_size); + + dfs.reserve(n); + dfs.extend(iter); } - let mut df = dfs.next().unwrap(); + let mut df = dfs.pop_front().unwrap(); let morsel_seq = *morsel_seq_ref; *morsel_seq_ref = morsel_seq.successor(); @@ -270,7 +278,6 @@ impl ParquetSourceNode { let projected_arrow_schema = self.projected_arrow_schema.clone().unwrap(); let row_index = self.file_options.row_index.clone(); let physical_predicate = self.physical_predicate.clone(); - let ideal_morsel_size = get_ideal_morsel_size(); let min_values_per_thread = self.config.min_values_per_thread; let mut use_prefiltered = physical_predicate.is_some() @@ -348,7 +355,6 @@ impl ParquetSourceNode { predicate_arrow_field_indices, non_predicate_arrow_field_indices, predicate_arrow_field_mask, - ideal_morsel_size, min_values_per_thread, } } @@ -402,6 +408,28 @@ fn filtered_range(exclude: &[usize], len: usize) -> Vec { .collect() } +/// Note: The 2nd return is an upper bound on the number of morsels rather than an exact count. 
+fn split_to_morsels( + df: &DataFrame, + ideal_morsel_size: usize, +) -> (impl Iterator + '_, usize) { + let n_morsels = if df.height() > 3 * ideal_morsel_size / 2 { + // num_rows > (1.5 * ideal_morsel_size) + (df.height() / ideal_morsel_size).max(2) + } else { + 1 + }; + + let rows_per_morsel = 1 + df.height() / n_morsels; + + ( + (0..i64::try_from(df.height()).unwrap()) + .step_by(rows_per_morsel) + .map(move |offset| df.slice(offset, rows_per_morsel)), + n_morsels, + ) +} + mod tests { #[test] diff --git a/crates/polars-stream/src/nodes/parquet_source/mod.rs b/crates/polars-stream/src/nodes/parquet_source/mod.rs index 44fc4e1f1239..a5efa4cb3b89 100644 --- a/crates/polars-stream/src/nodes/parquet_source/mod.rs +++ b/crates/polars-stream/src/nodes/parquet_source/mod.rs @@ -18,7 +18,7 @@ use polars_plan::prelude::FileScanOptions; use super::compute_node_prelude::*; use super::{MorselSeq, TaskPriority}; use crate::async_primitives::wait_group::WaitToken; -use crate::morsel::SourceToken; +use crate::morsel::{get_ideal_morsel_size, SourceToken}; use crate::utils::task_handles_ext; mod init; @@ -70,6 +70,7 @@ struct Config { /// Minimum number of values for a parallel spawned task to process to amortize /// parallelism overhead. 
min_values_per_thread: usize, + ideal_morsel_size: usize, } #[allow(clippy::too_many_arguments)] @@ -110,6 +111,7 @@ impl ParquetSourceNode { metadata_decode_ahead_size: 0, row_group_prefetch_size: 0, min_values_per_thread: 0, + ideal_morsel_size: 0, }, verbose, physical_predicate: None, @@ -142,6 +144,7 @@ impl ComputeNode for ParquetSourceNode { let min_values_per_thread = std::env::var("POLARS_MIN_VALUES_PER_THREAD") .map(|x| x.parse::().expect("integer").max(1)) .unwrap_or(16_777_216); + let ideal_morsel_size = get_ideal_morsel_size(); Config { num_pipelines, @@ -149,6 +152,7 @@ impl ComputeNode for ParquetSourceNode { metadata_decode_ahead_size, row_group_prefetch_size, min_values_per_thread, + ideal_morsel_size, } }; @@ -198,18 +202,18 @@ impl ComputeNode for ParquetSourceNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], _state: &'s ExecutionState, join_handles: &mut Vec>>, ) { use std::sync::atomic::Ordering; - assert!(recv.is_empty()); - assert_eq!(send.len(), 1); + assert!(recv_ports.is_empty()); + assert_eq!(send_ports.len(), 1); assert!(!self.is_finished.load(Ordering::Relaxed)); - let morsel_senders = send[0].take().unwrap().parallel(); + let morsel_senders = send_ports[0].take().unwrap().parallel(); let mut async_task_data_guard = self.async_task_data.try_lock().unwrap(); let (raw_morsel_receivers, _) = async_task_data_guard.as_mut().unwrap(); diff --git a/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs b/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs index 975ff6de22cb..d31f1e51f71e 100644 --- a/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs +++ b/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs @@ -38,7 +38,6 @@ pub(super) struct RowGroupDecoder { pub(super) non_predicate_arrow_field_indices: Vec, /// The nth bit is set to `true` if the field at 
that index is used in the predicate. pub(super) predicate_arrow_field_mask: Vec, - pub(super) ideal_morsel_size: usize, pub(super) min_values_per_thread: usize, } @@ -46,7 +45,7 @@ impl RowGroupDecoder { pub(super) async fn row_group_data_to_df( &self, row_group_data: RowGroupData, - ) -> PolarsResult> { + ) -> PolarsResult { if self.use_prefiltered.is_some() { self.row_group_data_to_df_prefiltered(row_group_data).await } else { @@ -57,7 +56,7 @@ impl RowGroupDecoder { async fn row_group_data_to_df_impl( &self, row_group_data: RowGroupData, - ) -> PolarsResult> { + ) -> PolarsResult { let row_group_data = Arc::new(row_group_data); let out_width = self.row_index.is_some() as usize @@ -131,7 +130,7 @@ impl RowGroupDecoder { assert_eq!(df.width(), out_width); // `out_width` should have been calculated correctly - Ok(self.split_to_morsels(df)) + Ok(df) } async fn shared_file_state_init_func(&self, row_group_data: &RowGroupData) -> SharedFileState { @@ -308,26 +307,6 @@ impl RowGroupDecoder { Ok(()) } - - fn split_to_morsels(&self, df: DataFrame) -> Vec { - let n_morsels = if df.height() > 3 * self.ideal_morsel_size / 2 { - // num_rows > (1.5 * ideal_morsel_size) - (df.height() / self.ideal_morsel_size).max(2) - } else { - 1 - } as u64; - - if n_morsels == 1 { - return vec![df]; - } - - let rows_per_morsel = 1 + df.height() / n_morsels as usize; - - (0..i64::try_from(df.height()).unwrap()) - .step_by(rows_per_morsel) - .map(|offset| df.slice(offset, rows_per_morsel)) - .collect::>() - } } fn decode_column( @@ -478,7 +457,7 @@ impl RowGroupDecoder { async fn row_group_data_to_df_prefiltered( &self, row_group_data: RowGroupData, - ) -> PolarsResult> { + ) -> PolarsResult { debug_assert!(row_group_data.slice.is_none()); // Invariant of the optimizer. 
assert!(self.predicate_arrow_field_indices.len() <= self.projected_arrow_schema.len()); @@ -614,7 +593,7 @@ impl RowGroupDecoder { assert_eq!(dead_rem.len(), 0); let df = unsafe { DataFrame::new_no_checks(expected_num_rows, out_columns) }; - Ok(self.split_to_morsels(df)) + Ok(df) } } diff --git a/crates/polars-stream/src/nodes/reduce.rs b/crates/polars-stream/src/nodes/reduce.rs index ded581a4cf38..565854e97b81 100644 --- a/crates/polars-stream/src/nodes/reduce.rs +++ b/crates/polars-stream/src/nodes/reduce.rs @@ -162,24 +162,24 @@ impl ComputeNode for ReduceNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], state: &'s ExecutionState, join_handles: &mut Vec>>, ) { - assert!(send.len() == 1 && recv.len() == 1); + assert!(send_ports.len() == 1 && recv_ports.len() == 1); match &mut self.state { ReduceState::Sink { selectors, reductions, } => { - assert!(send[0].is_none()); - let recv_port = recv[0].take().unwrap(); + assert!(send_ports[0].is_none()); + let recv_port = recv_ports[0].take().unwrap(); Self::spawn_sink(selectors, reductions, scope, recv_port, state, join_handles) }, ReduceState::Source(df) => { - assert!(recv[0].is_none()); - let send_port = send[0].take().unwrap(); + assert!(recv_ports[0].is_none()); + let send_port = send_ports[0].take().unwrap(); Self::spawn_source(df, scope, send_port, join_handles) }, ReduceState::Done => unreachable!(), diff --git a/crates/polars-stream/src/nodes/select.rs b/crates/polars-stream/src/nodes/select.rs index 3b060e78e654..bf12904ff12c 100644 --- a/crates/polars-stream/src/nodes/select.rs +++ b/crates/polars-stream/src/nodes/select.rs @@ -36,14 +36,14 @@ impl ComputeNode for SelectNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], state: &'s ExecutionState, 
join_handles: &mut Vec>>, ) { - assert!(recv.len() == 1 && send.len() == 1); - let receivers = recv[0].take().unwrap().parallel(); - let senders = send[0].take().unwrap().parallel(); + assert!(recv_ports.len() == 1 && send_ports.len() == 1); + let receivers = recv_ports[0].take().unwrap().parallel(); + let senders = send_ports[0].take().unwrap().parallel(); for (mut recv, mut send) in receivers.into_iter().zip(senders) { let slf = &*self; diff --git a/crates/polars-stream/src/nodes/simple_projection.rs b/crates/polars-stream/src/nodes/simple_projection.rs index 95f002df2889..00cd8ed55ad0 100644 --- a/crates/polars-stream/src/nodes/simple_projection.rs +++ b/crates/polars-stream/src/nodes/simple_projection.rs @@ -33,14 +33,14 @@ impl ComputeNode for SimpleProjectionNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], _state: &'s ExecutionState, join_handles: &mut Vec>>, ) { - assert!(recv.len() == 1 && send.len() == 1); - let receivers = recv[0].take().unwrap().parallel(); - let senders = send[0].take().unwrap().parallel(); + assert!(recv_ports.len() == 1 && send_ports.len() == 1); + let receivers = recv_ports[0].take().unwrap().parallel(); + let senders = send_ports[0].take().unwrap().parallel(); for (mut recv, mut send) in receivers.into_iter().zip(senders) { let slf = &*self; diff --git a/crates/polars-stream/src/nodes/streaming_slice.rs b/crates/polars-stream/src/nodes/streaming_slice.rs index 950b39331588..5d9f5a003340 100644 --- a/crates/polars-stream/src/nodes/streaming_slice.rs +++ b/crates/polars-stream/src/nodes/streaming_slice.rs @@ -43,14 +43,14 @@ impl ComputeNode for StreamingSliceNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], _state: &'s ExecutionState, join_handles: &mut Vec>>, ) { - 
assert!(recv.len() == 1 && send.len() == 1); - let mut recv = recv[0].take().unwrap().serial(); - let mut send = send[0].take().unwrap().serial(); + assert!(recv_ports.len() == 1 && send_ports.len() == 1); + let mut recv = recv_ports[0].take().unwrap().serial(); + let mut send = send_ports[0].take().unwrap().serial(); join_handles.push(scope.spawn_task(TaskPriority::High, async move { let stop_offset = self.start_offset + self.length; diff --git a/crates/polars-stream/src/nodes/with_row_index.rs b/crates/polars-stream/src/nodes/with_row_index.rs index 942d23219fec..fe075120963d 100644 --- a/crates/polars-stream/src/nodes/with_row_index.rs +++ b/crates/polars-stream/src/nodes/with_row_index.rs @@ -35,14 +35,14 @@ impl ComputeNode for WithRowIndexNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], _state: &'s ExecutionState, join_handles: &mut Vec>>, ) { - assert!(recv.len() == 1 && send.len() == 1); - let mut receiver = recv[0].take().unwrap().serial(); - let senders = send[0].take().unwrap().parallel(); + assert!(recv_ports.len() == 1 && send_ports.len() == 1); + let mut receiver = recv_ports[0].take().unwrap().serial(); + let senders = send_ports[0].take().unwrap().parallel(); let (mut distributor, distr_receivers) = distributor_channel(senders.len(), DEFAULT_DISTRIBUTOR_BUFFER_SIZE); diff --git a/crates/polars-stream/src/nodes/zip.rs b/crates/polars-stream/src/nodes/zip.rs index cd72a3567442..614c7b506128 100644 --- a/crates/polars-stream/src/nodes/zip.rs +++ b/crates/polars-stream/src/nodes/zip.rs @@ -205,20 +205,20 @@ impl ComputeNode for ZipNode { fn spawn<'env, 's>( &'env mut self, scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], + recv_ports: &mut [Option>], + send_ports: &mut [Option>], _state: &'s ExecutionState, join_handles: &mut Vec>>, ) { - assert!(send.len() == 1); - 
assert!(!recv.is_empty()); - let mut sender = send[0].take().unwrap().serial(); + assert!(send_ports.len() == 1); + assert!(!recv_ports.is_empty()); + let mut sender = send_ports[0].take().unwrap().serial(); - let mut receivers = recv + let mut receivers = recv_ports .iter_mut() - .map(|r| { + .map(|recv_port| { // Add buffering to each receiver to reduce contention between input heads. - let mut serial_recv = r.take()?.serial(); + let mut serial_recv = recv_port.take()?.serial(); let (buf_send, buf_recv) = tokio::sync::mpsc::channel(DEFAULT_ZIP_HEAD_BUFFER_SIZE); join_handles.push(scope.spawn_task(TaskPriority::High, async move { while let Ok(morsel) = serial_recv.recv().await { diff --git a/crates/polars-stream/src/physical_plan/lower_expr.rs b/crates/polars-stream/src/physical_plan/lower_expr.rs index 2505e033aeec..3af80df16f9f 100644 --- a/crates/polars-stream/src/physical_plan/lower_expr.rs +++ b/crates/polars-stream/src/physical_plan/lower_expr.rs @@ -98,6 +98,7 @@ pub(crate) fn is_elementwise( match function { // Non-strict strptime must be done in-memory to ensure the format // is consistent across the entire dataframe. 
+ #[cfg(feature = "strings")] FunctionExpr::StringExpr(StringFunction::Strptime(_, opts)) => opts.strict, _ => { options.is_elementwise() diff --git a/crates/polars-stream/src/pipe.rs b/crates/polars-stream/src/pipe.rs index 019d8779d18a..21b6a5672618 100644 --- a/crates/polars-stream/src/pipe.rs +++ b/crates/polars-stream/src/pipe.rs @@ -20,7 +20,7 @@ pub enum PhysicalPipe { pub struct SendPort<'a>(&'a mut PhysicalPipe); pub struct RecvPort<'a>(&'a mut PhysicalPipe); -impl<'a> RecvPort<'a> { +impl RecvPort<'_> { pub fn serial(self) -> Receiver { let PhysicalPipe::Uninit(num_pipelines) = self.0 else { unreachable!() @@ -41,7 +41,7 @@ impl<'a> RecvPort<'a> { } } -impl<'a> SendPort<'a> { +impl SendPort<'_> { #[allow(unused)] pub fn is_receiver_serial(&self) -> bool { matches!(self.0, PhysicalPipe::SerialReceiver(..)) diff --git a/crates/polars-time/src/windows/duration.rs b/crates/polars-time/src/windows/duration.rs index 4f300f733100..56ce3e4bdcd5 100644 --- a/crates/polars-time/src/windows/duration.rs +++ b/crates/polars-time/src/windows/duration.rs @@ -153,7 +153,7 @@ impl Duration { /// # Panics /// If the given str is invalid for any reason. pub fn parse(duration: &str) -> Self { - Self::_parse(duration, false) + Self::try_parse(duration).unwrap() } #[doc(hidden)] @@ -161,23 +161,31 @@ impl Duration { /// units (such as 'year', 'minutes', etc.) and whitespace, as /// well as being case-insensitive. 
pub fn parse_interval(interval: &str) -> Self { + Self::try_parse_interval(interval).unwrap() + } + + pub fn try_parse(duration: &str) -> PolarsResult { + Self::_parse(duration, false) + } + + pub fn try_parse_interval(interval: &str) -> PolarsResult { Self::_parse(&interval.to_ascii_lowercase(), true) } - fn _parse(s: &str, as_interval: bool) -> Self { + fn _parse(s: &str, as_interval: bool) -> PolarsResult { let s = if as_interval { s.trim_start() } else { s }; let parse_type = if as_interval { "interval" } else { "duration" }; let num_minus_signs = s.matches('-').count(); if num_minus_signs > 1 { - panic!("{} string can only have a single minus sign", parse_type) + polars_bail!(InvalidOperation: "{} string can only have a single minus sign", parse_type); } if num_minus_signs > 0 { if as_interval { // TODO: intervals need to support per-element minus signs - panic!("minus signs are not currently supported in interval strings") + polars_bail!(InvalidOperation: "minus signs are not currently supported in interval strings"); } else if !s.starts_with('-') { - panic!("only a single minus sign is allowed, at the front of the string") + polars_bail!(InvalidOperation: "only a single minus sign is allowed, at the front of the string"); } } let mut months = 0; @@ -211,12 +219,12 @@ impl Duration { while let Some((i, mut ch)) = iter.next() { if !ch.is_ascii_digit() { - let n = s[start..i].parse::().unwrap_or_else(|_| { - panic!( + let Ok(n) = s[start..i].parse::() else { + polars_bail!(InvalidOperation: "expected leading integer in the {} string, found {}", parse_type, ch - ) - }); + ); + }; loop { match ch { @@ -233,10 +241,10 @@ impl Duration { } } if unit.is_empty() { - panic!( + polars_bail!(InvalidOperation: "expected a unit to follow integer in the {} string '{}'", parse_type, s - ) + ); } match &*unit { // matches that are allowed for both duration/interval @@ -270,24 +278,25 @@ impl Duration { "year" | "years" => months += n * 12, _ => { let valid_units = "'year', 
'month', 'quarter', 'week', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond', 'nanosecond'"; - panic!("unit: '{unit}' not supported; available units include: {} (and their plurals)", valid_units) + polars_bail!(InvalidOperation: "unit: '{unit}' not supported; available units include: {} (and their plurals)", valid_units); }, }, _ => { - panic!("unit: '{unit}' not supported; available units are: 'y', 'mo', 'q', 'w', 'd', 'h', 'm', 's', 'ms', 'us', 'ns'") + polars_bail!(InvalidOperation: "unit: '{unit}' not supported; available units are: 'y', 'mo', 'q', 'w', 'd', 'h', 'm', 's', 'ms', 'us', 'ns'"); }, } unit.clear(); } } - Duration { + + Ok(Duration { nsecs: nsecs.abs(), days: days.abs(), weeks: weeks.abs(), months: months.abs(), negative, parsed_int, - } + }) } fn to_positive(v: i64) -> (bool, i64) { diff --git a/crates/polars-time/src/windows/window.rs b/crates/polars-time/src/windows/window.rs index 90afe791e4d2..c7a29b846c58 100644 --- a/crates/polars-time/src/windows/window.rs +++ b/crates/polars-time/src/windows/window.rs @@ -316,7 +316,7 @@ impl<'a> BoundsIter<'a> { } } -impl<'a> Iterator for BoundsIter<'a> { +impl Iterator for BoundsIter<'_> { type Item = Bounds; fn next(&mut self) -> Option { diff --git a/crates/polars-utils/src/hashing.rs b/crates/polars-utils/src/hashing.rs index 12e59bf52f26..63f4c661a2c3 100644 --- a/crates/polars-utils/src/hashing.rs +++ b/crates/polars-utils/src/hashing.rs @@ -2,6 +2,11 @@ use std::hash::{Hash, Hasher}; use crate::nulls::IsNull; +pub const fn folded_multiply(a: u64, b: u64) -> u64 { + let full = (a as u128).wrapping_mul(b as u128); + (full as u64) ^ ((full >> 64) as u64) +} + /// Contains a byte slice and a precomputed hash for that string. /// During rehashes, we will rehash the hash instead of the string, that makes /// rehashing cheap and allows cache coherent small hash tables. 
@@ -33,13 +38,13 @@ impl<'a> IsNull for BytesHash<'a> { } } -impl<'a> Hash for BytesHash<'a> { +impl Hash for BytesHash<'_> { fn hash(&self, state: &mut H) { state.write_u64(self.hash) } } -impl<'a> PartialEq for BytesHash<'a> { +impl PartialEq for BytesHash<'_> { #[inline] fn eq(&self, other: &Self) -> bool { (self.hash == other.hash) && (self.payload == other.payload) @@ -94,7 +99,7 @@ impl DirtyHash for i128 { } } -impl<'a> DirtyHash for BytesHash<'a> { +impl DirtyHash for BytesHash<'_> { fn dirty_hash(&self) -> u64 { self.hash } diff --git a/crates/polars-utils/src/total_ord.rs b/crates/polars-utils/src/total_ord.rs index cfaa05f0141d..982dc707e3de 100644 --- a/crates/polars-utils/src/total_ord.rs +++ b/crates/polars-utils/src/total_ord.rs @@ -453,7 +453,7 @@ impl TotalOrd for (T, U) { } } -impl<'a> TotalHash for BytesHash<'a> { +impl TotalHash for BytesHash<'_> { #[inline(always)] fn tot_hash(&self, state: &mut H) where @@ -463,7 +463,7 @@ impl<'a> TotalHash for BytesHash<'a> { } } -impl<'a> TotalEq for BytesHash<'a> { +impl TotalEq for BytesHash<'_> { #[inline(always)] fn tot_eq(&self, other: &Self) -> bool { self == other diff --git a/crates/polars-utils/src/vec.rs b/crates/polars-utils/src/vec.rs index 108e7d573d1c..9060a348230c 100644 --- a/crates/polars-utils/src/vec.rs +++ b/crates/polars-utils/src/vec.rs @@ -20,7 +20,7 @@ impl IntoRawParts for Vec { } } -/// Fill current allocation if if > 0 +/// Fill current allocation if > 0 /// otherwise realloc pub trait ResizeFaster { fn fill_or_alloc(&mut self, new_len: usize, value: T); diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index f66762f041e6..685ed71d8306 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -235,7 +235,7 @@ true_div = ["polars-lazy?/true_div"] unique_counts = ["polars-ops/unique_counts", "polars-lazy?/unique_counts"] zip_with = ["polars-core/zip_with"] -bigidx = ["polars-core/bigidx", "polars-lazy?/bigidx", "polars-ops/big_idx"] +bigidx = 
["polars-core/bigidx", "polars-lazy?/bigidx", "polars-ops/big_idx", "polars-utils/bigidx"] polars_cloud = ["polars-lazy?/polars_cloud"] ir_serde = ["polars-plan/ir_serde"] diff --git a/crates/polars/tests/it/arrow/bitmap/utils/fmt.rs b/crates/polars/tests/it/arrow/bitmap/utils/fmt.rs index fa138cd528c9..08cfbf31c62e 100644 --- a/crates/polars/tests/it/arrow/bitmap/utils/fmt.rs +++ b/crates/polars/tests/it/arrow/bitmap/utils/fmt.rs @@ -2,7 +2,7 @@ use arrow::bitmap::utils::fmt; struct A<'a>(&'a [u8], usize, usize); -impl<'a> std::fmt::Debug for A<'a> { +impl std::fmt::Debug for A<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fmt(self.0, self.1, self.2, f) } diff --git a/crates/polars/tests/it/io/avro/read_async.rs b/crates/polars/tests/it/io/avro/read_async.rs index 049910c0ce28..d50fd7595c58 100644 --- a/crates/polars/tests/it/io/avro/read_async.rs +++ b/crates/polars/tests/it/io/avro/read_async.rs @@ -26,25 +26,16 @@ async fn test(codec: Codec) -> PolarsResult<()> { Ok(()) } -// Issue with clippy interacting with tokio. See: -// https://github.com/rust-lang/rust-clippy/issues/13458 -#[allow(clippy::needless_return)] #[tokio::test] async fn read_without_codec() -> PolarsResult<()> { test(Codec::Null).await } -// Issue with clippy interacting with tokio. See: -// https://github.com/rust-lang/rust-clippy/issues/13458 -#[allow(clippy::needless_return)] #[tokio::test] async fn read_deflate() -> PolarsResult<()> { test(Codec::Deflate).await } -// Issue with clippy interacting with tokio. 
See: -// https://github.com/rust-lang/rust-clippy/issues/13458 -#[allow(clippy::needless_return)] #[tokio::test] async fn read_snappy() -> PolarsResult<()> { test(Codec::Snappy).await diff --git a/crates/polars/tests/it/io/avro/write_async.rs b/crates/polars/tests/it/io/avro/write_async.rs index 1be109e2733a..77cb212f89db 100644 --- a/crates/polars/tests/it/io/avro/write_async.rs +++ b/crates/polars/tests/it/io/avro/write_async.rs @@ -42,9 +42,6 @@ async fn roundtrip(compression: Option) -> PolarsResult<()> { Ok(()) } -// Issue with clippy interacting with tokio. See: -// https://github.com/rust-lang/rust-clippy/issues/13458 -#[allow(clippy::needless_return)] #[tokio::test] async fn no_compression() -> PolarsResult<()> { roundtrip(None).await diff --git a/crates/polars/tests/it/io/parquet/read/primitive_nested.rs b/crates/polars/tests/it/io/parquet/read/primitive_nested.rs index 0c3a15922a24..430df46d1239 100644 --- a/crates/polars/tests/it/io/parquet/read/primitive_nested.rs +++ b/crates/polars/tests/it/io/parquet/read/primitive_nested.rs @@ -179,7 +179,7 @@ pub struct DecoderIter<'a, T: Unpackable> { pub(crate) unpacked_end: usize, } -impl<'a, T: Unpackable> Iterator for DecoderIter<'a, T> { +impl Iterator for DecoderIter<'_, T> { type Item = T; fn next(&mut self) -> Option { @@ -203,7 +203,7 @@ impl<'a, T: Unpackable> Iterator for DecoderIter<'a, T> { } } -impl<'a, T: Unpackable> ExactSizeIterator for DecoderIter<'a, T> {} +impl ExactSizeIterator for DecoderIter<'_, T> {} impl<'a, T: Unpackable> DecoderIter<'a, T> { pub fn new(packed: &'a [u8], num_bits: usize, length: usize) -> ParquetResult { diff --git a/docs/source/src/python/user-guide/io/cloud-storage.py b/docs/source/src/python/user-guide/io/cloud-storage.py index 73cf597ec84e..12b02df28e61 100644 --- a/docs/source/src/python/user-guide/io/cloud-storage.py +++ b/docs/source/src/python/user-guide/io/cloud-storage.py @@ -7,7 +7,16 @@ df = pl.read_parquet(source) # --8<-- [end:read_parquet] -# --8<-- 
[start:scan_parquet] +# --8<-- [start:scan_parquet_query] +import polars as pl + +source = "s3://bucket/*.parquet" + +df = pl.scan_parquet(source).filter(pl.col("id") < 100).select("id","value").collect() +# --8<-- [end:scan_parquet_query] + + +# --8<-- [start:scan_parquet_storage_options_aws] import polars as pl source = "s3://bucket/*.parquet" @@ -17,17 +26,42 @@ "aws_secret_access_key": "", "aws_region": "us-east-1", } -df = pl.scan_parquet(source, storage_options=storage_options) -# --8<-- [end:scan_parquet] +df = pl.scan_parquet(source, storage_options=storage_options).collect() +# --8<-- [end:scan_parquet_storage_options_aws] + +# --8<-- [start:credential_provider_class] +lf = pl.scan_parquet( + "s3://.../...", + credential_provider=pl.CredentialProviderAWS( + profile_name="..." + assume_role={ + "RoleArn": f"...", + "RoleSessionName": "...", + } + ), +) -# --8<-- [start:scan_parquet_query] -import polars as pl +df = lf.collect() +# --8<-- [end:credential_provider_class] -source = "s3://bucket/*.parquet" +# --8<-- [start:credential_provider_custom_func] +def get_credentials() -> pl.CredentialProviderFunctionReturn: + expiry = None + return { + "aws_access_key_id": "...", + "aws_secret_access_key": "...", + "aws_session_token": "...", + }, expiry -df = pl.scan_parquet(source).filter(pl.col("id") < 100).select("id","value").collect() -# --8<-- [end:scan_parquet_query] + +lf = pl.scan_parquet( + "s3://.../...", + credential_provider=get_credentials, +) + +df = lf.collect() +# --8<-- [end:credential_provider_custom_func] # --8<-- [start:scan_pyarrow_dataset] import polars as pl diff --git a/docs/source/src/rust/user-guide/io/cloud-storage.rs b/docs/source/src/rust/user-guide/io/cloud-storage.rs index 5c297739eeee..2df882a39c00 100644 --- a/docs/source/src/rust/user-guide/io/cloud-storage.rs +++ b/docs/source/src/rust/user-guide/io/cloud-storage.rs @@ -1,7 +1,3 @@ -// Issue with clippy interacting with tokio. 
See: -// https://github.com/rust-lang/rust-clippy/issues/13458 -#![allow(clippy::needless_return)] - // --8<-- [start:read_parquet] use aws_config::BehaviorVersion; use polars::prelude::*; @@ -31,12 +27,18 @@ async fn main() { } // --8<-- [end:read_parquet] -// --8<-- [start:scan_parquet] -// --8<-- [end:scan_parquet] - // --8<-- [start:scan_parquet_query] // --8<-- [end:scan_parquet_query] +// --8<-- [start:scan_parquet_storage_options_aws] +// --8<-- [end:scan_parquet_storage_options_aws] + +// --8<-- [start:credential_provider_class] +// --8<-- [end:credential_provider_class] + +// --8<-- [start:credential_provider_custom_func] +// --8<-- [end:credential_provider_custom_func] + // --8<-- [start:scan_pyarrow_dataset] // --8<-- [end:scan_pyarrow_dataset] diff --git a/docs/source/user-guide/io/cloud-storage.md b/docs/source/user-guide/io/cloud-storage.md index ba686a5a0f11..f3b5d7a8fb09 100644 --- a/docs/source/user-guide/io/cloud-storage.md +++ b/docs/source/user-guide/io/cloud-storage.md @@ -18,23 +18,39 @@ To read from cloud storage, additional dependencies may be needed depending on t ## Reading from cloud storage -Polars can read a CSV, IPC or Parquet file in eager mode from cloud storage. +Polars supports reading Parquet, CSV, IPC and NDJSON files from cloud storage: {{code_block('user-guide/io/cloud-storage','read_parquet',['read_parquet','read_csv','read_ipc'])}} -This eager query downloads the file to a buffer in memory and creates a `DataFrame` from there. Polars uses `fsspec` to manage this download internally for all cloud storage providers. - ## Scanning from cloud storage with query optimisation -Polars can scan a Parquet file in lazy mode from cloud storage. We may need to provide further details beyond the source url such as authentication details or storage region. Polars looks for these as environment variables but we can also do this manually by passing a `dict` as the `storage_options` argument. 
+Using `pl.scan_*` functions to read from cloud storage can benefit from [predicate and projection pushdowns](../lazy/optimizations.md), where the query optimizer will apply them before the file is downloaded. This can significantly reduce the amount of data that needs to be downloaded. The query evaluation is triggered by calling `collect`. -{{code_block('user-guide/io/cloud-storage','scan_parquet',['scan_parquet'])}} +{{code_block('user-guide/io/cloud-storage','scan_parquet_query',[])}} -This query creates a `LazyFrame` without downloading the file. In the `LazyFrame` we have access to file metadata such as the schema. Polars uses the `object_store.rs` library internally to manage the interface with the cloud storage providers and so no extra dependencies are required in Python to scan a cloud Parquet file. +## Cloud authentication -If we create a lazy query with [predicate and projection pushdowns](../lazy/optimizations.md), the query optimizer will apply them before the file is downloaded. This can significantly reduce the amount of data that needs to be downloaded. The query evaluation is triggered by calling `collect`. +Polars is able to automatically load default credential configurations for some cloud providers. For +cases when this does not happen, it is possible to manually configure the credentials for Polars to +use for authentication. This can be done in a few ways: -{{code_block('user-guide/io/cloud-storage','scan_parquet_query',[])}} +### Using `storage_options`: + +- Credentials can be passed as configuration keys in a dict with the `storage_options` parameter: + +{{code_block('user-guide/io/cloud-storage','scan_parquet_storage_options_aws',['scan_parquet'])}} + +### Using one of the available `CredentialProvider*` utility classes + +- There may be a utility class `pl.CredentialProvider*` that provides the required authentication functionality. 
For example, `pl.CredentialProviderAWS` supports selecting AWS profiles, as well as assuming an IAM role: + +{{code_block('user-guide/io/cloud-storage','credential_provider_class',['scan_parquet'])}} + +### Using a custom `credential_provider` function + +- Some environments may require custom authentication logic (e.g. AWS IAM role-chaining). For these cases a Python function can be provided for Polars to use to retrieve credentials: + +{{code_block('user-guide/io/cloud-storage','credential_provider_custom_func',['scan_parquet'])}} ## Scanning with PyArrow diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index f9087ea3ea1e..fc3e520e5ecc 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-polars" -version = "1.11.0" +version = "1.12.0" edition = "2021" [lib] diff --git a/py-polars/Makefile b/py-polars/Makefile index 3c98adab08cb..3d9e5d7ffddc 100644 --- a/py-polars/Makefile +++ b/py-polars/Makefile @@ -23,39 +23,23 @@ requirements-all: .venv ## Install/refresh all Python requirements (including t @$(MAKE) -s -C .. $@ .PHONY: build -build: .venv ## Compile and install Polars for development - @$(MAKE) -s -C .. $@ - -.PHONY: build-debug-opt -build-debug-opt: .venv ## Compile and install Polars with minimal optimizations turned on - @$(MAKE) -s -C .. $@ - -.PHONY: build-debug-opt-subset -build-debug-opt-subset: .venv ## Compile and install Polars with minimal optimizations turned on and no default features - @$(MAKE) -s -C .. $@ - -.PHONY: build-opt -build-opt: .venv ## Compile and install Polars with nearly full optimization on and debug assertions turned off, but with debug symbols on +build: .venv ## Compile and install Python Polars for development @$(MAKE) -s -C .. $@ .PHONY: build-release -build-release: .venv ## Compile and install a faster Polars binary with full optimizations - @$(MAKE) -s -C .. 
$@ - -.PHONY: build-native -build-native: .venv ## Same as build, except with native CPU optimizations turned on +build-release: .venv ## Compile and install Python Polars binary with optimizations, with minimal debug symbols @$(MAKE) -s -C .. $@ -.PHONY: build-debug-opt-native -build-debug-opt-native: .venv ## Same as build-debug-opt, except with native CPU optimizations turned on +.PHONY: build-nodebug-release +build-nodebug-release: .venv ## Same as build-release, but without any debug symbols at all (a bit faster to build) @$(MAKE) -s -C .. $@ -.PHONY: build-opt-native -build-opt-native: .venv ## Same as build-opt, except with native CPU optimizations turned on +.PHONY: build-debug-release +build-debug-release: .venv ## Same as build-release, but with full debug symbols turned on (a bit slower to build) @$(MAKE) -s -C .. $@ -.PHONY: build-release-native -build-release-native: .venv ## Same as build-release, except with native CPU optimizations turned on +.PHONY: build-dist-release +build-dist-release: .venv ## Compile and install Python Polars binary with super slow extra optimization turned on, for distribution @$(MAKE) -s -C .. 
$@ .PHONY: lint diff --git a/py-polars/polars/_typing.py b/py-polars/polars/_typing.py index da5c30ef996e..67a06a2c689e 100644 --- a/py-polars/polars/_typing.py +++ b/py-polars/polars/_typing.py @@ -160,9 +160,7 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: RollingInterpolationMethod: TypeAlias = Literal[ "nearest", "higher", "lower", "midpoint", "linear" ] # QuantileInterpolOptions -ToStructStrategy: TypeAlias = Literal[ - "first_non_null", "max_width" -] # ListToStructWidthStrategy +ListToStructWidthStrategy: TypeAlias = Literal["first_non_null", "max_width"] # The following have no equivalent on the Rust side ConcatMethod = Literal[ diff --git a/py-polars/polars/convert/general.py b/py-polars/polars/convert/general.py index 13adff9c4cd3..f80526e5a2c9 100644 --- a/py-polars/polars/convert/general.py +++ b/py-polars/polars/convert/general.py @@ -4,7 +4,7 @@ import itertools import re from collections.abc import Iterable, Sequence -from typing import TYPE_CHECKING, Any, overload +from typing import TYPE_CHECKING, Any, Literal, overload import polars._reexport as pl from polars import functions as F @@ -487,15 +487,26 @@ def from_pandas( @overload def from_pandas( - data: pd.Series[Any] | pd.Index[Any], + data: pd.Series[Any] | pd.Index[Any] | pd.DatetimeIndex, *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., - include_index: bool = ..., + include_index: Literal[False] = ..., ) -> Series: ... +@overload +def from_pandas( + data: pd.Series[Any], + *, + schema_overrides: SchemaDict | None = ..., + rechunk: bool = ..., + nan_to_null: bool = ..., + include_index: Literal[True] = ..., +) -> DataFrame: ... + + def from_pandas( data: pd.DataFrame | pd.Series[Any] | pd.Index[Any] | pd.DatetimeIndex, *, @@ -525,8 +536,8 @@ def from_pandas( Load any non-default pandas indexes as columns. .. 
note:: - If the input is a pandas ``Series`` or ``DataFrame`` and has a nameless - index which just enumerates the rows, then it will not be included in the + If the input is a pandas ``DataFrame`` and has a nameless index + which just enumerates the rows, then it will not be included in the result, regardless of this parameter. If you want to be sure to include it, please call ``.reset_index()`` prior to calling this function. @@ -566,6 +577,9 @@ def from_pandas( 3 ] """ + if include_index and isinstance(data, pd.Series): + data = data.reset_index() + if isinstance(data, (pd.Series, pd.Index, pd.DatetimeIndex)): return wrap_s(pandas_to_pyseries("", data, nan_to_null=nan_to_null)) elif isinstance(data, pd.DataFrame): @@ -724,6 +738,7 @@ def _from_dataframe_repr(m: re.Match[str]) -> DataFrame: if schema and data and (n_extend_cols := (len(schema) - len(data))) > 0: empty_data = [None] * len(data[0]) data.extend((pl.Series(empty_data, dtype=String)) for _ in range(n_extend_cols)) + for dtype in set(schema.values()): if dtype in (List, Struct, Object): msg = ( diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index ebcc0e6d3687..5e5730d23054 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -1979,7 +1979,7 @@ def to_jax( Create the Array on a specific GPU device: - >>> gpu_device = jax.devices("gpu")[1]) # doctest: +SKIP + >>> gpu_device = jax.devices("gpu")[1] # doctest: +SKIP >>> a = df.to_jax(device=gpu_device) # doctest: +SKIP >>> a.device() # doctest: +SKIP GpuDevice(id=1, process_index=0) diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 514c9205398b..4613aabd4beb 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -407,9 +407,16 @@ def to_physical(self) -> Expr: - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` - `List(inner)` -> 
`List(physical of inner)` + - `Array(inner)` -> `Array(physical of inner)` + - `Struct(fields)` -> `Struct(physical of fields)` Other data types will be left unchanged. + Warning + ------- + The physical representations are an implementation detail + and not guaranteed to be stable. + Examples -------- Replicating the pandas diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py index 48b4d1da9c49..4d239460d6b5 100644 --- a/py-polars/polars/expr/list.py +++ b/py-polars/polars/expr/list.py @@ -16,8 +16,8 @@ from polars._typing import ( IntoExpr, IntoExprColumn, + ListToStructWidthStrategy, NullBehavior, - ToStructStrategy, ) @@ -1092,7 +1092,7 @@ def to_array(self, width: int) -> Expr: def to_struct( self, - n_field_strategy: ToStructStrategy = "first_non_null", + n_field_strategy: ListToStructWidthStrategy = "first_non_null", fields: Sequence[str] | Callable[[int], str] | None = None, upper_bound: int = 0, ) -> Expr: @@ -1180,9 +1180,8 @@ def to_struct( [{'n': {'one': 0, 'two': 1}}, {'n': {'one': 2, 'two': 3}}] """ if isinstance(fields, Sequence): - field_names = list(fields) - pyexpr = self._pyexpr.list_to_struct(n_field_strategy, None, upper_bound) - return wrap_expr(pyexpr).struct.rename_fields(field_names) + pyexpr = self._pyexpr.list_to_struct_fixed_width(fields) + return wrap_expr(pyexpr) else: pyexpr = self._pyexpr.list_to_struct(n_field_strategy, fields, upper_bound) return wrap_expr(pyexpr) diff --git a/py-polars/polars/io/database/functions.py b/py-polars/polars/io/database/functions.py index aa686e2f813e..21e436dc0557 100644 --- a/py-polars/polars/io/database/functions.py +++ b/py-polars/polars/io/database/functions.py @@ -25,10 +25,12 @@ except ImportError: Selectable: TypeAlias = Any # type: ignore[no-redef] + from sqlalchemy.sql.elements import TextClause + @overload def read_database( - query: str | Selectable, + query: str | TextClause | Selectable, connection: ConnectionOrCursor | str, *, iter_batches: Literal[False] = ..., @@
-41,7 +43,7 @@ def read_database( @overload def read_database( - query: str | Selectable, + query: str | TextClause | Selectable, connection: ConnectionOrCursor | str, *, iter_batches: Literal[True], @@ -54,7 +56,7 @@ def read_database( @overload def read_database( - query: str | Selectable, + query: str | TextClause | Selectable, connection: ConnectionOrCursor | str, *, iter_batches: bool, @@ -66,7 +68,7 @@ def read_database( def read_database( - query: str | Selectable, + query: str | TextClause | Selectable, connection: ConnectionOrCursor | str, *, iter_batches: bool = False, diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index cf70f5225f56..0c4b08982606 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -14,8 +14,8 @@ from polars._typing import ( IntoExpr, IntoExprColumn, + ListToStructWidthStrategy, NullBehavior, - ToStructStrategy, ) from polars.polars import PySeries @@ -855,7 +855,7 @@ def to_array(self, width: int) -> Series: def to_struct( self, - n_field_strategy: ToStructStrategy = "first_non_null", + n_field_strategy: ListToStructWidthStrategy = "first_non_null", fields: Callable[[int], str] | Sequence[str] | None = None, ) -> Series: """ diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index ea37a64aa778..8e27b3470b16 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -4047,8 +4047,15 @@ def to_physical(self) -> Series: - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` - `List(inner)` -> `List(physical of inner)` + - `Array(inner)` -> `Array(physical of inner)` + - `Struct(fields)` -> `Struct(physical of fields)` - Other data types will be left unchanged. + Warning + ------- + The physical representations are an implementation detail + and not guaranteed to be stable. 
+ Examples -------- Replicating the pandas diff --git a/py-polars/tests/unit/functions/range/test_date_range.py b/py-polars/tests/unit/functions/range/test_date_range.py index 0531ab1878d5..a88287bedf41 100644 --- a/py-polars/tests/unit/functions/range/test_date_range.py +++ b/py-polars/tests/unit/functions/range/test_date_range.py @@ -7,7 +7,7 @@ import pytest import polars as pl -from polars.exceptions import ComputeError, PanicException +from polars.exceptions import ComputeError, InvalidOperationError from polars.testing import assert_frame_equal, assert_series_equal if TYPE_CHECKING: @@ -21,7 +21,7 @@ def test_date_range() -> None: def test_date_range_invalid_time_unit() -> None: - with pytest.raises(PanicException, match="'x' not supported"): + with pytest.raises(InvalidOperationError, match="'x' not supported"): pl.date_range( start=date(2021, 12, 16), end=date(2021, 12, 18), diff --git a/py-polars/tests/unit/functions/range/test_datetime_range.py b/py-polars/tests/unit/functions/range/test_datetime_range.py index 8dbc9da15c29..ce99cd27b802 100644 --- a/py-polars/tests/unit/functions/range/test_datetime_range.py +++ b/py-polars/tests/unit/functions/range/test_datetime_range.py @@ -9,7 +9,7 @@ import polars as pl from polars.datatypes import DTYPE_TEMPORAL_UNITS -from polars.exceptions import ComputeError, PanicException, SchemaError +from polars.exceptions import ComputeError, InvalidOperationError, SchemaError from polars.testing import assert_frame_equal, assert_series_equal if TYPE_CHECKING: @@ -96,7 +96,7 @@ def test_datetime_range_precision( def test_datetime_range_invalid_time_unit() -> None: - with pytest.raises(PanicException, match="'x' not supported"): + with pytest.raises(InvalidOperationError, match="'x' not supported"): pl.datetime_range( start=datetime(2021, 12, 16), end=datetime(2021, 12, 16, 3), diff --git a/py-polars/tests/unit/interop/test_from_pandas.py b/py-polars/tests/unit/interop/test_from_pandas.py index aa0ab7e8210a..50ef4f2ac0ad 
100644 --- a/py-polars/tests/unit/interop/test_from_pandas.py +++ b/py-polars/tests/unit/interop/test_from_pandas.py @@ -190,6 +190,18 @@ def test_from_pandas_include_indexes() -> None: assert df.to_dict(as_series=False) == data +def test_from_pandas_series_include_indexes() -> None: + # no default index + pd_series = pd.Series({"a": 1, "b": 2}, name="number").rename_axis(["letter"]) + df = pl.from_pandas(pd_series, include_index=True) + assert df.to_dict(as_series=False) == {"letter": ["a", "b"], "number": [1, 2]} + + # default index + pd_series = pd.Series(range(2)) + df = pl.from_pandas(pd_series, include_index=True) + assert df.to_dict(as_series=False) == {"index": [0, 1], "0": [0, 1]} + + def test_duplicate_cols_diff_types() -> None: df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["0", 0, "1", 1]) with pytest.raises( diff --git a/py-polars/tests/unit/io/database/test_read.py b/py-polars/tests/unit/io/database/test_read.py index fb6fa8dca0ed..69e7853172a1 100644 --- a/py-polars/tests/unit/io/database/test_read.py +++ b/py-polars/tests/unit/io/database/test_read.py @@ -12,7 +12,7 @@ import pyarrow as pa import pytest import sqlalchemy -from sqlalchemy import Integer, MetaData, Table, create_engine, func, select +from sqlalchemy import Integer, MetaData, Table, create_engine, func, select, text from sqlalchemy.orm import sessionmaker from sqlalchemy.sql.expression import cast as alchemy_cast @@ -383,6 +383,39 @@ def test_read_database_alchemy_selectable(tmp_sqlite_db: Path) -> None: assert_frame_equal(batches[0], expected) +def test_read_database_alchemy_textclause(tmp_sqlite_db: Path) -> None: + # various flavours of alchemy connection + alchemy_engine = create_engine(f"sqlite:///{tmp_sqlite_db}") + alchemy_session: ConnectionOrCursor = sessionmaker(bind=alchemy_engine)() + alchemy_conn: ConnectionOrCursor = alchemy_engine.connect() + + # establish sqlalchemy "textclause" and validate usage + textclause_query = text(""" + SELECT 
CAST(STRFTIME('%Y',"date") AS INT) as "year", name, value + FROM test_data + WHERE value < 0 + """) + + expected = pl.DataFrame({"year": [2021], "name": ["other"], "value": [-99.5]}) + + for conn in (alchemy_session, alchemy_engine, alchemy_conn): + assert_frame_equal( + pl.read_database(textclause_query, connection=conn), + expected, + ) + + batches = list( + pl.read_database( + textclause_query, + connection=conn, + iter_batches=True, + batch_size=1, + ) + ) + assert len(batches) == 1 + assert_frame_equal(batches[0], expected) + + def test_read_database_parameterised(tmp_sqlite_db: Path) -> None: # raw cursor "execute" only takes positional params, alchemy cursor takes kwargs alchemy_engine = create_engine(f"sqlite:///{tmp_sqlite_db}") diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 564c20aadde2..bf9d8ac4fad8 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -1,9 +1,10 @@ from __future__ import annotations +import decimal import io from datetime import datetime, time, timezone from decimal import Decimal -from typing import IO, TYPE_CHECKING, Any, Literal, cast +from typing import IO, TYPE_CHECKING, Any, Callable, Literal, cast import fsspec import numpy as np @@ -1255,9 +1256,6 @@ def test_parquet_list_element_field_name() -> None: assert "child 0, element: int64" in schema_str -@pytest.mark.skip( - reason="TODO: fix Parquet writing Decimal-in-struct with precision > 18" -) def test_nested_decimal() -> None: df = pl.DataFrame( { @@ -1995,6 +1993,49 @@ def test_nested_nonnullable_19158() -> None: assert_frame_equal(pl.read_parquet(f), pl.DataFrame(tbl)) +D = Decimal + + +@pytest.mark.parametrize("precision", range(1, 37, 2)) +@pytest.mark.parametrize( + "nesting", + [ + # Struct + lambda t: ([{"x": None}, None], pl.Struct({"x": t})), + lambda t: ([None, {"x": None}], pl.Struct({"x": t})), + lambda t: ([{"x": D("1.5")}, None], pl.Struct({"x": t})), + lambda t: 
([{"x": D("1.5")}, {"x": D("4.8")}], pl.Struct({"x": t})), + # Array + lambda t: ([[None, None, D("8.2")], None], pl.Array(t, 3)), + lambda t: ([None, [None, D("8.9"), None]], pl.Array(t, 3)), + lambda t: ([[D("1.5"), D("3.7"), D("4.1")], None], pl.Array(t, 3)), + lambda t: ( + [[D("1.5"), D("3.7"), D("4.1")], [D("2.8"), D("5.2"), D("8.9")]], + pl.Array(t, 3), + ), + # List + lambda t: ([[None, D("8.2")], None], pl.List(t)), + lambda t: ([None, [D("8.9"), None]], pl.List(t)), + lambda t: ([[D("1.5"), D("4.1")], None], pl.List(t)), + lambda t: ([[D("1.5"), D("3.7"), D("4.1")], [D("2.8"), D("8.9")]], pl.List(t)), + ], +) +def test_decimal_precision_nested_roundtrip( + nesting: Callable[[pl.DataType], tuple[list[Any], pl.DataType]], + precision: int, +) -> None: + # Limit the context as to not disturb any other tests + with decimal.localcontext() as ctx: + ctx.prec = precision + + decimal_dtype = pl.Decimal(precision=precision) + values, dtype = nesting(decimal_dtype) + + df = pl.Series("a", values, dtype).to_frame() + + test_round_trip(df) + + @pytest.mark.parametrize("parallel", ["prefiltered", "columns", "row_groups", "auto"]) def test_conserve_sortedness( monkeypatch: Any, capfd: Any, parallel: pl.ParallelStrategy diff --git a/py-polars/tests/unit/operations/namespaces/list/test_list.py b/py-polars/tests/unit/operations/namespaces/list/test_list.py index 03642fce41ed..1264a5ed8773 100644 --- a/py-polars/tests/unit/operations/namespaces/list/test_list.py +++ b/py-polars/tests/unit/operations/namespaces/list/test_list.py @@ -7,7 +7,11 @@ import pytest import polars as pl -from polars.exceptions import ComputeError, OutOfBoundsError, SchemaError +from polars.exceptions import ( + ComputeError, + OutOfBoundsError, + SchemaError, +) from polars.testing import assert_frame_equal, assert_series_equal @@ -653,6 +657,26 @@ def test_list_to_struct() -> None: {"n": {"one": 0, "two": 1, "three": None}}, ] + q = df.lazy().select( + pl.col("n").list.to_struct(fields=["a", 
"b"]).struct.field("a") + ) + + assert_frame_equal(q.collect(), pl.DataFrame({"a": [0, 0]})) + + # Check that: + # * Specifying an upper bound calls the field name getter function to + # retrieve the lazy schema + # * The upper bound is respected during execution + q = df.lazy().select( + pl.col("n").list.to_struct(fields=str, upper_bound=2).struct.unnest() + ) + assert q.collect_schema() == {"0": pl.Int64, "1": pl.Int64} + assert_frame_equal(q.collect(), pl.DataFrame({"0": [0, 0], "1": [1, 1]})) + + assert df.lazy().select(pl.col("n").list.to_struct()).collect_schema() == { + "n": pl.Unknown + } + def test_select_from_list_to_struct_11143() -> None: ldf = pl.LazyFrame({"some_col": [[1.0, 2.0], [1.5, 3.0]]}) diff --git a/py-polars/tests/unit/operations/test_cast.py b/py-polars/tests/unit/operations/test_cast.py index 9b7d3322baca..dca9eeb3e767 100644 --- a/py-polars/tests/unit/operations/test_cast.py +++ b/py-polars/tests/unit/operations/test_cast.py @@ -672,3 +672,9 @@ def test_cast_consistency() -> None: assert pl.DataFrame().with_columns(a=pl.lit(0.0)).with_columns( b=pl.col("a").cast(pl.String), c=pl.lit(0.0).cast(pl.String) ).to_dict(as_series=False) == {"a": [0.0], "b": ["0.0"], "c": ["0.0"]} + + +def test_cast_int_to_string_unsets_sorted_flag_19424() -> None: + s = pl.Series([1, 2]).set_sorted() + assert s.flags["SORTED_ASC"] + assert not s.cast(pl.String).flags["SORTED_ASC"] diff --git a/py-polars/tests/unit/operations/test_gather.py b/py-polars/tests/unit/operations/test_gather.py index 595b2bfee246..4c74f60a7ff9 100644 --- a/py-polars/tests/unit/operations/test_gather.py +++ b/py-polars/tests/unit/operations/test_gather.py @@ -188,3 +188,16 @@ def test_gather_array() -> None: v = s[[0, 1, None, 3]] # type: ignore[list-item] assert v[2] is None + + +def test_gather_array_outer_validity_19482() -> None: + s = ( + pl.Series([[1], [1]], dtype=pl.Array(pl.Int64, 1)) + .to_frame() + .select(pl.when(pl.int_range(pl.len()) == 0).then(pl.first())) + .to_series() + 
) + + expect = pl.Series([[1], None], dtype=pl.Array(pl.Int64, 1)) + assert_series_equal(s, expect) + assert_series_equal(s.gather([0, 1]), expect) diff --git a/py-polars/tests/unit/series/test_scatter.py b/py-polars/tests/unit/series/test_scatter.py index c3c0b38d6805..95e4aa5b31e3 100644 --- a/py-polars/tests/unit/series/test_scatter.py +++ b/py-polars/tests/unit/series/test_scatter.py @@ -43,7 +43,7 @@ def test_scatter() -> None: assert s.to_list() == ["a", "x", "x"] assert s.scatter([0, 2], 0.12345).to_list() == ["0.12345", "x", "0.12345"] - # set multiple values values + # set multiple values s = pl.Series(["z", "z", "z"]) assert s.scatter([0, 1], ["a", "b"]).to_list() == ["a", "b", "z"] s = pl.Series([True, False, True]) diff --git a/py-polars/tests/unit/sql/test_miscellaneous.py b/py-polars/tests/unit/sql/test_miscellaneous.py index 95ba8461bebe..f7d0615e13c6 100644 --- a/py-polars/tests/unit/sql/test_miscellaneous.py +++ b/py-polars/tests/unit/sql/test_miscellaneous.py @@ -7,7 +7,7 @@ import pytest import polars as pl -from polars.exceptions import SQLInterfaceError, SQLSyntaxError +from polars.exceptions import ColumnNotFoundError, SQLInterfaceError, SQLSyntaxError from polars.testing import assert_frame_equal if TYPE_CHECKING: @@ -362,3 +362,26 @@ def test_global_variable_inference_17398() -> None: eager=True, ) assert_frame_equal(res, users) + + +@pytest.mark.parametrize( + "query", + [ + "SELECT invalid_column FROM self", + "SELECT key, invalid_column FROM self", + "SELECT invalid_column * 2 FROM self", + "SELECT * FROM self ORDER BY invalid_column", + "SELECT * FROM self WHERE invalid_column = 200", + "SELECT * FROM self WHERE invalid_column = '200'", + "SELECT key, SUM(n) AS sum_n FROM self GROUP BY invalid_column", + ], +) +def test_invalid_cols(query: str) -> None: + df = pl.DataFrame( + { + "key": ["xx", "xx", "yy"], + "n": ["100", "200", "300"], + } + ) + with pytest.raises(ColumnNotFoundError, match="invalid_column"): + df.sql(query) diff --git 
a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py index c730ee8d30a7..0ca504d20e20 100644 --- a/py-polars/tests/unit/test_errors.py +++ b/py-polars/tests/unit/test_errors.py @@ -16,7 +16,6 @@ ComputeError, InvalidOperationError, OutOfBoundsError, - PanicException, SchemaError, SchemaFieldNotFoundError, ShapeError, @@ -116,7 +115,7 @@ def test_string_numeric_comp_err() -> None: def test_panic_error() -> None: with pytest.raises( - PanicException, + InvalidOperationError, match="unit: 'k' not supported", ): pl.datetime_range( @@ -696,7 +695,7 @@ def test_no_panic_pandas_nat() -> None: def test_list_to_struct_invalid_type() -> None: - with pytest.raises(pl.exceptions.SchemaError): + with pytest.raises(pl.exceptions.InvalidOperationError): pl.DataFrame({"a": 1}).select(pl.col("a").list.to_struct()) @@ -708,3 +707,15 @@ def test_raise_invalid_agg() -> None: .group_by("index") .agg(pl.col("foo").filter(pl.col("i_do_not_exist"))) ).collect() + + +def test_err_mean_horizontal_lists() -> None: + df = pl.DataFrame( + { + "experiment_id": [1, 2], + "sensor1": [[1, 2, 3], [7, 8, 9]], + "sensor2": [[4, 5, 6], [10, 11, 12]], + } + ) + with pytest.raises(pl.exceptions.InvalidOperationError): + df.with_columns(pl.mean_horizontal("sensor1", "sensor2").alias("avg_sensor")) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 179cd9e58d86..90221c3e2edb 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,2 +1,2 @@ [toolchain] -channel = "nightly-2024-09-29" +channel = "nightly-2024-10-28"