diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..3550a30 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use flake diff --git a/.github/Dockerfile.x86_64-unknown-linux-gnu b/.github/Dockerfile.x86_64-unknown-linux-gnu new file mode 100644 index 0000000..9f777ac --- /dev/null +++ b/.github/Dockerfile.x86_64-unknown-linux-gnu @@ -0,0 +1,10 @@ +FROM quay.io/pypa/manylinux2014_x86_64:latest + +RUN echo -e '#!/usr/bin/env bash\nsource scl_source enable llvm-toolset-7\nexec "$@"' > /entrypoint.sh && chmod +x /entrypoint.sh +RUN yum install -y llvm-toolset-7 ragel && yum clean all +RUN curl -L https://boostorg.jfrog.io/artifactory/main/release/1.80.0/source/boost_1_80_0.tar.gz -o boost.tar.gz \ + && test 4b2136f98bdd1f5857f1c3dea9ac2018effe65286cf251534b6ae20cc45e1847 == $(sha256sum -b boost.tar.gz | cut -c1-64) \ + || { echo "Checksum mismatch"; exit 11; } \ + && tar --strip-components 1 -xf boost.tar.gz -C /usr/include \ + && rm boost.tar.gz +ENTRYPOINT ["/entrypoint.sh"] diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..7f92ae2 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,247 @@ +name: CI + +on: + push: + branches: + - main + tags: + - v*.*.* + pull_request: + +jobs: + checks: + name: Checks + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up cache + uses: actions/cache@v3 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + ~/.cache/pre-commit + target/ + key: checks-cargo-${{ hashFiles('**/Cargo.lock') }} + restore-keys: checks-cargo- + - name: Python code format + uses: psf/black@stable + - name: Installing nightly Rust toolchain + uses: dtolnay/rust-toolchain@nightly + with: + components: clippy, rustfmt + - name: Installing dependencies + run: | + sudo apt install -y libhyperscan-dev libhyperscan5 + pip install '.[test]' + - uses: pre-commit-ci/lite-action@v1.0.0 + - uses: pre-commit/action@v3.0.0 + - name: Rust code checks + run: | + rm -f rust-toolchain.toml + cargo fmt --all -- --check + cargo clippy -- -D warnings + - name: Python type check + run: mypy . 
--exclude hyperscan-sys + - name: Python tests + uses: dariocurr/pytest-summary@main + + build-nix: + needs: + - checks + strategy: + matrix: + include: + - os: ubuntu-latest + arch: x86_64-linux + drv: "shared" + - os: ubuntu-latest + arch: x86_64-linux + drv: "hyperscan" + - os: ubuntu-latest + arch: x86_64-linux + drv: "vectorscan" + - os: macos-latest + arch: x86_64-darwin + drv: vectorscan + name: Nix - ${{ matrix.arch }}.${{ matrix.drv }} + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - uses: cachix/install-nix-action@v18 + - uses: cachix/cachix-action@v12 + with: + name: vlaci + authToken: "${{ secrets.CACHIX_AUTH_TOKEN }}" + - name: Build nix shell + run: | + nix build -L .#devShell.${{ matrix.arch }} + - name: Build nix package + run: nix build -L .?submodules=1#defaultPackage.${{ matrix.arch }}.${{ matrix.drv }} + + prepare-build-env: + needs: + - checks + strategy: + matrix: + include: + - target: x86_64-unknown-linux-gnu + name: Build Docker env - ${{ matrix.target }} + runs-on: ubuntu-latest + outputs: + image: ${{ steps.build.outputs.image }} + tag: ${{ steps.build.outputs.tag }} + steps: + - uses: actions/checkout@v3 + - name: Prepare build + id: build + if: ${{ runner.os == 'Linux' }} + run: | + IMAGE=ghcr.io/vlaci/pyperscan-builder + TAG=${{ hashFiles(format('.github/Dockerfile.{0}', matrix.target)) }} + REF=$IMAGE-${{ matrix.target }}:$TAG + echo ref=$REF > $GITHUB_ENV + echo image=$IMAGE > $GITHUB_OUTPUT + echo tag=$TAG >> $GITHUB_OUTPUT + if ! docker manifest inspect $REF; then + echo build=true >> $GITHUB_ENV + fi + - name: Login to GitHub Container Registry + if: ${{ env.build }} + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Build and push + if: ${{ env.build }} + uses: docker/build-push-action@v3 + with: + file: ./.github/Dockerfile.${{ matrix.target }} + push: true + tags: ${{ env.ref }} + + sdist: + name: Build - sdist + runs-on: ubuntu-latest + needs: + - checks + steps: + - uses: actions/checkout@v3 + - name: Set up cargo cache + uses: actions/cache@v3 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: sdist-cargo-${{ hashFiles('**/Cargo.lock') }} + restore-keys: sdist-cargo- + - uses: PyO3/maturin-action@v1 + with: + container: off + command: sdist + args: -o dist + - name: Upload sdist + uses: actions/upload-artifact@v3 + with: + name: sdist + path: dist + + build-wheels: + needs: + - prepare-build-env + strategy: + matrix: + include: + - os: ubuntu-latest + maturin: "-F hyperscan" + target: x86_64-unknown-linux-gnu + kind: static-hyperscan + - os: ubuntu-latest + maturin: "-F vectorscan" + target: x86_64-unknown-linux-gnu + kind: static-vectorscan + - os: macos-latest + target: universal2-apple-darwin + maturin: "-F vectorscan --universal2" + kind: static-vectorscan + name: Build - ${{ matrix.target }} ${{ matrix.kind }} + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: Set up cargo cache + uses: actions/cache@v3 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ matrix.target }}-${{ matrix.kind }}-cargo-${{ hashFiles('**/Cargo.lock') }} + restore-keys: ${{ matrix.target }}-${{ matrix.kind }}-cargo- + - name: Install dependencies + if: ${{ runner.os == 'macOS' }} + run: | + brew install boost ragel + echo container= > 
$GITHUB_ENV + - name: Building builder container + if: ${{ runner.os == 'Linux' }} + run: | + CONTAINER=${{ needs.prepare-build-env.outputs.image }}-${{ matrix.target }}:${{ needs.prepare-build-env.outputs.tag }} + echo container=$CONTAINER > $GITHUB_ENV + - name: Disable tools + run: | + rm hyperscan-sys/*/tools/CMakeLists.txt + - uses: PyO3/maturin-action@v1 + with: + container: ${{ env.container }} + command: build + args: --release --strip -o dist -vv ${{ matrix.maturin }} + - uses: actions/setup-python@v4 + with: + python-version: "3.8" + - name: Installing built wheel + run: | + whl=(dist/pyperscan-*.whl) + pip install "$whl[test]" + - uses: dariocurr/pytest-summary@main + - name: Upload wheels + uses: actions/upload-artifact@v3 + with: + name: wheels-${{ matrix.target }}.${{ matrix.kind }} + path: dist + + release: + name: Release + runs-on: ubuntu-latest + if: "startsWith(github.ref, 'refs/tags/')" + needs: + - sdist + - build-wheels + steps: + - uses: actions/download-artifact@v3 + with: + name: sdist + - uses: actions/download-artifact@v3 + with: + name: wheels-x86_64-unknown-linux-gnu.static-vectorscan + - uses: actions/download-artifact@v3 + with: + name: wheels-universal2-apple-darwin.static-vectorscan + - name: Publish to PyPI + uses: messense/maturin-action@v1 + env: + MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} + with: + command: upload + args: --skip-existing * diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c8f0442 --- /dev/null +++ b/.gitignore @@ -0,0 +1,72 @@ +/target + +# Byte-compiled / optimized / DLL files +__pycache__/ +.pytest_cache/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +.venv/ +env/ +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +include/ +man/ +venv/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt +pip-selfcheck.json + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Translations +*.mo + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + +# Rope +.ropeproject + +# Django stuff: +*.log +*.pot + +.DS_Store + +# Sphinx documentation +docs/_build/ + +# PyCharm +.idea/ + +# VSCode +.vscode/ + +# Pyenv +.python-version diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..e7685f0 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "hyperscan-src/vectorscan"] + path = hyperscan-sys/vectorscan + url = https://github.com/VectorCamp/vectorscan +[submodule "hyperscan-src/hyperscan"] + path = hyperscan-sys/hyperscan + url = https://git.sr.ht/~vlaci/hyperscan diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..3060199 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,59 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: trailing-whitespace + exclude: ".*\\.md" + - id: end-of-file-fixer + - id: check-toml + - id: check-yaml + - id: check-added-large-files + + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v2.7.1 + hooks: + - id: prettier + + - repo: https://github.com/python-jsonschema/check-jsonschema + rev: 0.19.2 + hooks: + - id: check-github-workflows + + - repo: https://github.com/psf/black + rev: 22.10.0 + hooks: + - id: black + name: Check black + + - repo: https://github.com/asottile/pyupgrade + rev: v3.2.2 + hooks: + - id: pyupgrade + args: [--py38-plus] + + - repo: https://github.com/PyCQA/isort + rev: 5.10.1 + 
hooks: + - id: isort + name: Check isort + + - repo: https://github.com/pre-commit/mirrors-mypy + rev: "v0.991" + hooks: + - id: mypy + + - repo: https://github.com/DanielNoord/pydocstringformatter + rev: v0.7.2 + hooks: + - id: pydocstringformatter + + - repo: https://github.com/doublify/pre-commit-rust + rev: v1.0 + hooks: + - id: fmt + - id: clippy + + - repo: https://github.com/nix-community/nixpkgs-fmt + rev: v1.3.0 + hooks: + - id: nixpkgs-fmt diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..2275a1a --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,520 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bindgen" +version = "0.61.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a022e58a142a46fea340d68012b9201c094e93ec3d033a944a24f8fd4a4f09a" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn", + "which", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "cc" +version = "1.0.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a284da2e6fe2092f2353e51713435363112dfd60030e22add80be333fb928f" + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clang-sys" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa2e27ae6ab525c3d369ded447057bca5438d86dc3a68f6faafb8269ba82ebf3" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "cmake" +version = "0.1.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db34956e100b30725f2eb215f90d4871051239535632f84fea3bc92722c66b7c" +dependencies = [ + "cc", +] + +[[package]] +name = "either" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" + +[[package]] +name = "foreign-types" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965" +dependencies = [ + "foreign-types-macros", + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-macros" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8469d0d40519bc608ec6863f1cc88f3f1deee15913f2f3b3e573d81ed38cccc" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "foreign-types-shared" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa9a19cbb55df58761df49b23516a86d432839add4af60fc256da840f66ed35b" + +[[package]] 
+name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + +[[package]] +name = "hyperscan-sys" +version = "0.1.0" +dependencies = [ + "bindgen", + "cmake", +] + +[[package]] +name = "indoc" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adab1eaa3408fb7f0c777a73e7465fd5656136fc93b670eb6df3c88c2c1344e3" + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "libc" +version = "0.2.137" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7fcc620a3bff7cdd7a365be3376c97191aeaccc2a603e600951e452615bf89" + +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + +[[package]] +name = "lock_api" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +dependencies = [ + "autocfg", +] + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "nom" +version = "7.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8903e5a29a317527874d0402f867152a3d21c908bb0b933e416c65e301d4c36" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "once_cell" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860" + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc9e0dc2adc1c69d09143aff38d3d30c5c3f0df0dad82e6d25547af174ebec0" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-sys", +] + +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + +[[package]] +name = "proc-macro2" +version = "1.0.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pyo3" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "268be0c73583c183f2b14052337465768c07726936a260f480f0857cb95ba543" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "parking_lot", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28fcd1e73f06ec85bf3280c48c67e731d8290ad3d730f8be9dc07946923005c8" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f6cb136e222e49115b3c51c32792886defbfb0adead26a688142b346a0b9ffc" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94144a1266e236b1c932682136dc35a9dee8d3589728f68130c7c3861ef96b28" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8df9be978a2d2f0cdebabb03206ed73b11314701a5bfe71b0d753b81997777f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pyperscan" +version = "0.1.0" +dependencies = [ + "bitflags", + "foreign-types", + "hyperscan-sys", + "pyo3", + "thiserror", +] + +[[package]] +name = "quote" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "redox_syscall" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e076559ef8e241f2ae3479e36f97bd5741c0330689e217ad51ce2c76808b868a" +dependencies = [ + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "shlex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" + +[[package]] +name = "smallvec" +version = "1.10.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" + +[[package]] +name = "syn" +version = "1.0.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a864042229133ada95abf3b54fdc62ef5ccabe9515b64717bcb9a1919e59445d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9410d0f6853b1d94f0e519fb95df60f29d2c1eff2d921ffdf01a4c8a3b54f12d" + +[[package]] +name = "thiserror" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10deb33631e3c9018b9baf9dcbbc4f737320d2b576bac10f6aefa048fa407e3e" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "982d17546b47146b28f7c22e3d08465f6b8903d0ea13c1660d9d84a6e7adcdbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicode-ident" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" + +[[package]] +name = "unindent" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58ee9362deb4a96cef4d437d1ad49cffc9b9e92d202b6995674e928ce684f112" + +[[package]] +name = "which" +version = "4.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c831fbbee9e129a8cf93e7747a82da9d95ba8e16621cae60ec2cdc849bacb7b" +dependencies = [ + "either", + "libc", + "once_cell", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" + +[[package]] +name = "windows_i686_gnu" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" + +[[package]] +name = "windows_i686_msvc" 
+version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..4364379 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "pyperscan" +authors.workspace = true +version.workspace = true +edition.workspace = true +license.workspace = true + +[lib] +name = "pyperscan" +crate-type = ["cdylib"] + +[dependencies] +bitflags = "1.3.2" +foreign-types = "0.5.0" +pyo3 = { version = "0.17", features = ["extension-module", "abi3-py38"] } +thiserror = "1.0.37" +hyperscan-sys = { path = "./hyperscan-sys" } + +[features] +hyperscan = ["hyperscan-sys/hyperscan"] +vectorscan = ["hyperscan-sys/vectorscan"] + +[workspace] +resolver = "2" +members = [".", "hyperscan-sys"] + +[workspace.package] +version = "0.1.0" +authors = ["László Vaskó <1771332+vlaci@users.noreply.github.com>"] +license = "MIT OR Apache-2.0" +edition = "2021" diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 0000000..1b5ec8b --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,176 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000..31aa793 --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,23 @@ +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..a6aec32 --- /dev/null +++ b/README.md @@ -0,0 +1,18 @@ +# pyperscan + +## License + +Licensed under either of + +- Apache License, Version 2.0 + ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +- MIT license + ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) + +at your option. + +## Contribution + +Unless you explicitly state otherwise, any contribution intentionally submitted +for inclusion in the work by you, as defined in the Apache-2.0 license, shall be +dual licensed as above, without any additional terms or conditions. 
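The README above covers only licensing; for orientation, the following is a minimal usage sketch of the Python API this change set introduces, based on the `pyperscan.pyi` stub further down in the diff. The pattern, tag, and context values are illustrative, and the assumption that the callback's second argument is the pattern's tag is taken from the stub's `Callable[[C, Any, int, int], Scan]` signature rather than from the README itself.

```python
from pyperscan import BlockDatabase, Flag, Pattern, Scan


def on_match(ctx: list, tag, start: int, end: int) -> Scan:
    # Collect (tag, start, end) triples on the user-supplied context object.
    ctx.append((tag, start, end))
    return Scan.Continue


# Flags are passed as varargs; the tag may be any Python object (assumed here).
db = BlockDatabase(Pattern(b"fo+bar", Flag.SOM_LEFTMOST, tag="demo"))

matches: list = []
scanner = db.build(matches, on_match)  # BlockScanner bound to `matches`
scanner.scan(b"foobar and foooobar")
print(matches)
```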
diff --git a/default.nix b/default.nix new file mode 100644 index 0000000..89308a3 --- /dev/null +++ b/default.nix @@ -0,0 +1,10 @@ +(import + ( + fetchTarball { + url = "https://github.com/edolstra/flake-compat/archive/99f1c2157fba4bfe6211a321fd0ee43199025dbf.tar.gz"; + sha256 = "0x2jn3vrawwv9xp15674wjz9pixwjyj3j771izayl962zziivbx2"; + } + ) + { + src = ./.; + }).defaultNix diff --git a/extension-module/Cargo.toml b/extension-module/Cargo.toml new file mode 100644 index 0000000..6262220 --- /dev/null +++ b/extension-module/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "pyperscan" +version.workspace = true +edition.workspace = true +license.workspace = true + +[lib] +name = "pyperscan" +crate-type = ["cdylib"] + +[dependencies] +bitflags = "1.3.2" +foreign-types = "0.5.0" +pyo3 = { version = "0.17", features = ["extension-module", "abi3-py38"] } +thiserror = "1.0.37" + +hyperscan-sys.workspace = true + +[features] +hyperscan = ["hyperscan-sys/hyperscan"] +vectorscan = ["hyperscan-sys/vectorscan"] diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..c116f71 --- /dev/null +++ b/flake.lock @@ -0,0 +1,111 @@ +{ + "nodes": { + "flake-utils": { + "locked": { + "lastModified": 1659877975, + "narHash": "sha256-zllb8aq3YO3h8B/U0/J1WBgAL8EX5yWf5pMj3G0NAmc=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "c0e246b9b83f637f4681389ecabcb2681b4f3af0", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "maturin": { + "flake": false, + "locked": { + "lastModified": 1669212782, + "narHash": "sha256-k2XP0pzIjHTzp4Zouwj/lqir5uOCMpWu2iWbVjsoaf0=", + "owner": "PyO3", + "repo": "maturin", + "rev": "a5e72368334ed41878b2befbd77fa6ac698312c5", + "type": "github" + }, + "original": { + "owner": "PyO3", + "repo": "maturin", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1669165918, + "narHash": "sha256-hIVruk2+0wmw/Kfzy11rG3q7ev3VTi/IKVODeHcVjFo=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "3b400a525d92e4085e46141ff48cbf89fd89739e", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "nixpkgs_2": { + "locked": { + "lastModified": 1665296151, + "narHash": "sha256-uOB0oxqxN9K7XGF1hcnY+PQnlQJ+3bP2vCn/+Ru/bbc=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "14ccaaedd95a488dd7ae142757884d8e125b3363", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "maturin": "maturin", + "nixpkgs": "nixpkgs", + "rust-overlay": "rust-overlay", + "utils": "utils" + } + }, + "rust-overlay": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs_2" + }, + "locked": { + "lastModified": 1669516540, + "narHash": "sha256-KzAKsPr6s77I2/0wWJ2tSY7Ca5Av/bqfw3aUv8lbuH4=", + "owner": "oxalica", + "repo": "rust-overlay", + "rev": "d99b1e8e21de25b97cbadb413d3510cec0ba5bc5", + "type": "github" + }, + "original": { + "owner": "oxalica", + "repo": "rust-overlay", + "type": "github" + } + }, + "utils": { + "locked": { + "lastModified": 1667395993, + "narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new 
file mode 100644 index 0000000..6f1aad7 --- /dev/null +++ b/flake.nix @@ -0,0 +1,72 @@ +{ + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable"; + utils.url = "github:numtide/flake-utils"; + maturin.url = "github:PyO3/maturin"; + maturin.flake = false; + rust-overlay.url = "github:oxalica/rust-overlay"; + }; + + outputs = { self, nixpkgs, utils, maturin, rust-overlay }: + utils.lib.eachDefaultSystem + (system: + let + overlays = [ + rust-overlay.overlays.default + (final: prev: { + maturin = prev.maturin.overrideAttrs (super: { + version = maturin.shortRev; + src = maturin; + cargoDeps = final.rustPlatform.importCargoLock { + lockFile = "${maturin}/Cargo.lock"; + }; + }); + }) + ]; + pkgs = import nixpkgs { inherit overlays system; }; + inherit (pkgs.lib) optionals; + + rust-toolchain_toml = (builtins.fromTOML (builtins.readFile ./rust-toolchain.toml)).toolchain; + rustToolchain = pkgs.rust-bin.fromRustupToolchain rust-toolchain_toml; + rustToolchainDev = rustToolchain.override { + extensions = (rust-toolchain_toml.components or [ ]) ++ [ "rust-src" ]; + }; + pyperscan = + let + drv = pkgs.callPackage ./nix/pyperscan.nix { + rustPlatform = pkgs.makeRustPlatform { cargo = rustToolchain; rustc = rustToolchain; }; + }; + in + drv.overrideAttrs (_: { + passthru = { + shared = drv; + hyperscan = drv.override { vendorHyperscan = true; }; + vectorscan = drv.override { vendorVectorscan = true; }; + }; + }); + in + { + packages = pkgs; + defaultPackage = pyperscan; + devShell = + let + inherit (pkgs.lib) filter hasSuffix; + noHooks = filter (drv: !(hasSuffix "hook.sh" drv.name)); + pyperscan' = pyperscan.override { python3Packages = pkgs.python38Packages; }; + + in + with pkgs; mkShell { + nativeBuildInputs = noHooks pyperscan'.hyperscan.nativeBuildInputs; + buildInputs = [ + just + pre-commit + rustToolchainDev + pkgs.maturin + pkgs.rust-bin.nightly.latest.rust-analyzer + ] + ++ pyperscan'.hyperscan.buildInputs + ++ (optionals (system == "x86_64-linux") + pyperscan'.buildInputs); + }; + }); +} diff --git a/hyperscan-sys/Cargo.toml b/hyperscan-sys/Cargo.toml new file mode 100644 index 0000000..72068a6 --- /dev/null +++ b/hyperscan-sys/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "hyperscan-sys" +authors.workspace = true +version.workspace = true +edition.workspace = true +license.workspace = true +build = "build.rs" + +[features] +hyperscan = ["cmake"] +vectorscan = ["cmake"] + +[build-dependencies] +bindgen = "0.61.0" +cmake = { version = "0.1", optional = true } diff --git a/hyperscan-sys/build.rs b/hyperscan-sys/build.rs new file mode 100644 index 0000000..e6fbfa1 --- /dev/null +++ b/hyperscan-sys/build.rs @@ -0,0 +1,87 @@ +use std::env; +use std::path; + +#[cfg(feature = "hyperscan")] +const SOURCE: &str = "hyperscan"; + +#[cfg(feature = "vectorscan")] +const SOURCE: &str = "vectorscan"; + +fn main() { + let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap()); + #[allow(unused_mut)] + let mut config = bindgen::Builder::default() + .allowlist_function("hs_.*") + .allowlist_type("hs_.*") + .allowlist_var("HS_.*") + .header("wrapper.h"); + + #[cfg(any(feature = "hyperscan", feature = "vectorscan"))] + { + let src_dir = path::Path::new(env!("CARGO_MANIFEST_DIR")).join(SOURCE); + src_dir + .try_exists() + .expect("Hyperscan source directory doesn't exist"); + let include_dir = out_path + .join("include") + .into_os_string() + .into_string() + .unwrap(); + let out = String::from_utf8( + std::process::Command::new("c++") + .args(["-v"]) + .output() + .expect("Cannot find C++ 
compiler") + .stderr, + ) + .unwrap(); + + if out.contains("gcc") { + println!("cargo:rustc-link-lib=stdc++"); + } else if out.contains("clang") { + println!("cargo:rustc-link-lib=c++"); + } else { + panic!("No compatible compiler found. Either clang or gcc is needed."); + } + + let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); + let vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap(); + // TODO: this could work on intel apple targets if build scripts wouldn't be that fragile + let toggle = if arch == "x86_64" && vendor != "apple" { + "ON" + } else { + "OFF" + }; + + let dst = cmake::Config::new(&src_dir) + .profile("release") + .define("CMAKE_INSTALL_INCLUDEDIR", &include_dir) + .define("FAT_RUNTIME", toggle) + .define("BUILD_AVX512", toggle) + .build(); + + println!("cargo:rerun-if-changed={}", file!()); + println!("cargo:rerun-if-changed={}", src_dir.to_str().unwrap()); + println!("cargo:rustc-link-lib=static=hs"); + println!( + "cargo:rustc-link-search={}", + dst.join("lib").to_str().unwrap() + ); + println!( + "cargo:rustc-link-search={}", + dst.join("lib64").to_str().unwrap() + ); + + config = config.clang_arg(format!("-I{}", &include_dir)); + } + #[cfg(not(any(feature = "hyperscan", feature = "vectorscan")))] + { + println!("cargo:rustc-link-lib=hs"); + } + + config + .generate() + .expect("Unable to generate bindings") + .write_to_file(out_path.join("bindings.rs")) + .expect("Couldn't write bindings!"); +} diff --git a/hyperscan-sys/hyperscan b/hyperscan-sys/hyperscan new file mode 160000 index 0000000..371276f --- /dev/null +++ b/hyperscan-sys/hyperscan @@ -0,0 +1 @@ +Subproject commit 371276f4f516aaaa650a9137744d8aab73f084cd diff --git a/hyperscan-sys/src/lib.rs b/hyperscan-sys/src/lib.rs new file mode 100755 index 0000000..a38a13a --- /dev/null +++ b/hyperscan-sys/src/lib.rs @@ -0,0 +1,5 @@ +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] + +include!(concat!(env!("OUT_DIR"), "/bindings.rs")); diff --git a/hyperscan-sys/vectorscan b/hyperscan-sys/vectorscan new file mode 160000 index 0000000..6d8599e --- /dev/null +++ b/hyperscan-sys/vectorscan @@ -0,0 +1 @@ +Subproject commit 6d8599eece531d843c8f02c69c2eb6ecef75900e diff --git a/hyperscan-sys/wrapper.h b/hyperscan-sys/wrapper.h new file mode 100644 index 0000000..5d9100a --- /dev/null +++ b/hyperscan-sys/wrapper.h @@ -0,0 +1 @@ +#include <hs/hs.h> diff --git a/justfile b/justfile new file mode 100644 index 0000000..19ff1bf --- /dev/null +++ b/justfile @@ -0,0 +1,24 @@ +set positional-arguments + +help: + @just --list --unsorted + +clean: + rm -fr -- .venv target + +dev: + python3 -m venv .venv + . .venv/bin/activate && maturin develop -E test + +check: + cargo clippy + +test *args="--": + .venv/bin/py.test "$@" + +build-shared: _build +build-static-hyperscan: (_build "-F" "hyperscan") +build-static-vectorscan: (_build "-F" "vectorscan") + +_build *args="--": + maturin build "$@" diff --git a/nix/pyperscan.nix b/nix/pyperscan.nix new file mode 100644 index 0000000..ad6ac68 --- /dev/null +++ b/nix/pyperscan.nix @@ -0,0 +1,51 @@ +{ lib +, python3Packages +, rustPlatform +, hyperscan +, boost +, cmake +, pkg-config +, ragel +, util-linux +, vendorHyperscan ? false +, vendorVectorscan ?
false +}: + +assert vendorHyperscan -> !vendorVectorscan; +assert vendorVectorscan -> !vendorHyperscan; + +let + inherit (lib) optionals; + vendor = vendorHyperscan || vendorVectorscan; + cargo_toml = builtins.fromTOML (builtins.readFile ../Cargo.toml); +in +python3Packages.buildPythonPackage { + inherit (cargo_toml.workspace.package) version; + + pname = "pyperscan"; + format = "pyproject"; + + src = builtins.path { name = "pyperscan-source"; path = ../.; filter = p: t: !(t == "directory" && baseNameOf p == "target"); }; + + cargoDeps = rustPlatform.importCargoLock { + lockFile = ../Cargo.lock; + }; + + maturinBuildFlags = (optionals vendorHyperscan [ "-F hyperscan" ]) ++ (optionals vendorVectorscan [ "-F vectorscan" ]); + + buildInputs = if vendor then [ boost util-linux ] else [ hyperscan ]; + + nativeBuildInputs = + (with rustPlatform; [ + bindgenHook + cargoSetupHook + maturinBuildHook + pkg-config + ] ++ (optionals vendor [ cmake ragel util-linux ])); + dontUseCmakeConfigure = true; + + checkInputs = [ python3Packages.pytest ]; + checkPhase = '' + py.test + ''; +} diff --git a/pyperscan.pyi b/pyperscan.pyi new file mode 100644 index 0000000..68f4e10 --- /dev/null +++ b/pyperscan.pyi @@ -0,0 +1,48 @@ +from collections.abc import Callable, Collection +from mmap import mmap +from typing import Any, Generic, TypeVar, Union + +class Flag: + CASELESS = ... + DOTALL = ... + MULTILINE = ... + SINGLEMATCH = ... + ALLOWEMPTY = ... + UTF8 = ... + UCP = ... + PREFILTER = ... + SOM_LEFTMOST = ... + COMBINATION = ... + QUIET = ... + +class Scan: + Continue = ... + Terminate = ... + +class Pattern: + def __new__(cls, expression: bytes, *flags: Flag, tag=None): ... + +Buffer = Union[bytes, mmap] + +class BlockScanner: + def scan(self, data: Buffer) -> Scan: ... + +C = TypeVar("C") + +class Database: + def __new__(cls, *patterns: Pattern): ... + def build( + self, context: C, on_match: Callable[[C, Any, int, int], Scan] + ) -> BlockScanner: ... + +class BlockDatabase(Database): ... + +class VectoredScanner: + def scan(self, data: Collection[Buffer]) -> Scan: ... + +class VectoredDatabase(Database): ... + +class StreamScanner: + def scan(self, data: Buffer) -> Scan: ... + +class StreamDatabase(Database): ... 
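To make the stub concrete, here is a hedged sketch of the streaming variant it declares: a `StreamDatabase` is built into a `StreamScanner` that carries the user context across `scan()` calls, so a match may span chunk boundaries. The pattern, class, and chunking below are illustrative only; the callback signature follows the `Callable[[C, Any, int, int], Scan]` type above.

```python
from pyperscan import Flag, Pattern, Scan, StreamDatabase


class State:
    """User context threaded through every match callback."""

    def __init__(self) -> None:
        self.hits: list = []


def on_match(state: State, tag, start: int, end: int) -> Scan:
    state.hits.append((tag, start, end))
    return Scan.Continue


db = StreamDatabase(Pattern(b"GET /[^ ]+", Flag.SOM_LEFTMOST, tag="http"))
state = State()
stream = db.build(state, on_match)

# Feed the stream in arbitrary chunks; matching state persists between calls.
for chunk in (b"GE", b"T /index.html", b" HTTP/1.1\r\n"):
    stream.scan(chunk)

print(state.hits)
```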
diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f2a59bf --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,21 @@ +[project] +name = "pyperscan" +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] + +[project.optional-dependencies] +test = [ + "pytest ~= 7.0.0", + "mypy ~= 0.991", +] + +[tool.isort] +profile = "black" + +[build-system] +requires = ["maturin>=0.14,<0.15"] +build-backend = "maturin" diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..6824925 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,3 @@ +[toolchain] +channel = "1.65.0" +profile = "default" diff --git a/shell.nix b/shell.nix new file mode 100644 index 0000000..47458ad --- /dev/null +++ b/shell.nix @@ -0,0 +1,10 @@ +(import + ( + fetchTarball { + url = "https://github.com/edolstra/flake-compat/archive/99f1c2157fba4bfe6211a321fd0ee43199025dbf.tar.gz"; + sha256 = "0x2jn3vrawwv9xp15674wjz9pixwjyj3j771izayl962zziivbx2"; + } + ) + { + src = ./.; + }).shellNix diff --git a/src/hyperscan/error.rs b/src/hyperscan/error.rs new file mode 100644 index 0000000..9e56170 --- /dev/null +++ b/src/hyperscan/error.rs @@ -0,0 +1,154 @@ +use hyperscan_sys as ffi; +use pyo3::{pyclass, PyErr}; +use thiserror::Error; + +/// Hyperscan Error Codes +#[derive(Debug, Error)] +pub enum Error { + #[error("Pattern expression contains NULL byte")] + Nul(#[from] std::ffi::NulError), + + #[error("Error originating from Hyperscan API")] + Hyperscan(HyperscanErrorCode, i32), + + #[error("Pattern compilation failed, {0} at {1}")] + HypercanCompile(String, i32), + + #[error("Exception raised from Python callback")] + PythonError(#[from] PyErr), +} + +#[derive(Debug, PartialEq, Eq)] +#[pyclass] +pub enum HyperscanErrorCode { + /// A parameter passed to this function was invalid. + /// + /// This error is only returned in cases where the function can + /// detect an invalid parameter; it cannot be relied upon to detect + /// (for example) pointers to freed memory or other invalid data. + Invalid, + + /// A memory allocation failed. + Nomem, + + /// The engine was terminated by callback. + /// + /// This return value indicates that the target buffer was + /// partially scanned, but that the callback function requested + /// that scanning cease after a match was located. + ScanTerminated, + + /// The pattern compiler failed, and the hs_compile_error_t should + /// be inspected for more detail. + CompilerError, + + /// The given database was built for a different version of Hyperscan. + DbVersionError, + + /// The given database was built for a different platform (i.e., CPU type). + DbPlatformError, + + /// The given database was built for a different mode of + /// operation. This error is returned when streaming calls are + /// used with a block or vectored database and vice versa. + DbModeError, + + /// A parameter passed to this function was not correctly aligned. + BadAlign, + + /// The memory allocator (either malloc() or the allocator set + /// with hs_set_allocator()) did not correctly return memory + /// suitably aligned for the largest representable data type on + /// this platform. + BadAlloc, + + /// The scratch region was already in use. + /// + /// This error is returned when Hyperscan is able to detect that the + /// scratch region given is already in use by another Hyperscan + /// API call.
+ /// + /// A separate scratch region, allocated with hs_alloc_scratch() + /// or hs_clone_scratch(), is required for every concurrent caller + /// of the Hyperscan API. + /// + /// For example, this error might be returned when hs_scan() has + /// been called inside a callback delivered by a + /// currently-executing hs_scan() call using the same scratch + /// region. + /// + /// Note: Not all concurrent uses of scratch regions may be + /// detected. This error is intended as a best-effort debugging + /// tool, not a guarantee. + ScratchInUse, + + /// Unsupported CPU architecture. + /// + /// This error is returned when Hyperscan is able to detect that + /// the current system does not support the required instruction + /// set. + /// + /// At a minimum, Hyperscan requires Supplemental Streaming SIMD + /// Extensions 3 (SSSE3). + ArchError, + + /// Provided buffer was too small. + /// + /// This error indicates that there was insufficient space in the + /// buffer. The call should be repeated with a larger provided + /// buffer. + /// + /// Note: in this situation, it is normal for the amount of space + /// required to be returned in the same manner as the used space + /// would have been returned if the call was successful. + InsufficientSpace, + + /// Unexpected internal error. + /// + /// This error indicates that there was unexpected matching + /// behaviors. This could be related to invalid usage of stream + /// and scratch space or invalid memory operations by users. + UnknownError, + + UnknownErrorCode, +} + +impl From<ffi::hs_error_t> for HyperscanErrorCode { + fn from(err: ffi::hs_error_t) -> Self { + match err { + ffi::HS_INVALID => Self::Invalid, + ffi::HS_NOMEM => Self::Nomem, + ffi::HS_SCAN_TERMINATED => Self::ScanTerminated, + ffi::HS_COMPILER_ERROR => Self::CompilerError, + ffi::HS_DB_VERSION_ERROR => Self::DbVersionError, + ffi::HS_DB_PLATFORM_ERROR => Self::DbPlatformError, + ffi::HS_DB_MODE_ERROR => Self::DbModeError, + ffi::HS_BAD_ALIGN => Self::BadAlign, + ffi::HS_BAD_ALLOC => Self::BadAlloc, + ffi::HS_SCRATCH_IN_USE => Self::ScratchInUse, + ffi::HS_ARCH_ERROR => Self::ArchError, + ffi::HS_INSUFFICIENT_SPACE => Self::InsufficientSpace, + ffi::HS_UNKNOWN_ERROR => Self::UnknownError, + _ => Self::UnknownErrorCode, + } + } +} + +impl From<ffi::hs_error_t> for Error { + fn from(err: ffi::hs_error_t) -> Self { + Error::Hyperscan(err.into(), err) + } +} +pub trait AsResult: Sized { + fn ok(self) -> Result<(), Error>; +} + +impl AsResult for ffi::hs_error_t { + fn ok(self) -> Result<(), Error> { + if self == ffi::HS_SUCCESS as ffi::hs_error_t { + Ok(()) + } else { + Err(self.into()) + } + } +} diff --git a/src/hyperscan/mod.rs b/src/hyperscan/mod.rs new file mode 100644 index 0000000..0e5d325 --- /dev/null +++ b/src/hyperscan/mod.rs @@ -0,0 +1,7 @@ +mod error; +mod native; +mod wrapper; + +pub use error::{AsResult, Error, HyperscanErrorCode}; +pub use native::*; +pub use wrapper::{Flag, Pattern, ScanMode}; diff --git a/src/hyperscan/native.rs b/src/hyperscan/native.rs new file mode 100644 index 0000000..d23a824 --- /dev/null +++ b/src/hyperscan/native.rs @@ -0,0 +1,238 @@ +use foreign_types::ForeignType; +use hyperscan_sys as hs; +use std::ffi::c_void; + +use super::{wrapper, AsResult, Error, HyperscanErrorCode, Pattern, ScanMode}; + +pub enum Scan { + Continue, + Terminate, +} + +pub trait MatchEventHandler<T>: Fn(&mut T, u32, u64, u64) -> Result<Scan, Error> {} + +impl<T, F: Fn(&mut T, u32, u64, u64) -> Result<Scan, Error>> MatchEventHandler<T> for F {} + +pub struct BlockDatabase { + db: wrapper::Database, +} + +pub struct BlockScanner<T> { + scratch: wrapper::Scratch,
database: wrapper::Database, + context: Context<T>, +} + +impl BlockDatabase { + pub fn new(patterns: Vec<Pattern>) -> Result<Self, Error> { + let db = wrapper::Database::new(patterns, ScanMode::BLOCK)?; + Ok(Self { db }) + } + + pub fn create_scanner<T>( + &self, + context: Context<T>, + ) -> Result<BlockScanner<T>, Error> { + BlockScanner::new(self, context) + } +} + +pub struct VectoredDatabase { + db: wrapper::Database, +} + +pub struct VectoredScanner<T> { + scratch: wrapper::Scratch, + database: wrapper::Database, + context: Context<T>, +} + +impl VectoredDatabase { + pub fn new(patterns: Vec<Pattern>) -> Result<Self, Error> { + let db = wrapper::Database::new(patterns, ScanMode::VECTORED)?; + Ok(Self { db }) + } + + pub fn create_scanner<T>( + &self, + context: Context<T>, + ) -> Result<VectoredScanner<T>, Error> { + VectoredScanner::new(self, context) + } +} + +pub struct StreamDatabase { + db: wrapper::Database, +} + +pub struct StreamScanner<T> { + scratch: wrapper::Scratch, + stream: wrapper::Stream, + context: Context<T>, +} + +impl StreamDatabase { + pub fn new(patterns: Vec<Pattern>) -> Result<Self, Error> { + let db = wrapper::Database::new(patterns, ScanMode::STREAM | ScanMode::SOM_LARGE)?; + Ok(Self { db }) + } + + pub fn create_scanner<T>( + &self, + context: Context<T>, + ) -> Result<StreamScanner<T>, Error> { + StreamScanner::new(self, context) + } +} + +pub struct Context<U> { + user_data: U, + match_error: Option<Error>, + match_event_handler: Box<dyn MatchEventHandler<U>>, +} + +impl<U> Context<U> { + pub fn new(user_data: U, match_event_handler: impl MatchEventHandler<U> + 'static) -> Self { + Self { + user_data, + match_error: None, + match_event_handler: Box::new(match_event_handler), + } + } +} + +impl<T> BlockScanner<T> { + pub fn new(db: &BlockDatabase, context: Context<T>) -> Result<Self, Error> { + let scratch = wrapper::Scratch::new(&db.db)?; + + Ok(Self { + database: db.db.try_clone()?, + scratch, + context, + }) + } +} + +impl<T> VectoredScanner<T> { + pub fn new(db: &VectoredDatabase, context: Context<T>) -> Result<Self, Error> { + let scratch = wrapper::Scratch::new(&db.db)?; + + Ok(Self { + database: db.db.try_clone()?, + scratch, + context, + }) + } +} + +impl<T> StreamScanner<T> { + pub fn new(db: &StreamDatabase, context: Context<T>) -> Result<Self, Error> { + let scratch = wrapper::Scratch::new(&db.db)?; + let stream = wrapper::Stream::new(&db.db)?; + + Ok(Self { + scratch, + stream, + context, + }) + } +} + +impl<T> StreamScanner<T> { + pub fn scan(&mut self, data: &[u8]) -> Result<Scan, Error> { + unsafe { + hs::hs_scan_stream( + self.stream.as_ptr(), + data.as_ptr() as *const _, + data.len() as u32, + 0, + self.scratch.as_ptr(), + Some(on_match::<T>), + &mut self.context as *mut _ as *mut c_void, + ) + .ok() + .to_scan_result(self.context.match_error.take()) + } + } +} + +impl<T> BlockScanner<T> { + pub fn scan(&mut self, data: &[u8]) -> Result<Scan, Error> { + unsafe { + hs::hs_scan( + self.database.as_ptr(), + data.as_ptr() as *const _, + data.len() as u32, + 0, + self.scratch.as_ptr(), + Some(on_match::<T>), + &mut self.context as *mut _ as *mut c_void, + ) + .ok() + .to_scan_result(self.context.match_error.take()) + } + } +} + +impl<T> VectoredScanner<T> { + pub fn scan(&mut self, data: Vec<&[u8]>) -> Result<Scan, Error> { + let (len, data): (Vec<_>, Vec<_>) = + data.iter().map(|d| (d.len() as u32, d.as_ptr())).unzip(); + unsafe { + hs::hs_scan_vector( + self.database.as_ptr(), + data.as_ptr() as *const *const _, + len.as_ptr(), + len.len() as u32, + 0, + self.scratch.as_ptr(), + Some(on_match::<T>), + &mut self.context as *mut _ as *mut c_void, + ) + .ok() + .to_scan_result(self.context.match_error.take()) + } + } +} + +trait ScanResult: Sized { + fn to_scan_result(self, inner_err: Option<Error>) -> Result<Scan, Error>; +} + +impl ScanResult for Result<(), Error> { + fn
+
+trait ScanResult: Sized {
+    fn to_scan_result(self, inner_err: Option<Error>) -> Result<Scan, Error>;
+}
+
+impl ScanResult for Result<(), Error> {
+    fn to_scan_result(self, inner_err: Option<Error>) -> Result<Scan, Error> {
+        if let Some(inner) = inner_err {
+            Err(inner)
+        } else {
+            match self {
+                Ok(_) => Ok(Scan::Continue),
+                Err(err) => match err {
+                    Error::Hyperscan(HyperscanErrorCode::ScanTerminated, _) => Ok(Scan::Terminate),
+                    err => Err(err),
+                },
+            }
+        }
+    }
+}
+
+unsafe extern "C" fn on_match<U>(
+    id: u32,
+    from: u64,
+    to: u64,
+    _flags: u32,
+    ctx: *mut c_void,
+) -> i32 {
+    let context = (ctx as *mut Context<U>)
+        .as_mut()
+        .expect("Context object unset");
+    (context.match_event_handler)(&mut context.user_data, id, from, to).map_or_else(
+        |err| {
+            context.match_error = Some(err);
+            -1
+        },
+        |rv| match rv {
+            Scan::Continue => 0,
+            Scan::Terminate => 1,
+        },
+    )
+}
diff --git a/src/hyperscan/wrapper.rs b/src/hyperscan/wrapper.rs
new file mode 100755
index 0000000..337f549
--- /dev/null
+++ b/src/hyperscan/wrapper.rs
@@ -0,0 +1,177 @@
+use crate::hyperscan::{AsResult, Error};
+use bitflags::bitflags;
+use foreign_types::{foreign_type, ForeignType};
+use hyperscan_sys as hs;
+use std::{ffi::CString, mem::MaybeUninit, ptr};
+
+foreign_type! {
+    unsafe type CompileError {
+        type CType = hs::hs_compile_error_t;
+        fn drop = hs::hs_free_compile_error;
+    }
+
+    pub unsafe type Database: Send + Sync {
+        type CType = hs::hs_database_t;
+        fn drop = hs::hs_free_database;
+    }
+
+    pub unsafe type Scratch {
+        type CType = hs::hs_scratch_t;
+        fn drop = hs::hs_free_scratch;
+    }
+
+    pub unsafe type Stream {
+        type CType = hs::hs_stream_t;
+        fn drop = stream_drop;
+    }
+}
+
+unsafe fn stream_drop(stream: *mut hs::hs_stream_t) {
+    let _ = hs::hs_close_stream(stream, ptr::null_mut(), None, ptr::null_mut());
+}
+
+bitflags! {
+    #[derive(Default)]
+    pub struct Flag: u32 {
+        const CASELESS = hs::HS_FLAG_CASELESS;
+        const DOTALL = hs::HS_FLAG_DOTALL;
+        const MULTILINE = hs::HS_FLAG_MULTILINE;
+        const SINGLEMATCH = hs::HS_FLAG_SINGLEMATCH;
+        const ALLOWEMPTY = hs::HS_FLAG_ALLOWEMPTY;
+        const UTF8 = hs::HS_FLAG_UTF8;
+        const UCP = hs::HS_FLAG_UCP;
+        const PREFILTER = hs::HS_FLAG_PREFILTER;
+        const SOM_LEFTMOST = hs::HS_FLAG_SOM_LEFTMOST;
+        const COMBINATION = hs::HS_FLAG_COMBINATION;
+        const QUIET = hs::HS_FLAG_QUIET;
+    }
+}
+
+pub struct Pattern {
+    expression: Vec<u8>,
+    flags: Flag,
+    id: Option<u32>,
+}
+
+impl Pattern {
+    pub fn new(expression: Vec<u8>, flags: Flag, id: Option<u32>) -> Self {
+        Self {
+            expression,
+            flags,
+            id,
+        }
+    }
+}
+
+impl Database {
+    pub fn new(patterns: Vec<Pattern>, mode: ScanMode) -> Result<Self, Error> {
+        let mut c_exprs = Vec::with_capacity(patterns.len());
+        let mut c_flags = Vec::with_capacity(patterns.len());
+        let mut c_ids = Vec::with_capacity(patterns.len());
+        for Pattern {
+            expression,
+            flags,
+            id,
+        } in patterns
+        {
+            // have to keep the original strings alive until the db is created
+            let c_expr = CString::new(expression)?;
+            c_exprs.push(c_expr);
+            c_flags.push(flags.bits());
+            c_ids.push(id.unwrap_or(0));
+        }
+
+        let mut db = MaybeUninit::uninit();
+        let mut err = MaybeUninit::uninit();
+        unsafe {
+            hs::hs_compile_ext_multi(
+                c_exprs
+                    .iter()
+                    .map(|expr| expr.as_ptr())
+                    .collect::<Vec<_>>()
+                    .as_ptr(),
+                c_flags.as_ptr(),
+                c_ids.as_ptr(),
+                ptr::null(),
+                c_exprs.len() as u32,
+                mode.bits(),
+                ptr::null(),
+                db.as_mut_ptr(),
+                err.as_mut_ptr(),
+            )
+            .ok()
+            .map_err(|_| err.assume_init())?;
+            Ok(Database::from_ptr(db.assume_init()))
+        }
+    }
+
+    pub fn try_clone(&self) -> Result<Self, Error> {
+        let mut buf = MaybeUninit::uninit();
+        let mut len = 0usize;
+        unsafe {
+            hs::hs_serialize_database(self.as_ptr(), buf.as_mut_ptr(), &mut len).ok()?;
+            let buf = buf.assume_init();
+            let mut copy = MaybeUninit::uninit();
+            hs::hs_deserialize_database(buf, len, copy.as_mut_ptr()).ok()?;
+            let copy = copy.assume_init();
+            Ok(Self::from_ptr(copy))
+        }
+    }
+}
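+
+// `try_clone` round-trips through hs_serialize_database()/hs_deserialize_database()
+// because compiled Hyperscan databases expose no direct copy API; the block and
+// vectored scanners each keep their own deserialized copy.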
+
+impl Scratch {
+    pub fn new(database: &Database) -> Result<Self, Error> {
+        // hs_alloc_scratch() requires the target pointer to be NULL or an existing
+        // scratch, so start from a zeroed (NULL) pointer.
+        let mut scratch = MaybeUninit::zeroed();
+        unsafe {
+            hs::hs_alloc_scratch(database.as_ptr(), scratch.as_mut_ptr())
+                .ok()
+                .map(|_| Scratch::from_ptr(scratch.assume_init()))
+        }
+    }
+}
+
+impl Stream {
+    pub fn new(database: &Database) -> Result<Self, Error> {
+        let mut stream = MaybeUninit::uninit();
+        unsafe {
+            hs::hs_open_stream(database.as_ptr(), 0, stream.as_mut_ptr())
+                .ok()
+                .map(|_| Stream::from_ptr(stream.assume_init()))
+        }
+    }
+}
+
+impl CompileError {
+    fn message(&self) -> String {
+        unsafe {
+            let err = self.0.as_ptr();
+
+            std::ffi::CStr::from_ptr((*err).message)
+                .to_str()
+                .unwrap()
+                .into()
+        }
+    }
+
+    fn expression(&self) -> i32 {
+        unsafe { (*self.0.as_ptr()).expression }
+    }
+}
+
+impl From<*mut hs::hs_compile_error> for Error {
+    fn from(err: *mut hs::hs_compile_error) -> Self {
+        unsafe {
+            let err = CompileError::from_ptr(err);
+            Self::HypercanCompile(err.message(), err.expression())
+        }
+    }
+}
+
+bitflags! {
+    pub struct ScanMode: u32 {
+        const BLOCK = hs::HS_MODE_BLOCK;
+        const VECTORED = hs::HS_MODE_VECTORED;
+        const STREAM = hs::HS_MODE_STREAM;
+        const SOM_SMALL = hs::HS_MODE_SOM_HORIZON_SMALL;
+        const SOM_MEDIUM = hs::HS_MODE_SOM_HORIZON_MEDIUM;
+        const SOM_LARGE = hs::HS_MODE_SOM_HORIZON_LARGE;
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..8a777c2
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,2 @@
+mod hyperscan;
+mod python;
diff --git a/src/python/extension.rs b/src/python/extension.rs
new file mode 100644
index 0000000..2ae02c7
--- /dev/null
+++ b/src/python/extension.rs
@@ -0,0 +1,299 @@
+use std::ops::Deref;
+
+use super::Buffer;
+use crate::hyperscan::{
+    BlockDatabase, BlockScanner, Context, Error, Flag, HyperscanErrorCode, Pattern, Scan,
+    StreamDatabase, StreamScanner, VectoredDatabase, VectoredScanner,
+};
+use pyo3::{create_exception, exceptions::PyValueError, prelude::*, types::PyTuple};
+
+#[pyclass(name = "Pattern", unsendable)]
+struct PyPattern {
+    expression: Vec<u8>,
+    tag: Option<PyObject>,
+    flags: Flag,
+}
+
+#[allow(non_camel_case_types)]
+#[allow(clippy::upper_case_acronyms)]
+#[pyclass(name = "Flag")]
+#[derive(Clone)]
+enum PyFlag {
+    CASELESS,
+    DOTALL,
+    MULTILINE,
+    SINGLEMATCH,
+    ALLOWEMPTY,
+    UTF8,
+    UCP,
+    PREFILTER,
+    SOM_LEFTMOST,
+    COMBINATION,
+    QUIET,
+}
+
+#[pyclass(name = "Scan")]
+#[derive(Clone)]
+enum PyScan {
+    Continue,
+    Terminate,
+}
+
+impl From<Scan> for PyScan {
+    fn from(s: Scan) -> Self {
+        match s {
+            Scan::Continue => Self::Continue,
+            Scan::Terminate => Self::Terminate,
+        }
+    }
+}
+
+impl From<PyScan> for Scan {
+    fn from(s: PyScan) -> Self {
+        match s {
+            PyScan::Continue => Self::Continue,
+            PyScan::Terminate => Self::Terminate,
+        }
+    }
+}
+
+impl From<&PyFlag> for Flag {
+    fn from(flags: &PyFlag) -> Self {
+        match flags {
+            PyFlag::CASELESS => Flag::CASELESS,
+            PyFlag::DOTALL => Flag::DOTALL,
+            PyFlag::MULTILINE => Flag::MULTILINE,
+            PyFlag::SINGLEMATCH => Flag::SINGLEMATCH,
+            PyFlag::ALLOWEMPTY => Flag::ALLOWEMPTY,
+            PyFlag::UTF8 => Flag::UTF8,
+            PyFlag::UCP => Flag::UCP,
+            PyFlag::PREFILTER => Flag::PREFILTER,
+            PyFlag::SOM_LEFTMOST => Flag::SOM_LEFTMOST,
+            PyFlag::COMBINATION => Flag::COMBINATION,
+            PyFlag::QUIET => Flag::QUIET,
+        }
+    }
+}
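+
+// On the Python side these enums surface as `pyperscan.Flag` and `pyperscan.Scan`, and
+// several flags can be combined per pattern (illustrative call only):
+//
+//     Pattern(b"foo", Flag.CASELESS, Flag.UTF8, tag="word")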
+
+#[pymethods]
+impl PyPattern {
+    #[new]
+    #[args(expression, flags = "*", tag = "None")]
+    fn py_new(expression: &[u8], flags: &PyTuple, tag: Option<PyObject>) -> PyResult<Self> {
+        let flags = flags
+            .iter()
+            .map(|f| f.extract::<PyFlag>())
+            .collect::<PyResult<Vec<_>>>()?
+            .iter()
+            .fold(Flag::empty(), |a, f| a.union(f.into()));
+        Ok(PyPattern {
+            expression: expression.into(),
+            tag,
+            flags,
+        })
+    }
+}
+
+struct PyContext {
+    user_data: PyObject,
+    tag_mapping: Vec<Option<PyObject>>,
+}
+
+#[pyclass(name = "BlockDatabase")]
+struct PyBlockDatabase {
+    db: BlockDatabase,
+    tag_mapping: Vec<Option<PyObject>>,
+}
+
+#[pymethods]
+impl PyBlockDatabase {
+    #[new]
+    #[args(patterns = "*")]
+    fn py_new(py: Python<'_>, patterns: &PyTuple) -> PyResult<Self> {
+        let (patterns, tag_mapping) = to_tag_mapping(py, patterns)?;
+        Ok(Self {
+            db: BlockDatabase::new(patterns)?,
+            tag_mapping,
+        })
+    }
+
+    fn build(
+        &self,
+        user_data: PyObject,
+        match_event_handler: PyObject,
+    ) -> PyResult<PyBlockScanner> {
+        let context = create_context(self.tag_mapping.clone(), user_data, match_event_handler)?;
+        let scanner = self.db.create_scanner(context)?;
+        Ok(PyBlockScanner(scanner))
+    }
+}
+
+#[pyclass(unsendable, name = "BlockScanner")]
+struct PyBlockScanner(BlockScanner<PyContext>);
+
+#[pymethods]
+impl PyBlockScanner {
+    fn scan(&mut self, data: Buffer) -> PyResult<PyScan> {
+        Ok(self.0.scan(&data)?.into())
+    }
+}
+
+#[pyclass(name = "VectoredDatabase")]
+struct PyVectoredDatabase {
+    db: VectoredDatabase,
+    tag_mapping: Vec<Option<PyObject>>,
+}
+
+#[pymethods]
+impl PyVectoredDatabase {
+    #[new]
+    #[args(patterns = "*")]
+    fn py_new(py: Python<'_>, patterns: &PyTuple) -> PyResult<Self> {
+        let (patterns, tag_mapping) = to_tag_mapping(py, patterns)?;
+        Ok(Self {
+            db: VectoredDatabase::new(patterns)?,
+            tag_mapping,
+        })
+    }
+
+    fn build(
+        &self,
+        user_data: PyObject,
+        match_event_handler: PyObject,
+    ) -> PyResult<PyVectoredScanner> {
+        let context = create_context(self.tag_mapping.clone(), user_data, match_event_handler)?;
+        let scanner = self.db.create_scanner(context)?;
+        Ok(PyVectoredScanner(scanner))
+    }
+}
+
+#[pyclass(unsendable, name = "VectoredScanner")]
+struct PyVectoredScanner(VectoredScanner<PyContext>);
+
+#[pymethods]
+impl PyVectoredScanner {
+    fn scan(&mut self, data: Vec<Buffer>) -> PyResult<PyScan> {
+        let data = data.iter().map(|d| d.deref()).collect();
+        Ok(self.0.scan(data)?.into())
+    }
+}
+
+#[pyclass(name = "StreamDatabase")]
+struct PyStreamDatabase {
+    db: StreamDatabase,
+    tag_mapping: Vec<Option<PyObject>>,
+}
+
+#[pymethods]
+impl PyStreamDatabase {
+    #[new]
+    #[args(patterns = "*")]
+    fn py_new(py: Python<'_>, patterns: &PyTuple) -> PyResult<Self> {
+        let (patterns, tag_mapping) = to_tag_mapping(py, patterns)?;
+        Ok(Self {
+            db: StreamDatabase::new(patterns)?,
+            tag_mapping,
+        })
+    }
+
+    fn build(
+        &self,
+        user_data: PyObject,
+        match_event_handler: PyObject,
+    ) -> PyResult<PyStreamScanner> {
+        let context = create_context(self.tag_mapping.clone(), user_data, match_event_handler)?;
+        let scanner = self.db.create_scanner(context)?;
+        Ok(PyStreamScanner(scanner))
+    }
+}
+
+#[pyclass(unsendable, name = "StreamScanner")]
+struct PyStreamScanner(StreamScanner<PyContext>);
+
+#[pymethods]
+impl PyStreamScanner {
+    fn scan(&mut self, data: Buffer) -> PyResult<PyScan> {
+        Ok(self.0.scan(&data)?.into())
+    }
+}
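+
+// From Python, every database follows the same two-step flow (illustrative only):
+//
+//     db = StreamDatabase(Pattern(b"foo", Flag.SOM_LEFTMOST))
+//     scanner = db.build(user_data, on_match)  # on_match(user_data, id_or_tag, start, end)
+//     scanner.scan(b"some chunk of data")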
+
+fn to_tag_mapping(
+    py: Python<'_>,
+    patterns: &PyTuple,
+) -> PyResult<(Vec<Pattern>, Vec<Option<PyObject>>)> {
+    Ok(patterns
+        .iter()
+        .map(|p| p.extract::<Py<PyPattern>>())
+        .collect::<PyResult<Vec<_>>>()?
+        .iter()
+        .enumerate()
+        .map(|(id, p)| {
+            let pat = p.borrow(py);
+            (
+                Pattern::new(
+                    pat.expression.clone(),
+                    pat.flags,
+                    Some(id.try_into().unwrap()),
+                ),
+                pat.tag.as_ref().map(|t| t.to_object(py)),
+            )
+        })
+        .unzip())
+}
+
+fn create_context(
+    tag_mapping: Vec<Option<PyObject>>,
+    user_data: PyObject,
+    match_event_handler: PyObject,
+) -> PyResult<Context<PyContext>> {
+    let match_handler = move |ctx: &mut PyContext, id, from, to| -> Result<Scan, Error> {
+        Python::with_gil(|py| {
+            let result;
+            // `tag_mapping[id]` is `Some(tag)` when the pattern was created with a tag;
+            // the inner `id` binding shadows the numeric id with that tag object.
+            if let Some(id) = ctx.tag_mapping.get(id as usize).unwrap() {
+                let args = (&ctx.user_data, id, from, to);
+                result = match_event_handler.call1(py, args)?;
+            } else {
+                let args = (&ctx.user_data, id, from, to);
+                result = match_event_handler.call1(py, args)?;
+            }
+            result.extract::<PyScan>(py).map(|s| s.into())
+        })
+        .map_err(|exc| exc.into())
+    };
+    let py_user_data = PyContext {
+        user_data,
+        tag_mapping,
+    };
+    Ok(Context::new(py_user_data, match_handler))
+}
+
+impl From<Error> for PyErr {
+    fn from(err: Error) -> PyErr {
+        match err {
+            Error::Nul(_) => PyValueError::new_err(format!("{err}")),
+            Error::Hyperscan(e, c) => HyperscanError::new_err((e, c)),
+            Error::HypercanCompile(msg, expr) => HyperscanCompileError::new_err((msg, expr)),
+            Error::PythonError(exc) => exc,
+        }
+    }
+}
+
+create_exception!(module, HyperscanError, pyo3::exceptions::PyException);
+create_exception!(module, HyperscanCompileError, pyo3::exceptions::PyException);
+
+#[pymodule]
+pub fn pyperscan(py: Python, m: &PyModule) -> PyResult<()> {
+    m.add_class::<PyPattern>()?;
+    m.add_class::<PyFlag>()?;
+    m.add_class::<PyScan>()?;
+    m.add_class::<PyBlockDatabase>()?;
+    m.add_class::<PyVectoredDatabase>()?;
+    m.add_class::<PyStreamDatabase>()?;
+    m.add_class::<PyBlockScanner>()?;
+
+    m.add("HyperscanError", py.get_type::<HyperscanError>())?;
+    m.add(
+        "HyperscanCompileError",
+        py.get_type::<HyperscanCompileError>(),
+    )?;
+    Ok(())
+}
diff --git a/src/python/mod.rs b/src/python/mod.rs
new file mode 100644
index 0000000..03ab14c
--- /dev/null
+++ b/src/python/mod.rs
@@ -0,0 +1,5 @@
+mod extension;
+mod wrapper;
+
+pub use extension::*;
+use wrapper::Buffer;
diff --git a/src/python/wrapper.rs b/src/python/wrapper.rs
new file mode 100644
index 0000000..f5a5d88
--- /dev/null
+++ b/src/python/wrapper.rs
@@ -0,0 +1,52 @@
+use std::ffi::{c_int, c_void};
+use std::ops::Deref;
+use std::ptr;
+use std::slice;
+
+use pyo3::{ffi, prelude::*, AsPyPointer};
+
+pub struct Buffer<'a>(&'a [u8]);
+
+impl<'a> Deref for Buffer<'a> {
+    type Target = [u8];
+
+    fn deref(&self) -> &Self::Target {
+        self.0
+    }
+}
+
+impl<'a> FromPyObject<'a> for Buffer<'a> {
+    fn extract(ob: &'a pyo3::PyAny) -> pyo3::PyResult<Self> {
+        let mut buf = ptr::null::<u8>();
+        let mut len = 0usize;
+        let buf = unsafe {
+            error_on_minus_one(
+                ob.py(),
+                PyObject_AsReadBuffer(
+                    ob.as_ptr(),
+                    &mut buf as *mut *const _ as *mut *const c_void,
+                    &mut len as *mut _ as *mut isize,
+                ),
+            )?;
+            slice::from_raw_parts(buf, len)
+        };
+        Ok(Buffer(buf))
+    }
+}
+
+#[inline]
+fn error_on_minus_one(py: Python, result: i32) -> PyResult<()> {
+    if result == -1 {
+        Err(PyErr::fetch(py))
+    } else {
+        Ok(())
+    }
+}
+
+extern "C" {
+    fn PyObject_AsReadBuffer(
+        obj: *mut ffi::PyObject,
+        buffer: *mut *const c_void,
+        buffer_len: *mut ffi::Py_ssize_t,
+    ) -> c_int;
+}
diff --git a/tests/test_python.py b/tests/test_python.py
new file mode 100644
index 0000000..0843a16
--- /dev/null
+++ b/tests/test_python.py
@@ -0,0 +1,161 @@
+import mmap
+from unittest import mock
+
+import pytest
+
+import pyperscan as ps
+
+
+def args(*args, **kwargs):
+    return (args, kwargs)
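+
+
+# For example, args(b"foo", tag="bar") evaluates to ((b"foo",), {"tag": "bar"}), which
+# test_patterns below splats back into ps.Pattern(*args[0], **args[1]).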
+
+
+@pytest.mark.parametrize(
+    "args",
+    (
+        pytest.param(
+            args(
+                b"foo",
+            ),
+            id="default-args",
+        ),
+        pytest.param(args(b"foo", ps.Flag.SOM_LEFTMOST), id="single-flag"),
+        pytest.param(
+            args(b"foo", ps.Flag.SOM_LEFTMOST, ps.Flag.DOTALL), id="multiple-flags"
+        ),
+        pytest.param(
+            args(b"foo", ps.Flag.SOM_LEFTMOST, ps.Flag.DOTALL, tag="bar"),
+            id="flag-tag",
+        ),
+    ),
+)
+def test_patterns(args):
+    ps.Pattern(*args[0], **args[1])
+
+
+def test_pattern_expression_argument_is_required():
+    with pytest.raises(
+        TypeError, match="missing 1 required positional argument: 'expression'"
+    ):
+        ps.Pattern()  # type: ignore
+
+
+def test_pattern_expression_argument_must_be_bytes():
+    with pytest.raises(
+        TypeError,
+        match="argument 'expression': 'str' object cannot be converted to 'PyBytes'",
+    ):
+        ps.Pattern("foo")  # type: ignore
+
+
+def test_pattern_flags_argument_must_be_flags():
+    with pytest.raises(
+        TypeError,
+        match="'int' object cannot be converted to 'Flag'",
+    ):
+        ps.Pattern(b"foo", 123)  # type: ignore
+
+
+@pytest.fixture
+def tag():
+    return "tag"
+
+
+@pytest.fixture
+def database(request, tag):
+    pat = ps.Pattern(b"foo", ps.Flag.SOM_LEFTMOST, tag=tag)
+    return request.param(pat)
+
+
+@pytest.fixture
+def ctx():
+    return object()
+
+
+@pytest.fixture
+def on_match():
+    on_match = mock.Mock()
+    on_match.return_value = ps.Scan.Continue
+    return on_match
+
+
+@pytest.mark.parametrize("database", (ps.BlockDatabase,), indirect=True)
+def test_block_database(database, ctx, tag, on_match):
+    scan = database.build(ctx, on_match)
+
+    scan.scan(b"foo")
+    on_match.assert_called_with(ctx, tag, 0, 3)
+    scan.scan(b"barfoo")
+    on_match.assert_called_with(ctx, tag, 3, 6)
+
+
+@pytest.mark.parametrize(
+    "database",
+    (ps.VectoredDatabase,),
+    indirect=True,
+)
+def test_vectored_database(database, ctx, tag, on_match):
+    scan = database.build(ctx, on_match)
+
+    scan.scan([b"foo", b"barfoo"])
+    on_match.assert_has_calls((mock.call(ctx, tag, 0, 3), mock.call(ctx, tag, 6, 9)))
+
+
+@pytest.mark.parametrize(
+    "database",
+    (ps.StreamDatabase,),
+    indirect=True,
+)
+def test_stream_database(database, ctx, tag, on_match):
+    scan = database.build(ctx, on_match)
+
+    scan.scan(b"foo")
+    on_match.assert_called_with(ctx, tag, 0, 3)
+    scan.scan(b"barf")
+    scan.scan(b"oo")
+    on_match.assert_called_with(ctx, tag, 6, 9)
+
+
+def test_pattern_tags(ctx, on_match):
+    scan = ps.BlockDatabase(ps.Pattern(b"foo", tag="tag"), ps.Pattern(b"bar")).build(
+        ctx, on_match
+    )
+    scan.scan(b"foo")
+    on_match.assert_called_with(ctx, "tag", 0, 3)
+    scan.scan(b"bar")
+    on_match.assert_called_with(ctx, 1, 0, 3)
+
+
+def test_data_can_be_mmap_object(ctx, on_match):
+    scan = ps.BlockDatabase(ps.Pattern(b"foo")).build(ctx, on_match)
+
+    with mmap.mmap(-1, len("foo")) as mm:
+        mm.write(b"foo")
+
+        scan.scan(mm)
+        on_match.assert_called_with(ctx, 0, 0, 3)
+
+
+@pytest.mark.parametrize(
+    "database,data",
+    (
+        (ps.BlockDatabase, b"foofoo"),
+        (
+            ps.VectoredDatabase,
+            (b"foofoo",),
+        ),
+        (ps.StreamDatabase, b"foofoo"),
+    ),
+    indirect=("database",),
+)
+def test_scanning_can_be_aborted(database, data, ctx, tag, on_match):
+    scan = database.build(ctx, on_match)
+
+    on_match.return_value = ps.Scan.Continue
+    assert scan.scan(data) == ps.Scan.Continue
+    assert on_match.call_count == 2
+
+    on_match.reset_mock()
+    on_match.return_value = ps.Scan.Terminate
+    assert scan.scan(data) == ps.Scan.Terminate
+    assert on_match.call_count == 1
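+
+
+# A typical way to collect matches with this API looks roughly like the sketch below
+# (illustrative only; the tests above exercise the same flow through mocks):
+#
+#     matches = []
+#
+#     def on_match(ctx, tag, start, end):
+#         ctx.append((tag, start, end))
+#         return ps.Scan.Continue
+#
+#     scanner = ps.BlockDatabase(ps.Pattern(b"foo", ps.Flag.SOM_LEFTMOST)).build(matches, on_match)
+#     scanner.scan(b"foo bar foo")
+#     # matches == [(0, 0, 3), (0, 8, 11)]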