Skip to content

Commit

Permalink
Merge pull request #1360 from gchq/1359-accelerated-compaction-support
Browse files Browse the repository at this point in the history
Rust compaction
  • Loading branch information
patchwork01 authored Jul 12, 2024
2 parents 9d041b9 + 217b214 commit 2427ac8
Show file tree
Hide file tree
Showing 67 changed files with 10,551 additions and 91 deletions.
1 change: 1 addition & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"vscjava.vscode-java-pack",
"shengchen.vscode-checkstyle",
"eamodio.gitlens",
"1yib.rust-bundle",
"mhutchie.git-graph"
]
}
Expand Down
4 changes: 3 additions & 1 deletion .github/config/chunks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ chunks:
compaction:
name: Compaction
workflow: chunk-compaction.yaml
modules: [ compaction/compaction-job-execution, compaction/compaction-task-creation, compaction/compaction-job-creation, compaction/compaction-job-creation-lambda, compaction/compaction-status-store, compaction/compaction-core, splitter/splitter-core, splitter/splitter-lambda ]
modules: [ compaction/compaction-job-execution, compaction/compaction-task-creation, compaction/compaction-job-creation, compaction/compaction-job-creation-lambda, compaction/compaction-status-store, compaction/compaction-core, splitter/splitter-core, splitter/splitter-lambda, compaction/compaction-rust ]
data:
name: Data
workflow: chunk-data.yaml
Expand All @@ -47,3 +47,5 @@ chunks:
name: Trino
workflow: chunk-trino.yaml
modules: [ trino ]


1 change: 1 addition & 0 deletions .github/workflows/chunk-cdk.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ on:
- 'java/bulk-import/bulk-import-common/**'
- 'java/splitter/splitter-core/**'
- 'java/compaction/compaction-job-execution/**'
- 'java/compaction/compaction-rust/**'
- 'java/ingest/ingest-batcher-core/**'
- 'java/query/query-runner/**'
- 'java/garbage-collector/**'
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/chunk-compaction.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ on:
- 'java/splitter/pom.xml'
- 'java/compaction/compaction-job-execution/**'
- 'java/compaction/compaction-task-creation/**'
- 'java/compaction/compaction-rust/**'
- 'java/compaction/compaction-job-creation/**'
- 'java/compaction/compaction-job-creation-lambda/**'
- 'java/compaction/compaction-status-store/**'
Expand Down
13 changes: 11 additions & 2 deletions .github/workflows/chunk.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ on:
chunkId:
required: true
type: string
skipRust:
default: true
required: false
type: boolean

jobs:
build:
Expand All @@ -15,6 +19,11 @@ jobs:
with:
java-version: '17'
distribution: 'corretto'
- uses: dtolnay/rust-toolchain@stable
if: ${{ ! inputs.skipRust }}
- name: Install cargo cross
run: cargo install cross
if: ${{ ! inputs.skipRust }}
- name: Cache dependencies
uses: actions/cache@v4
with:
Expand All @@ -34,11 +43,11 @@ jobs:
-Dexec.args="${{ inputs.chunkId }} github_actions_outputs ${{ github.workspace }}/.github/config/chunks.yaml" \
>> $GITHUB_OUTPUT
- name: Compile
run: mvn --batch-mode clean install -am -pl ${{ steps.config.outputs.moduleList }} -Pquick,skipShade -Dmaven.repo.local=${{ runner.temp }}/.m2/repository
run: mvn --batch-mode clean install -am -pl ${{ steps.config.outputs.moduleList }} -Pquick,skipShade -DskipRust=${{ inputs.skipRust }} -Dmaven.repo.local=${{ runner.temp }}/.m2/repository
working-directory: ./java
- name: Test
id: test
run: mvn --batch-mode --fail-at-end verify -pl ${{ steps.config.outputs.moduleList }} -Dmaven.repo.local=${{ runner.temp }}/.m2/repository -e
run: mvn --batch-mode --fail-at-end verify -pl ${{ steps.config.outputs.moduleList }} -DskipRust=${{ inputs.skipRust }} -Dmaven.repo.local=${{ runner.temp }}/.m2/repository -e
working-directory: ./java
- name: Generate site
id: site
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/dependency-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
run: mvn --batch-mode dependency-check:update-only -Dmaven.repo.local=${{ runner.temp }}/.m2/repository
working-directory: ./java
- name: Build with Maven
run: mvn --batch-mode verify dependency-check:aggregate -Pquick -Dmaven.repo.local=${{ runner.temp }}/.m2/repository
run: mvn --batch-mode verify dependency-check:aggregate -Pquick -DskipRust -Dmaven.repo.local=${{ runner.temp }}/.m2/repository
working-directory: ./java
- name: Cache Maven dependencies & CVEs database
uses: actions/cache/save@v3
Expand Down
7 changes: 6 additions & 1 deletion .github/workflows/docker-cli-image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,12 @@ jobs:

steps:
- name: Delete huge unnecessary tools folder
run: rm -rf /opt/hostedtoolcache
run: |
rm -rf /opt/hostedtoolcache
rm -rf /usr/share/dotnet
rm -rf /opt/ghc
rm -rf "/usr/local/share/boost"
rm -rf "$AGENT_TOOLSDIRECTORY"
- uses: actions/checkout@v3
- uses: actions/setup-java@v3
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/docker-cli.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ jobs:
needs: setup
uses: ./.github/workflows/docker-cli-image.yaml
with:
mavenCmd: ./scripts/cli/environment/buildMaven.sh package -Pquick --batch-mode -Dmaven.repo.local=../.m2/repository
mavenCmd: ./scripts/cli/environment/buildMaven.sh package -Pquick -DskipRust --batch-mode -Dmaven.repo.local=../.m2/repository
pushImages: ${{ inputs.pushImages }}
context: ./scripts/cli/environment
pushTag: ${{ needs.setup.outputs.envTag }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/maven-full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
run: mvn de.qaware.maven:go-offline-maven-plugin:resolve-dependencies -Dmaven.repo.local=${{ runner.temp }}/.m2/repository
working-directory: ./java
- name: Build with Maven
run: mvn --batch-mode verify -Pquick -T 1C -Dmaven.repo.local=${{ runner.temp }}/.m2/repository
run: mvn --batch-mode verify -Pquick -T 1C -DskipRust -Dmaven.repo.local=${{ runner.temp }}/.m2/repository
working-directory: ./java
- name: Validate properties templates are up to date
working-directory: ./java
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ __pycache__
python/build/
python/src/sleeper.egg-info/
python/env/

101 changes: 101 additions & 0 deletions NOTICES
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,107 @@ s3fs:

- The 3-Clause BSD License

Sleeper contains Rust code. This has the following dependencies.

Rust Object Store (https://github.com/apache/arrow-rs/tree/master/object_store)

- Apache License, Version 2.0

color-eyre (https://github.com/yaahc/color-eyre)

- Apache License, Version 2.0

Tokio (https://github.com/tokio-rs/tokio)

- MIT License

owo-colors (https://github.com/jam1garner/owo-colors)

- MIT License

thiserror (https://github.com/dtolnay/thiserror)

- Apache License, Version 2.0

log (https://github.com/rust-lang/log)

- Apache License, Version 2.0

env_logger (https://github.com/rust-cli/env_logger/)

- Apache License, Version 2.0

human-panic (https://github.com/rust-cli/human-panic)

- Apache License, Version 2.0

clap (https://github.com/clap-rs/clap)

- Apache License, Version 2.0

libc (https://github.com/rust-lang/libc)

- Apache License, Version 2.0

arrow (https://github.com/apache/arrow-rs)

- Apache License, Version 2.0

futures (https://github.com/rust-lang/futures-rs)

- MIT License

itertools (https://github.com/rust-itertools/itertools)

- Apache License, Version 2.0

object_store (https://github.com/apache/arrow-rs/tree/master/object_store)

- Apache License, Version 2.0

aws-config (https://github.com/awslabs/smithy-rs)

- Apache License, Version 2.0

aws-credentials (https://github.com/awslabs/smithy-rs)

- Apache License, Version 2.0

aws-types (https://github.com/awslabs/smithy-rs)

- Apache License, Version 2.0

url (https://github.com/servo/rust-url)

- Apache License, Version 2.0

bytes (https://github.com/tokio-rs/bytes)

- MIT License

tokio-test (https://github.com/tokio-rs/tokio)

- MIT License

chrono (https://github.com/chronotope/chrono)

- Apache License, Version 2.0

num-format (https://github.com/bcmyers/num-format)

- Apache License, Version 2.0

cxx (https://github.com/dtolnay/cxx)

- Apache License, Version 2.0

datasketches-cpp (https://github.com/apache/datasketches-cpp)

- Apache License, Version 2.0

git2 (https://github.com/rust-lang/git2-rs)

- Apache License, Version 2.0


The build pipeline uses the following GitHub Actions from the marketplace.
Expand Down
35 changes: 23 additions & 12 deletions docs/11-dev-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ You will need the following software:
* [Java 11/17](https://openjdk.java.net/install/)
* [Maven](https://maven.apache.org/): Tested with v3.8.6
* [NodeJS / NPM](https://github.com/nvm-sh/nvm#installing-and-updating): Tested with NodeJS v16.16.0 and npm v8.11.0
* [Rust](https://rustup.rs/): Tested with Rust v1.77
* [Cross-rs](https://github.com/cross-rs/cross)

## Building

Expand Down Expand Up @@ -122,6 +124,15 @@ mvn clean install -Pquick

Removing the '-Pquick' option will cause the unit and integration tests to run.

### Disabling Rust component

You can disable the building of the Rust modules with:

```bash
cd java
mvn clean install -Pquick -DskipRust=true
```

## Using the codebase

The codebase is structured around the components explained in the [design document](12-design.md). The elements of the
Expand All @@ -141,19 +152,19 @@ For VS Code there's [a separate setup guide](/.vscode/README.md).

For IntelliJ, these settings are available to import:

- Code style scheme at [code-style/intellij-style.xml](/code-style/intellij-style.xml)
- Inspection profile at [code-style/intellij-inspection-profile.xml](/code-style/intellij-inspection-profile.xml)
- Copyright profile for license header
* Code style scheme at [code-style/intellij-style.xml](/code-style/intellij-style.xml)
* Inspection profile at [code-style/intellij-inspection-profile.xml](/code-style/intellij-inspection-profile.xml)
* Copyright profile for license header
at [code-style/intellij-copyright-profile.xml](/code-style/intellij-copyright-profile.xml)
- Checkstyle plugin settings in [code-style/checkstyle-idea](/code-style/checkstyle-idea)
* Checkstyle plugin settings in [code-style/checkstyle-idea](/code-style/checkstyle-idea)

For Eclipse, these settings are available to import:

- Code style at [code-style/eclipse-style.xml](/code-style/eclipse-style.xml)
- Import order at [code-style/eclipse-import-order.importorder](/code-style/eclipse-import-order.importorder)
- License header at [code-style/licenseHeader.txt](/code-style/licenseHeader.txt)
- Code templates at [code-style/eclipse-codetemplates.xml](/code-style/eclipse-codetemplates.xml)
- Editor templates at [code-style/eclipse-templates.xml](/code-style/eclipse-templates.xml)
* Code style at [code-style/eclipse-style.xml](/code-style/eclipse-style.xml)
* Import order at [code-style/eclipse-import-order.importorder](/code-style/eclipse-import-order.importorder)
* License header at [code-style/licenseHeader.txt](/code-style/licenseHeader.txt)
* Code templates at [code-style/eclipse-codetemplates.xml](/code-style/eclipse-codetemplates.xml)
* Editor templates at [code-style/eclipse-templates.xml](/code-style/eclipse-templates.xml)

### Linting

Expand All @@ -173,7 +184,7 @@ We try to ensure that all classes have Javadoc. Most methods should also have Ja
getters and setters can be skipped unless there's something important to know.

See Oracle's standards for Javadoc:
https://www.oracle.com/technical-resources/articles/java/javadoc-tool.html
<https://www.oracle.com/technical-resources/articles/java/javadoc-tool.html>

Note that the first sentence in a Javadoc comment will be used as a summary fragment in generated documentation. This
should not contain any links or formatting, to read normally as an item in a list.
Expand Down Expand Up @@ -244,8 +255,8 @@ When deploying multiple instances (or running multiple system tests), many log g
it difficult to find the logs you need to view. This script will delete any log groups that meet all of the following
criteria:

- Its name does not contain the name of any deployed CloudFormation stack
- Either it's empty, or it has no retention period and is older than 30 days
* Its name does not contain the name of any deployed CloudFormation stack
* Either it's empty, or it has no retention period and is older than 30 days

This can be used to limit the number of log groups in your AWS account, particularly if all your log groups are
deployed by the CDK or CloudFormation, with the stack name in the log group name.
Expand Down
4 changes: 4 additions & 0 deletions example/full/instance.properties
Original file line number Diff line number Diff line change
Expand Up @@ -940,6 +940,10 @@ sleeper.default.table.compaction.strategy.sizeratio.ratio=3
# concurrently per partition. It can be overridden on a per-table basis.
sleeper.default.table.compaction.strategy.sizeratio.max.concurrent.jobs.per.partition=100000

# Select what compaction method to use on a table. Current options are JAVA and RUST. Rust compaction
# support is experimental.
sleeper.default.table.compaction.method=JAVA


## The following properties relate to queries.

Expand Down
4 changes: 4 additions & 0 deletions example/full/table.properties
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ sleeper.table.compaction.strategy.sizeratio.ratio=3
# concurrently per partition.
sleeper.table.compaction.strategy.sizeratio.max.concurrent.jobs.per.partition=2147483647

# Select what compaction method to use on a table. Current options are JAVA and RUST. Rust compaction
# support is experimental.
sleeper.table.compaction.method=JAVA


## The following table properties relate to storing and retrieving metadata for tables.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ public int hashCode() {

@Override
public String toString() {
return "OnPushPathsDiff{" +
return "OnPullRequestPathsDiff{" +
"expected=" + expected +
", actual=" + actual +
", missingEntries=" + missingEntries +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
import java.util.stream.Stream;

public interface CompactionJobStatusStore {

CompactionJobStatusStore NONE = new CompactionJobStatusStore() {
};

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*
* Copyright 2022-2024 Crown Copyright
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package sleeper.compaction.job;

import sleeper.core.record.process.RecordsProcessed;

/**
 * Runs a compaction job. This declares a single abstract method, so an implementation can be
 * supplied as a lambda or method reference.
 * <p>
 * NOTE(review): this also inherits from {@link CompactionRunnerDetails}, which is not visible here;
 * presumably that interface only contributes default methods, since this type is annotated as a
 * functional interface. Confirm against its declaration.
 */
@FunctionalInterface
public interface CompactionRunner extends CompactionRunnerDetails {
/**
 * Performs the compaction described by the given job.
 *
 * @param  job       the compaction job to run
 * @return           a {@link RecordsProcessed} for the run (presumably a summary of the records handled
 *                   during the compaction; verify against that class)
 * @throws Exception if the compaction cannot be completed; the declaration does not narrow the
 *                   failure type, so callers must be prepared to handle any exception
 */
RecordsProcessed compact(CompactionJob job) throws Exception;
}
Loading

0 comments on commit 2427ac8

Please sign in to comment.