From 1d2700818474c008eaa324ac1b5c49c9d2857298 Mon Sep 17 00:00:00 2001
From: Nam Bui
Date: Fri, 15 May 2020 00:39:54 +0200
Subject: [PATCH] [BEAM-9876] Migrate the Beam website from Jekyll to Hugo to
 enable localization of the site content (#11554)

[BEAM-9876] Migrate the Beam website from Jekyll to Hugo to enable
localization of the site content (#11554)

* Moved files from src to www

* Applied changes
  - Migrated Jekyll to Hugo
  - Setup Docsy themes
  - Fixed github_sample
  - Added Works With logos
  - Sync new update
  - Adopted to reviews & Updated content
  - Excluded website build files
  - Added Apache License
  - Docker removed root
  - Modified test link checker
  - Latest sync

* Cleaned unused Jekyll files

* Adapted to reviews

* Fixes based on diff script

* Corrected path flink_runner.gradle
---
 .gitmodules | 3 +
 build.gradle | 13 +-
 runners/flink/flink_runner.gradle | 2 +-
 website/.gitignore | 10 +
 website/CONTRIBUTE.md | 394 ++++
 website/Dockerfile | 91 +-
 website/Gemfile | 38 -
 website/Gemfile.lock | 94 -
 website/README.md | 42 +-
 website/Rakefile | 25 -
 website/_config.yml | 70 -
 .../append_index_html_to_internal_links.py | 24 +
 website/build.gradle | 98 +-
 website/src/_data/authors.yml | 157 --
 website/src/_data/capability-matrix.yml | 1708 -----------------
 website/src/_data/meetings.yml | 39 -
 website/src/_includes/button-pydoc.md | 23 -
 website/src/_includes/button.md | 21 -
 website/src/_includes/buttons-code-snippet.md | 43 -
 .../src/_includes/capability-matrix-common.md | 20 -
 .../capability-matrix-row-summary.md | 14 -
 website/src/_includes/capability-matrix.md | 48 -
 website/src/_includes/footer.html | 66 -
 website/src/_includes/head.html | 42 -
 website/src/_includes/icon-github.svg | 19 -
 website/src/_includes/icon-twitter.svg | 19 -
 website/src/_includes/page-toc.html | 88 -
 .../_includes/section-menu/contribute.html | 45 -
 .../_includes/section-menu/documentation.html | 307 ---
 .../src/_includes/section-menu/roadmap.html | 49 -
 .../src/_includes/section-menu/runners.html | 23 -
 website/src/_includes/section-menu/sdks.html | 109 --
 website/src/_layouts/post.html | 32 -
 website/src/_layouts/section.html | 38 -
 website/src/_layouts/v2home.html | 217 ---
 website/src/blog/index.md | 51 -
 website/src/coming-soon.md | 22 -
 website/src/community/logos.md | 80 -
 website/src/documentation/index.md | 70 -
 .../documentation/transforms/java/index.md | 82 -
 .../transforms/python/aggregation/latest.md | 26 -
 .../documentation/transforms/python/index.md | 86 -
 website/src/feed.xml | 39 -
 website/src/index.md | 176 --
 website/src/v2/index.md | 51 -
 website/www/build_github_samples.sh | 41 +
 website/www/check-links.sh | 93 +
 website/www/package.json | 19 +
 .../site/archetypes/blog.md} | 9 +-
 .../site/archetypes/default.md} | 4 +-
 .../site/assets/scss}/_bootstrap.scss | 0
 .../site/assets/scss}/_breakpoints.sass | 0
 .../site/assets/scss}/_button.sass | 0
 .../site/assets/scss}/_cards.sass | 0
 .../_sass => www/site/assets/scss}/_ctas.sass | 0
 .../site/assets/scss}/_footer.sass | 0
 .../site/assets/scss}/_global.sass | 0
 .../site/assets/scss}/_graphic.sass | 0
 .../site/assets/scss}/_header.sass | 0
 .../_sass => www/site/assets/scss}/_hero.sass | 0
 .../site/assets/scss}/_layout.scss | 0
 .../site/assets/scss}/_logos.sass | 0
 .../site/assets/scss}/_navbar.sass | 0
 .../site/assets/scss}/_page-nav.sass | 9 +
 .../site/assets/scss}/_pillars.sass | 0
 .../site/assets/scss}/_section-nav.sass | 0
 .../assets/scss}/_syntax-highlighting.scss | 0
 .../www/site/assets/scss/_table-wrapper.sass | 24 +
 .../site/assets/scss}/_toggler-nav.scss | 0
.../_sass => www/site/assets/scss}/_type.sass | 0 .../_sass => www/site/assets/scss}/_vars.sass | 0 .../site/assets/scss}/bootstrap/_alerts.scss | 0 .../site/assets/scss}/bootstrap/_badges.scss | 0 .../assets/scss}/bootstrap/_breadcrumbs.scss | 0 .../scss}/bootstrap/_button-groups.scss | 0 .../site/assets/scss}/bootstrap/_buttons.scss | 0 .../assets/scss}/bootstrap/_carousel.scss | 0 .../site/assets/scss}/bootstrap/_close.scss | 0 .../site/assets/scss}/bootstrap/_code.scss | 0 .../bootstrap/_component-animations.scss | 0 .../assets/scss}/bootstrap/_dropdowns.scss | 0 .../site/assets/scss}/bootstrap/_forms.scss | 0 .../assets/scss}/bootstrap/_glyphicons.scss | 0 .../site/assets/scss}/bootstrap/_grid.scss | 0 .../assets/scss}/bootstrap/_input-groups.scss | 0 .../assets/scss}/bootstrap/_jumbotron.scss | 0 .../site/assets/scss}/bootstrap/_labels.scss | 0 .../assets/scss}/bootstrap/_list-group.scss | 0 .../site/assets/scss}/bootstrap/_media.scss | 0 .../site/assets/scss}/bootstrap/_mixins.scss | 0 .../site/assets/scss}/bootstrap/_modals.scss | 0 .../site/assets/scss}/bootstrap/_navbar.scss | 0 .../site/assets/scss}/bootstrap/_navs.scss | 0 .../assets/scss}/bootstrap/_normalize.scss | 0 .../site/assets/scss}/bootstrap/_pager.scss | 0 .../assets/scss}/bootstrap/_pagination.scss | 0 .../site/assets/scss}/bootstrap/_panels.scss | 0 .../assets/scss}/bootstrap/_popovers.scss | 0 .../site/assets/scss}/bootstrap/_print.scss | 0 .../scss}/bootstrap/_progress-bars.scss | 0 .../scss}/bootstrap/_responsive-embed.scss | 0 .../bootstrap/_responsive-utilities.scss | 0 .../assets/scss}/bootstrap/_scaffolding.scss | 0 .../site/assets/scss}/bootstrap/_tables.scss | 0 .../site/assets/scss}/bootstrap/_theme.scss | 0 .../assets/scss}/bootstrap/_thumbnails.scss | 0 .../site/assets/scss}/bootstrap/_tooltip.scss | 0 .../site/assets/scss}/bootstrap/_type.scss | 0 .../assets/scss}/bootstrap/_utilities.scss | 0 .../assets/scss}/bootstrap/_variables.scss | 0 .../site/assets/scss}/bootstrap/_wells.scss | 0 .../scss}/bootstrap/mixins/_alerts.scss | 0 .../bootstrap/mixins/_background-variant.scss | 0 .../bootstrap/mixins/_border-radius.scss | 0 .../scss}/bootstrap/mixins/_buttons.scss | 0 .../scss}/bootstrap/mixins/_center-block.scss | 0 .../scss}/bootstrap/mixins/_clearfix.scss | 0 .../assets/scss}/bootstrap/mixins/_forms.scss | 0 .../scss}/bootstrap/mixins/_gradients.scss | 0 .../bootstrap/mixins/_grid-framework.scss | 0 .../assets/scss}/bootstrap/mixins/_grid.scss | 0 .../scss}/bootstrap/mixins/_hide-text.scss | 0 .../assets/scss}/bootstrap/mixins/_image.scss | 0 .../scss}/bootstrap/mixins/_labels.scss | 0 .../scss}/bootstrap/mixins/_list-group.scss | 0 .../scss}/bootstrap/mixins/_nav-divider.scss | 0 .../bootstrap/mixins/_nav-vertical-align.scss | 0 .../scss}/bootstrap/mixins/_opacity.scss | 0 .../scss}/bootstrap/mixins/_pagination.scss | 0 .../scss}/bootstrap/mixins/_panels.scss | 0 .../scss}/bootstrap/mixins/_progress-bar.scss | 0 .../scss}/bootstrap/mixins/_reset-filter.scss | 0 .../scss}/bootstrap/mixins/_reset-text.scss | 0 .../scss}/bootstrap/mixins/_resize.scss | 0 .../mixins/_responsive-visibility.scss | 0 .../assets/scss}/bootstrap/mixins/_size.scss | 0 .../scss}/bootstrap/mixins/_tab-focus.scss | 0 .../scss}/bootstrap/mixins/_table-row.scss | 0 .../bootstrap/mixins/_text-emphasis.scss | 0 .../bootstrap/mixins/_text-overflow.scss | 0 .../bootstrap/mixins/_vendor-prefixes.scss | 0 .../site/assets/scss}/capability-matrix.scss | 0 .../site/assets/scss/main.scss} | 4 +- website/www/site/config.toml | 112 ++ 
website/www/site/content/en/_index.md | 17 + .../content/en/blog/added-apex-runner.md} | 13 +- .../en/blog/adding-data-sources-to-sql.md} | 21 +- .../site/content/en/blog/beam-2.10.0.md} | 9 +- .../site/content/en/blog/beam-2.11.0.md} | 9 +- .../site/content/en/blog/beam-2.12.0.md} | 9 +- .../site/content/en/blog/beam-2.13.0.md} | 10 +- .../site/content/en/blog/beam-2.14.0.md} | 11 +- .../site/content/en/blog/beam-2.15.0.md} | 11 +- .../site/content/en/blog/beam-2.16.0.md} | 11 +- .../site/content/en/blog/beam-2.17.0.md} | 13 +- .../site/content/en/blog/beam-2.18.0.md} | 10 +- .../site/content/en/blog/beam-2.19.0.md} | 11 +- .../site/content/en/blog/beam-2.20.0.md} | 12 +- .../site/content/en/blog/beam-2.3.0.md} | 7 +- .../site/content/en/blog/beam-2.5.0.md} | 9 +- .../site/content/en/blog/beam-2.6.0.md} | 7 +- .../site/content/en/blog/beam-2.7.0.md} | 12 +- .../site/content/en/blog/beam-2.8.0.md} | 9 +- .../site/content/en/blog/beam-2.9.0.md} | 9 +- .../site/content/en/blog/beam-a-look-back.md} | 13 +- .../en/blog/beam-first-stable-release.md} | 7 +- .../site/content/en/blog/beam-graduates.md} | 9 +- .../site/content/en/blog/beam-has-a-logo.md} | 13 +- .../content/en/blog/beam-kata-release.md} | 15 +- .../site/content/en/blog/beam-kotlin.md} | 41 +- .../content/en/blog/beam-summit-aftermath.md} | 9 +- .../en/blog/beam-summit-digital-2020.md} | 9 +- .../en/blog/beam-summit-europe-2019.md} | 15 +- .../content/en/blog/beam-summit-europe.md} | 9 +- .../site/content/en/blog/beam-summit-site.md} | 8 +- .../site/content/en/blog/capability-matrix.md | 40 + .../site/content/en/blog/first-release.md} | 12 +- .../en/blog/flink-batch-runner-milestone.md} | 7 +- .../en/blog/graduation-media-recap.md} | 11 +- .../site/content/en/blog/gsoc-19.md} | 13 +- .../site/content/en/blog/looping-timers.md} | 19 +- .../en/blog/presentation-materials.md} | 12 +- .../content/en/blog/python-sdk-now-public.md} | 11 +- .../content/en/blog/python-sdk-release.md} | 15 +- .../review-input-streaming-connectors.md} | 45 +- .../site/content/en/blog/season-of-docs.md} | 9 +- .../site/content/en/blog/six-months.md} | 9 +- .../en/blog/splitAtFraction-method.md} | 7 +- .../site/content/en/blog/splittable-do-fn.md} | 59 +- .../content/en/blog/stateful-processing.md} | 72 +- .../en/blog/strata-hadoop-world-and-beam.md} | 16 +- .../site/content/en/blog/test-stream.md} | 67 +- .../content/en/blog/timely-processing.md} | 65 +- .../blog/where-is-my-pcollection-dot-map.md} | 9 +- .../site/content/en}/community/contact-us.md | 12 +- .../site/content/en}/community/in-person.md | 8 +- .../content/en}/community/integrations.md | 4 - .../www/site/content/en/community/logos.md | 45 + .../site/content/en}/community/policies.md | 3 - .../en}/community/presentation-materials.md | 5 +- .../content/en}/community/twitter-handle.md | 3 - .../content/en}/community/youtube-channel.md | 5 +- .../site/content/en/contribute/_index.md} | 57 +- .../en}/contribute/become-a-committer.md | 3 - .../content/en}/contribute/committer-guide.md | 7 +- .../content/en}/contribute/dependencies.md | 5 +- .../en}/contribute/design-documents.md | 8 +- .../en}/contribute/feature-branches.md | 8 +- .../site/content/en}/contribute/get-help.md | 9 +- .../content/en}/contribute/jira-priorities.md | 3 - .../en}/contribute/postcommits-guides.md | 3 - .../postcommits-policies-details.md | 3 - .../en}/contribute/postcommits-policies.md | 21 +- .../en}/contribute/precommit-policies.md | 3 - .../en}/contribute/precommit-triage-guide.md | 3 - 
.../en}/contribute/ptransform-style-guide.md | 48 +- .../en}/contribute/release-blocking.md | 4 +- .../content/en}/contribute/release-guide.md | 25 +- .../content/en}/contribute/runner-guide.md | 105 +- .../site/content/en}/contribute/team.md | 11 +- .../site/content/en}/contribute/testing.md | 6 +- .../site/content/en/documentation/_index.md | 67 + .../dsls/sql/calcite/aggregate-functions.md | 9 +- .../dsls/sql/calcite/data-types.md | 9 +- .../dsls/sql/calcite/lexical-structure.md | 8 +- .../dsls/sql/calcite/overview.md | 42 +- .../dsls/sql/calcite/query-syntax.md | 21 +- .../dsls/sql/calcite/scalar-functions.md | 27 +- .../sql/extensions/create-external-table.md | 20 +- .../dsls/sql/extensions/joins.md | 6 +- .../documentation/dsls/sql/extensions/set.md | 14 +- .../sql/extensions/user-defined-functions.md | 14 +- .../extensions/windowing-and-triggering.md | 6 +- .../en}/documentation/dsls/sql/overview.md | 22 +- .../en}/documentation/dsls/sql/shell.md | 14 +- .../en}/documentation/dsls/sql/walkthrough.md | 30 +- .../dsls/sql/zetasql/aggregate-functions.md | 7 +- .../sql/zetasql/conditional-expressions.md | 7 +- .../dsls/sql/zetasql/conversion-rules.md | 10 +- .../dsls/sql/zetasql/data-types.md | 92 +- .../documentation/dsls/sql/zetasql/lexical.md | 42 +- .../dsls/sql/zetasql/math-functions.md | 10 +- .../dsls/sql/zetasql/operators.md | 61 +- .../dsls/sql/zetasql/overview.md | 23 +- .../dsls/sql/zetasql/query-syntax.md | 153 +- .../dsls/sql/zetasql/string-functions.md | 7 +- .../documentation/dsls/sql/zetasql/syntax.md | 4 +- .../content/en}/documentation/io/built-in.md | 17 +- .../io/built-in/google-bigquery.md} | 397 ++-- .../en/documentation/io/built-in/hadoop.md} | 171 +- .../en/documentation/io/built-in/hcatalog.md} | 22 +- .../en/documentation/io/built-in/parquet.md} | 71 +- .../documentation/io/developing-io-java.md | 15 +- .../io/developing-io-overview.md | 26 +- .../documentation/io/developing-io-python.md | 51 +- .../content/en}/documentation/io/testing.md | 15 +- .../en}/documentation/patterns/custom-io.md | 15 +- .../documentation/patterns/custom-windows.md | 42 +- .../documentation/patterns/file-processing.md | 75 +- .../en}/documentation/patterns/overview.md | 21 +- .../patterns/pipeline-options.md | 29 +- .../en}/documentation/patterns/side-inputs.md | 39 +- .../pipelines/create-your-pipeline.md | 39 +- .../pipelines/design-your-pipeline.md | 47 +- .../pipelines/test-your-pipeline.md | 78 +- .../en}/documentation/programming-guide.md | 1182 ++++++------ .../resources/learning-resources.md | 6 +- .../resources/videos-and-podcasts.md | 12 +- .../content/en}/documentation/runners/apex.md | 8 +- .../runners/capability-matrix.md | 28 +- .../en}/documentation/runners/dataflow.md | 57 +- .../en}/documentation/runners/direct.md | 32 +- .../en}/documentation/runners/flink.md | 141 +- .../en}/documentation/runners/gearpump.md | 15 +- .../content/en}/documentation/runners/jet.md | 53 +- .../en}/documentation/runners/jstorm.md | 12 +- .../en}/documentation/runners/mapreduce.md | 10 +- .../content/en}/documentation/runners/nemo.md | 8 +- .../en}/documentation/runners/samza.md | 14 +- .../en}/documentation/runners/spark.md | 140 +- .../en}/documentation/runtime/environments.md | 21 +- .../en}/documentation/runtime/model.md | 31 +- .../runtime/sdk-harness-config.md | 7 +- .../documentation/sdks/feature-comparison.md | 4 +- .../site/content/en}/documentation/sdks/go.md | 6 +- .../documentation/sdks/java-dependencies.md | 8 +- .../en}/documentation/sdks/java-extensions.md | 4 +- 
.../en}/documentation/sdks/java-thirdparty.md | 4 +- .../content/en}/documentation/sdks/java.md | 20 +- .../en/documentation/sdks/java}/euphoria.md | 140 +- .../sdks/java/testing}/nexmark.md | 16 +- .../documentation/sdks/python-dependencies.md | 10 +- .../sdks/python-pipeline-dependencies.md | 4 +- .../documentation/sdks/python-streaming.md | 42 +- .../documentation/sdks/python-type-safety.md | 71 +- .../content/en}/documentation/sdks/python.md | 16 +- .../java/aggregation/approximatequantiles.md | 10 +- .../java/aggregation/approximateunique.md | 12 +- .../java/aggregation/cogroupbykey.md | 14 +- .../transforms/java/aggregation/combine.md | 20 +- .../java/aggregation/combinewithcontext.md | 12 +- .../transforms/java/aggregation/count.md | 10 +- .../transforms/java/aggregation/distinct.md | 10 +- .../transforms/java/aggregation/groupbykey.md | 12 +- .../java/aggregation/groupintobatches.md | 8 +- .../transforms/java/aggregation/hllcount.md | 28 +- .../transforms/java/aggregation/latest.md | 14 +- .../transforms/java/aggregation/max.md | 20 +- .../transforms/java/aggregation/mean.md | 20 +- .../transforms/java/aggregation/min.md | 12 +- .../transforms/java/aggregation/sample.md | 10 +- .../transforms/java/aggregation/sum.md | 16 +- .../transforms/java/aggregation/top.md | 8 +- .../transforms/java/elementwise}/filter.md | 18 +- .../java/elementwise}/flatmapelements.md | 10 +- .../transforms/java/elementwise}/keys.md | 16 +- .../transforms/java/elementwise}/kvswap.md | 16 +- .../java/elementwise}/mapelements.md | 20 +- .../transforms/java/elementwise}/pardo.md | 24 +- .../transforms/java/elementwise}/partition.md | 18 +- .../transforms/java/elementwise}/regex.md | 8 +- .../transforms/java/elementwise}/reify.md | 10 +- .../transforms/java/elementwise}/tostring.md | 8 +- .../transforms/java/elementwise}/values.md | 16 +- .../transforms/java/elementwise}/withkeys.md | 16 +- .../java/elementwise}/withtimestamps.md | 8 +- .../transforms/java/other/create.md | 6 +- .../transforms/java/other/flatten.md | 17 +- .../transforms/java/other/passert.md | 14 +- .../transforms/java/other/view.md | 10 +- .../transforms/java/other/window.md | 10 +- .../documentation/transforms/java/overview.md | 79 + .../aggregation/approximatequantiles.md | 3 - .../python/aggregation/approximateunique.md | 3 - .../python/aggregation/cogroupbykey.md | 13 +- .../python/aggregation/combineglobally.md | 17 +- .../python/aggregation/combinewithcontext.md | 3 - .../transforms/python/aggregation/count.md | 7 +- .../transforms/python/aggregation/distinct.md | 9 +- .../python/aggregation/groupbykey.md | 13 +- .../python/aggregation/groupintobatches.md | 3 - .../transforms/python/aggregation/latest.md | 23 + .../transforms/python/aggregation/max.md | 3 - .../transforms/python/aggregation/mean.md | 9 +- .../transforms/python/aggregation/min.md | 3 - .../transforms/python/aggregation/sample.md | 9 +- .../transforms/python/aggregation/sum.md | 3 - .../transforms/python/aggregation/top.md | 9 +- .../transforms/python/elementwise/filter.md | 129 +- .../transforms/python/elementwise/flatmap.md | 189 +- .../transforms/python/elementwise/keys.md | 34 +- .../transforms/python/elementwise/kvswap.md | 34 +- .../transforms/python/elementwise/map.md | 169 +- .../transforms/python/elementwise/pardo.md | 76 +- .../python/elementwise/partition.md | 76 +- .../transforms/python/elementwise/regex.md | 186 +- .../transforms/python/elementwise/reify.md | 5 +- .../transforms/python/elementwise/tostring.md | 70 +- 
.../transforms/python/elementwise/values.md | 34 +- .../transforms/python/elementwise/withkeys.md | 3 - .../python/elementwise/withtimestamps.md | 76 +- .../transforms/python/other/create.md | 7 +- .../transforms/python/other/flatten.md | 12 +- .../transforms/python/other/passert.md | 3 - .../transforms/python/other/reshuffle.md | 7 +- .../transforms/python/other/view.md | 3 - .../transforms/python/other/windowinto.md | 11 +- .../transforms/python/overview.md | 83 + .../site/content/en/get-started/_index.md} | 19 +- .../content/en}/get-started/beam-overview.md | 44 +- .../site/content/en}/get-started/downloads.md | 15 +- .../en}/get-started/mobile-gaming-example.md | 238 +-- .../content/en}/get-started/quickstart-go.md | 45 +- .../en}/get-started/quickstart-java.md | 242 +-- .../content/en}/get-started/quickstart-py.md | 125 +- .../en}/get-started/try-apache-beam.md | 74 +- .../en}/get-started/wordcount-example.md | 922 ++++----- .../site/content/en/privacy_policy/_index.md} | 2 - .../site/content/en/roadmap/_index.md} | 21 +- .../site/content/en}/roadmap/apex-runner.md | 5 +- .../content/en}/roadmap/connectors-go-sdk.md | 3 - .../en}/roadmap/connectors-java-sdk.md | 3 - .../en}/roadmap/connectors-multi-sdk.md | 3 - .../en}/roadmap/connectors-python-sdk.md | 3 - .../content/en}/roadmap/dataflow-runner.md | 5 +- .../site/content/en}/roadmap/euphoria.md | 6 +- .../site/content/en}/roadmap/flink-runner.md | 5 +- .../content/en}/roadmap/gearpump-runner.md | 5 +- .../site/content/en}/roadmap/go-sdk.md | 5 +- .../site/content/en}/roadmap/java-sdk.md | 3 - .../site/content/en}/roadmap/nemo-runner.md | 6 +- .../site/content/en}/roadmap/portability.md | 16 +- .../site/content/en}/roadmap/python-sdk.md | 3 - .../site/content/en}/roadmap/samza-runner.md | 5 +- .../site/content/en}/roadmap/spark-runner.md | 3 - .../site/content/en}/roadmap/sql.md | 5 +- .../content/en}/security/CVE-2020-1929.md | 3 +- .../site/content/en/security/_index.md} | 4 +- website/www/site/data/authors.yml | 156 ++ website/www/site/data/capability_matrix.yaml | 1706 ++++++++++++++++ .../data/capability_matrix_snapshot.yaml} | 444 ++--- .../site/data/en/cards.yaml} | 15 +- website/www/site/data/en/pillars.yaml | 18 + .../{src/_data => www/site/data}/logos.yml | 13 +- website/www/site/data/meetings.yml | 38 + website/www/site/data/works_with.yaml | 30 + website/www/site/i18n/blog/en.yaml | 18 + website/www/site/i18n/footer/en.yaml | 58 + website/www/site/i18n/home/en.yaml | 44 + website/www/site/i18n/navbar/en.yaml | 42 + website/www/site/layouts/_default/baseof.html | 30 + website/www/site/layouts/blog/baseof.html | 25 + website/www/site/layouts/blog/list.html | 50 + website/www/site/layouts/blog/single.html | 36 + .../www/site/layouts/community/baseof.html | 40 + .../www/site/layouts/contribute/baseof.html | 40 + .../site/layouts/documentation/baseof.html | 40 + .../www/site/layouts/get-started/baseof.html | 40 + website/www/site/layouts/index.feed.xml | 53 + website/www/site/layouts/index.html | 150 ++ .../www/site/layouts/languages/baseof.html | 41 + website/www/site/layouts/partials/footer.html | 63 + website/www/site/layouts/partials/head.html | 56 + .../site/layouts/partials}/header.html | 40 +- .../partials/section-menu/en}/community.html | 22 +- .../partials/section-menu/en/contribute.html | 45 + .../section-menu/en/documentation.html | 307 +++ .../section-menu/en}/get-started.html | 22 +- .../partials/section-menu/en/roadmap.html | 49 + .../partials/section-menu/en/runners.html | 23 + 
.../partials/section-menu/en/sdks.html | 109 ++ .../site/layouts/privacy_policy/baseof.html} | 26 +- website/www/site/layouts/roadmap/baseof.html | 40 + website/www/site/layouts/runners/baseof.html | 40 + website/www/site/layouts/security/baseof.html | 41 + .../site/layouts/shortcodes/button-pydoc.html | 30 + .../shortcodes/buttons-code-snippet.html | 32 + .../shortcodes/capability-matrix-common.html | 19 + .../layouts/shortcodes/capability-matrix.html | 98 + .../site/layouts/shortcodes/colors/png.html | 33 + .../site/layouts/shortcodes/colors/svg.html | 28 + .../flink_java_pipeline_options.html | 11 +- .../flink_python_pipeline_options.html | 29 +- .../layouts/shortcodes/github_sample.html | 15 + .../site/layouts/shortcodes/highlight.html | 31 + .../layouts/shortcodes/language-switcher.html | 28 + .../layouts/shortcodes/localstorage.html} | 8 +- .../site/layouts/shortcodes/paragraph.html} | 23 +- .../www/site/layouts/shortcodes/table.html | 17 + .../site/layouts/shortcodes/toc.html} | 6 +- website/{src => www/site/static}/.htaccess | 0 .../site/static}/downloads/beam-doap.rdf | 0 .../site/static}/downloads/logos.zip | Bin .../site/static}/downloads/palette.pdf | Bin .../glyphicons-halflings-regular.eot | Bin .../glyphicons-halflings-regular.svg | 0 .../glyphicons-halflings-regular.ttf | Bin .../glyphicons-halflings-regular.woff | Bin .../glyphicons-halflings-regular.woff2 | Bin .../static}/images/apache_logo_circle.svg | 0 .../site/static}/images/beam_architecture.png | Bin .../site/static}/images/beam_logo_circle.svg | 0 .../site/static}/images/beam_logo_navbar.png | Bin .../site/static}/images/beam_logo_s.png | Bin .../static}/images/beam_sql_dsl_workflow.png | Bin .../images/blog/2017-look-back/timeline.png | Bin .../site/static}/images/blog/Facebook-AD.png | Bin .../images/blog/IMG_20160927_170455.jpg | Bin .../images/blog/IMG_20160927_170956.jpg | Bin .../site/static}/images/blog/SoD.png | Bin .../beam-kata/beam-kata-intellij-edu-1.png | Bin .../beam-kata/beam-kata-intellij-edu-2.png | Bin .../beam-kata/beam-kata-pycharm-edu-1.png | Bin .../beam-kata/beam-kata-pycharm-edu-2.png | Bin .../blog/beamsummit/beamsummit-digital.png | Bin .../site/static}/images/blog/kotlin.png | Bin .../images/blog/simple-wordcount-pipeline.png | Bin .../images/blog/splittable-do-fn/blocks.png | Bin .../splittable-do-fn/jdbcio-expansion.png | Bin .../blog/splittable-do-fn/kafka-splitting.png | Bin .../blog/splittable-do-fn/restrictions.png | Bin .../splittable-do-fn/transform-expansion.png | Bin .../stateful-processing/assign-indices.png | Bin .../blog/stateful-processing/combinefn.png | Bin .../stateful-processing/combiner-lifting.png | Bin .../stateful-processing/pardo-and-gbk.png | Bin .../blog/stateful-processing/pipeline.png | Bin .../images/blog/stateful-processing/plaid.png | Bin .../stateful-processing/stateful-dofn.png | Bin .../stateful-processing/stateful-pardo.png | Bin .../blog/test-stream/elements-all-on-time.png | Bin .../test-stream/elements-droppably-late.png | Bin .../test-stream/elements-observably-late.png | Bin .../elements-processing-speculative.png | Bin .../elements-unobservably-late.png | Bin .../timely-processing/BatchedRpcExpiry.png | Bin .../timely-processing/BatchedRpcStale.png | Bin .../timely-processing/BatchedRpcState.png | Bin .../blog/timely-processing/CombinePerKey.png | Bin .../images/blog/timely-processing/ParDo.png | Bin .../blog/timely-processing/StateAndTimers.png | Bin .../blog/timely-processing/UnifiedModel.png | Bin .../timely-processing/WindowingChoices.png | Bin 
.../site/static}/images/card_border.svg | 0 .../site/static}/images/cards_bg.svg | 0 .../static}/images/contribution-diversity.png | Bin .../static}/images/contribution-guide-1.png | Bin ...esign-your-pipeline-additional-outputs.svg | 0 .../images/design-your-pipeline-flatten.svg | 0 .../images/design-your-pipeline-join.svg | 0 .../images/design-your-pipeline-linear.svg | 0 ...gn-your-pipeline-multiple-pcollections.svg | 0 .../static}/images/dofn-sequence-diagram.svg | 0 .../images/execution_model_bundling.svg | 0 ...ecution_model_bundling_coupled_failure.svg | 0 .../images/execution_model_bundling_gantt.svg | 0 .../execution_model_bundling_gantt_max.svg | 0 .../images/execution_model_bundling_multi.svg | 0 .../execution_model_bundling_multi_gantt.svg | 0 .../images/execution_model_failure_retry.svg | 0 .../static}/images/external-link-icon.png | Bin .../site/static}/images/favicon.ico | Bin .../static}/images/fixed-time-windows.png | Bin .../static}/images/gaming-example-basic.png | Bin .../gaming-example-event-time-narrow.gif | Bin .../gaming-example-proc-time-narrow.gif | Bin .../images/gaming-example-session-windows.png | Bin .../gaming-example-team-scores-narrow.gif | Bin .../site/static}/images/gaming-example.gif | Bin .../site/static}/images/hero_bg.svg | 0 .../site/static}/images/hero_bg_flat.svg | 0 .../site/static}/images/logo_apex.png | Bin .../site/static}/images/logo_flink.png | Bin .../site/static}/images/logo_gearpump.png | Bin .../site/static}/images/logo_google_cloud.png | Bin .../site/static}/images/logo_samza.png | Bin .../site/static}/images/logo_spark.png | Bin .../beam-logo-3-color-name-bottom-100.png | Bin .../beam-logo-3-color-name-bottom-1000.png | Bin .../beam-logo-3-color-name-bottom-200.png | Bin .../beam-logo-3-color-name-bottom-500.png | Bin .../beam-logo-3-color-name-bottom.svg | 0 .../beam-logo-3-color-name-right-100.png | Bin .../beam-logo-3-color-name-right-1000.png | Bin .../beam-logo-3-color-name-right-200.png | Bin .../beam-logo-3-color-name-right-500.png | Bin .../beam-logo-3-color-name-right.svg | 0 .../beam-logo-3-color-nameless-100.png | Bin .../beam-logo-3-color-nameless-1000.png | Bin .../beam-logo-3-color-nameless-200.png | Bin .../beam-logo-3-color-nameless-500.png | Bin .../nameless/beam-logo-3-color-nameless.svg | 0 .../beam-logo-bw-name-bottom-100.png | Bin .../beam-logo-bw-name-bottom-1000.png | Bin .../beam-logo-bw-name-bottom-200.png | Bin .../beam-logo-bw-name-bottom-500.png | Bin .../name-bottom/beam-logo-bw-name-bottom.svg | 0 .../beam-logo-bw-name-right-100.png | Bin .../beam-logo-bw-name-right-1000.png | Bin .../beam-logo-bw-name-right-200.png | Bin .../beam-logo-bw-name-right-500.png | Bin .../bw/name-right/beam-logo-bw-name-right.svg | 0 .../bw/nameless/beam-logo-bw-nameless-100.png | Bin .../nameless/beam-logo-bw-nameless-1000.png | Bin .../bw/nameless/beam-logo-bw-nameless-200.png | Bin .../bw/nameless/beam-logo-bw-nameless-500.png | Bin .../bw/nameless/beam-logo-bw-nameless.svg | 0 .../beam-logo-full-color-name-bottom-100.png | Bin .../beam-logo-full-color-name-bottom-1000.png | Bin .../beam-logo-full-color-name-bottom-200.png | Bin .../beam-logo-full-color-name-bottom-500.png | Bin .../beam-logo-full-color-name-bottom.svg | 0 .../beam-logo-full-color-name-right-100.png | Bin .../beam-logo-full-color-name-right-1000.png | Bin .../beam-logo-full-color-name-right-200.png | Bin .../beam-logo-full-color-name-right-500.png | Bin .../beam-logo-full-color-name-right.svg | 0 .../beam-logo-full-color-nameless-100.png | Bin 
.../beam-logo-full-color-nameless-1000.png | Bin .../beam-logo-full-color-nameless-200.png | Bin .../beam-logo-full-color-nameless-500.png | Bin .../beam-logo-full-color-nameless.svg | 0 .../static}/images/logos/runners/apex.png | Bin .../static}/images/logos/runners/dataflow.png | Bin .../static}/images/logos/runners/flink.png | Bin .../static}/images/logos/runners/gearpump.png | Bin .../site/static}/images/logos/runners/jet.png | Bin .../static}/images/logos/runners/jstorm.png | Bin .../static}/images/logos/runners/samza.png | Bin .../static}/images/logos/runners/spark.png | Bin .../site/static}/images/logos/sdks/go.png | Bin .../site/static}/images/logos/sdks/java.png | Bin .../site/static}/images/logos/sdks/python.png | Bin .../site/static}/images/logos/sdks/scala.png | Bin .../static}/images/precommit_dashboard.png | Bin .../images/precommit_graph_queuing_time.png | Bin .../site/static}/images/release-guide-1.png | Bin .../images/resources/se-radio-podcast.png | Bin .../images/resources/streaming-101.png | Bin .../images/resources/streaming-102.png | Bin .../site/static}/images/session-windows.png | Bin .../static}/images/sliding-time-windows.png | Bin .../images/source-sequence-diagram.svg | 0 .../images/standard-vs-dynamic-sessions.png | Bin .../static}/images/trigger-accumulation.png | Bin .../images/unwindowed-pipeline-bounded.svg | 0 .../images/windowing-pipeline-bounded.svg | 0 .../images/windowing-pipeline-unbounded.svg | 0 .../static}/images/wordcount-pipeline.svg | 0 .../site/static}/js/bootstrap-sprockets.js | 0 .../{src => www/site/static}/js/bootstrap.js | 0 .../site/static}/js/bootstrap.min.js | 0 .../site/static}/js/bootstrap/affix.js | 0 .../site/static}/js/bootstrap/alert.js | 0 .../site/static}/js/bootstrap/button.js | 0 .../site/static}/js/bootstrap/carousel.js | 0 .../site/static}/js/bootstrap/collapse.js | 0 .../site/static}/js/bootstrap/dropdown.js | 0 .../site/static}/js/bootstrap/modal.js | 0 .../site/static}/js/bootstrap/popover.js | 0 .../site/static}/js/bootstrap/scrollspy.js | 0 .../site/static}/js/bootstrap/tab.js | 0 .../site/static}/js/bootstrap/tooltip.js | 0 .../site/static}/js/bootstrap/transition.js | 0 .../{src => www/site/static}/js/fix-menu.js | 0 .../site/static}/js/language-switch.js | 0 .../{src => www/site/static}/js/page-nav.js | 0 .../site/static}/js/section-nav.js | 7 +- website/www/site/themes/docsy | 1 + website/www/yarn.lock | 828 ++++++++ 630 files changed, 10254 insertions(+), 9064 deletions(-) create mode 100644 .gitmodules create mode 100644 website/CONTRIBUTE.md delete mode 100644 website/Gemfile delete mode 100644 website/Gemfile.lock delete mode 100644 website/Rakefile delete mode 100644 website/_config.yml delete mode 100644 website/src/_data/authors.yml delete mode 100644 website/src/_data/capability-matrix.yml delete mode 100644 website/src/_data/meetings.yml delete mode 100644 website/src/_includes/button-pydoc.md delete mode 100644 website/src/_includes/button.md delete mode 100644 website/src/_includes/buttons-code-snippet.md delete mode 100644 website/src/_includes/capability-matrix-common.md delete mode 100644 website/src/_includes/capability-matrix-row-summary.md delete mode 100644 website/src/_includes/capability-matrix.md delete mode 100644 website/src/_includes/footer.html delete mode 100644 website/src/_includes/head.html delete mode 100644 website/src/_includes/icon-github.svg delete mode 100644 website/src/_includes/icon-twitter.svg delete mode 100644 website/src/_includes/page-toc.html delete mode 100644 
website/src/_includes/section-menu/contribute.html delete mode 100644 website/src/_includes/section-menu/documentation.html delete mode 100644 website/src/_includes/section-menu/roadmap.html delete mode 100644 website/src/_includes/section-menu/runners.html delete mode 100644 website/src/_includes/section-menu/sdks.html delete mode 100644 website/src/_layouts/post.html delete mode 100644 website/src/_layouts/section.html delete mode 100644 website/src/_layouts/v2home.html delete mode 100644 website/src/blog/index.md delete mode 100644 website/src/coming-soon.md delete mode 100644 website/src/community/logos.md delete mode 100644 website/src/documentation/index.md delete mode 100644 website/src/documentation/transforms/java/index.md delete mode 100644 website/src/documentation/transforms/python/aggregation/latest.md delete mode 100644 website/src/documentation/transforms/python/index.md delete mode 100644 website/src/feed.xml delete mode 100644 website/src/index.md delete mode 100644 website/src/v2/index.md create mode 100755 website/www/build_github_samples.sh create mode 100755 website/www/check-links.sh create mode 100644 website/www/package.json rename website/{src/_includes/capability-matrix-row-full.md => www/site/archetypes/blog.md} (71%) rename website/{src/_includes/capability-matrix-row-blog.md => www/site/archetypes/default.md} (81%) rename website/{src/_sass => www/site/assets/scss}/_bootstrap.scss (100%) rename website/{src/_sass => www/site/assets/scss}/_breakpoints.sass (100%) rename website/{src/_sass => www/site/assets/scss}/_button.sass (100%) rename website/{src/_sass => www/site/assets/scss}/_cards.sass (100%) rename website/{src/_sass => www/site/assets/scss}/_ctas.sass (100%) rename website/{src/_sass => www/site/assets/scss}/_footer.sass (100%) rename website/{src/_sass => www/site/assets/scss}/_global.sass (100%) rename website/{src/_sass => www/site/assets/scss}/_graphic.sass (100%) rename website/{src/_sass => www/site/assets/scss}/_header.sass (100%) rename website/{src/_sass => www/site/assets/scss}/_hero.sass (100%) rename website/{src/_sass => www/site/assets/scss}/_layout.scss (100%) rename website/{src/_sass => www/site/assets/scss}/_logos.sass (100%) rename website/{src/_sass => www/site/assets/scss}/_navbar.sass (100%) rename website/{src/_sass => www/site/assets/scss}/_page-nav.sass (90%) rename website/{src/_sass => www/site/assets/scss}/_pillars.sass (100%) rename website/{src/_sass => www/site/assets/scss}/_section-nav.sass (100%) rename website/{src/_sass => www/site/assets/scss}/_syntax-highlighting.scss (100%) create mode 100644 website/www/site/assets/scss/_table-wrapper.sass rename website/{src/_sass => www/site/assets/scss}/_toggler-nav.scss (100%) rename website/{src/_sass => www/site/assets/scss}/_type.sass (100%) rename website/{src/_sass => www/site/assets/scss}/_vars.sass (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_alerts.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_badges.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_breadcrumbs.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_button-groups.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_buttons.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_carousel.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_close.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_code.scss (100%) rename 
website/{src/_sass => www/site/assets/scss}/bootstrap/_component-animations.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_dropdowns.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_forms.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_glyphicons.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_grid.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_input-groups.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_jumbotron.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_labels.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_list-group.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_media.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_mixins.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_modals.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_navbar.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_navs.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_normalize.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_pager.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_pagination.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_panels.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_popovers.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_print.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_progress-bars.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_responsive-embed.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_responsive-utilities.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_scaffolding.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_tables.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_theme.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_thumbnails.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_tooltip.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_type.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_utilities.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_variables.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/_wells.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_alerts.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_background-variant.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_border-radius.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_buttons.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_center-block.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_clearfix.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_forms.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_gradients.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_grid-framework.scss (100%) rename website/{src/_sass => 
www/site/assets/scss}/bootstrap/mixins/_grid.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_hide-text.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_image.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_labels.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_list-group.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_nav-divider.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_nav-vertical-align.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_opacity.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_pagination.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_panels.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_progress-bar.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_reset-filter.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_reset-text.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_resize.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_responsive-visibility.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_size.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_tab-focus.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_table-row.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_text-emphasis.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_text-overflow.scss (100%) rename website/{src/_sass => www/site/assets/scss}/bootstrap/mixins/_vendor-prefixes.scss (100%) rename website/{src/_sass => www/site/assets/scss}/capability-matrix.scss (100%) rename website/{src/css/site.scss => www/site/assets/scss/main.scss} (97%) create mode 100644 website/www/site/config.toml create mode 100644 website/www/site/content/en/_index.md rename website/{src/_posts/2017-01-09-added-apex-runner.md => www/site/content/en/blog/added-apex-runner.md} (90%) rename website/{src/_posts/2019-05-01-adding-data-sources-to-sql.md => www/site/content/en/blog/adding-data-sources-to-sql.md} (96%) rename website/{src/_posts/2019-02-15-beam-2.10.0.md => www/site/content/en/blog/beam-2.10.0.md} (96%) rename website/{src/_posts/2019-03-05-beam-2.11.0.md => www/site/content/en/blog/beam-2.11.0.md} (96%) rename website/{src/_posts/2019-04-25-beam-2.12.0.md => www/site/content/en/blog/beam-2.12.0.md} (94%) rename website/{src/_posts/2019-05-22-beam-2.13.0.md => www/site/content/en/blog/beam-2.13.0.md} (93%) rename website/{src/_posts/2019-07-31-beam-2.14.0.md => www/site/content/en/blog/beam-2.14.0.md} (96%) rename website/{src/_posts/2019-08-22-beam-2.15.0.md => www/site/content/en/blog/beam-2.15.0.md} (94%) rename website/{src/_posts/2019-10-07-beam-2.16.0.md => www/site/content/en/blog/beam-2.16.0.md} (96%) rename website/{src/_posts/2020-01-06-beam-2.17.0.md => www/site/content/en/blog/beam-2.17.0.md} (94%) rename website/{src/_posts/2020-01-13-beam-2.18.0.md => www/site/content/en/blog/beam-2.18.0.md} (97%) rename website/{src/_posts/2020-02-04-beam-2.19.0.md => www/site/content/en/blog/beam-2.19.0.md} (96%) rename website/{src/_posts/2020-04-15-beam-2.20.0.md => www/site/content/en/blog/beam-2.20.0.md} (96%) rename 
website/{src/_posts/2018-02-19-beam-2.3.0.md => www/site/content/en/blog/beam-2.3.0.md} (98%) rename website/{src/_posts/2018-06-26-beam-2.5.0.md => www/site/content/en/blog/beam-2.5.0.md} (97%) rename website/{src/_posts/2018-08-10-beam-2.6.0.md => www/site/content/en/blog/beam-2.6.0.md} (97%) rename website/{src/_posts/2018-10-03-beam-2.7.0.md => www/site/content/en/blog/beam-2.7.0.md} (90%) rename website/{src/_posts/2018-10-29-beam-2.8.0.md => www/site/content/en/blog/beam-2.8.0.md} (95%) rename website/{src/_posts/2018-12-13-beam-2.9.0.md => www/site/content/en/blog/beam-2.9.0.md} (95%) rename website/{src/_posts/2018-01-09-beam-a-look-back.md => www/site/content/en/blog/beam-a-look-back.md} (95%) rename website/{src/_posts/2017-05-17-beam-first-stable-release.md => www/site/content/en/blog/beam-first-stable-release.md} (98%) rename website/{src/_posts/2017-01-10-beam-graduates.md => www/site/content/en/blog/beam-graduates.md} (96%) rename website/{src/_posts/2016-02-22-beam-has-a-logo.markdown => www/site/content/en/blog/beam-has-a-logo.md} (89%) rename website/{src/_posts/2019-05-30-beam-kata-release.md => www/site/content/en/blog/beam-kata-release.md} (77%) rename website/{src/_posts/2019-04-25-beam-kotlin.md => www/site/content/en/blog/beam-kotlin.md} (86%) rename website/{src/_posts/2018-10-30-beam-summit-aftermath.md => www/site/content/en/blog/beam-summit-aftermath.md} (99%) rename website/{src/_posts/2020-05-08-beam-summit-digital-2020.md => www/site/content/en/blog/beam-summit-digital-2020.md} (96%) rename website/{src/_posts/2019-05-11-beam-summit-europe-2019.md => www/site/content/en/blog/beam-summit-europe-2019.md} (92%) rename website/{src/_posts/2018-08-21-beam-summit-europe.md => www/site/content/en/blog/beam-summit-europe.md} (93%) rename website/{src/_posts/2019-03-18-beam-summit-site.md => www/site/content/en/blog/beam-summit-site.md} (95%) create mode 100644 website/www/site/content/en/blog/capability-matrix.md rename website/{src/_posts/2016-06-15-first-release.md => www/site/content/en/blog/first-release.md} (89%) rename website/{src/_posts/2016-06-13-flink-batch-runner-milestone.md => www/site/content/en/blog/flink-batch-runner-milestone.md} (98%) rename website/{src/_posts/2017-02-01-graduation-media-recap.md => www/site/content/en/blog/graduation-media-recap.md} (95%) rename website/{src/_posts/2019-09-04-gsoc-19.md => www/site/content/en/blog/gsoc-19.md} (97%) rename website/{src/_posts/2019-06-11-looping-timers.md => www/site/content/en/blog/looping-timers.md} (98%) rename website/{src/_posts/2016-04-03-presentation-materials.md => www/site/content/en/blog/presentation-materials.md} (79%) rename website/{src/_posts/2016-02-25-python-sdk-now-public.markdown => www/site/content/en/blog/python-sdk-now-public.md} (93%) rename website/{src/_posts/2017-03-16-python-sdk-release.md => www/site/content/en/blog/python-sdk-release.md} (76%) rename website/{src/_posts/2018-08-20-review-input-streaming-connectors.md => www/site/content/en/blog/review-input-streaming-connectors.md} (70%) rename website/{src/_posts/2019-04-19-season-of-docs.md => www/site/content/en/blog/season-of-docs.md} (95%) rename website/{src/_posts/2016-08-03-six-months.md => www/site/content/en/blog/six-months.md} (88%) rename website/{src/_posts/2016-05-18-splitAtFraction-method.md => www/site/content/en/blog/splitAtFraction-method.md} (96%) rename website/{src/_posts/2017-08-04-splittable-do-fn.md => www/site/content/en/blog/splittable-do-fn.md} (94%) rename 
website/{src/_posts/2017-02-13-stateful-processing.md => www/site/content/en/blog/stateful-processing.md} (95%) rename website/{src/_posts/2016-10-12-strata-hadoop-world-and-beam.md => www/site/content/en/blog/strata-hadoop-world-and-beam.md} (87%) rename website/{src/_posts/2016-10-20-test-stream.md => www/site/content/en/blog/test-stream.md} (89%) rename website/{src/_posts/2017-08-28-timely-processing.md => www/site/content/en/blog/timely-processing.md} (95%) rename website/{src/_posts/2016-05-20-where-is-my-pcollection-dot-map.md => www/site/content/en/blog/where-is-my-pcollection-dot-map.md} (97%) rename website/{src => www/site/content/en}/community/contact-us.md (95%) rename website/{src => www/site/content/en}/community/in-person.md (95%) rename website/{src => www/site/content/en}/community/integrations.md (90%) create mode 100644 website/www/site/content/en/community/logos.md rename website/{src => www/site/content/en}/community/policies.md (94%) rename website/{src => www/site/content/en}/community/presentation-materials.md (96%) rename website/{src => www/site/content/en}/community/twitter-handle.md (96%) rename website/{src => www/site/content/en}/community/youtube-channel.md (91%) rename website/{src/contribute/index.md => www/site/content/en/contribute/_index.md} (84%) rename website/{src => www/site/content/en}/contribute/become-a-committer.md (97%) rename website/{src => www/site/content/en}/contribute/committer-guide.md (96%) rename website/{src => www/site/content/en}/contribute/dependencies.md (88%) rename website/{src => www/site/content/en}/contribute/design-documents.md (76%) rename website/{src => www/site/content/en}/contribute/feature-branches.md (93%) rename website/{src => www/site/content/en}/contribute/get-help.md (58%) rename website/{src => www/site/content/en}/contribute/jira-priorities.md (95%) rename website/{src => www/site/content/en}/contribute/postcommits-guides.md (96%) rename website/{src => www/site/content/en}/contribute/postcommits-policies-details.md (97%) rename website/{src => www/site/content/en}/contribute/postcommits-policies.md (74%) rename website/{src => www/site/content/en}/contribute/precommit-policies.md (94%) rename website/{src => www/site/content/en}/contribute/precommit-triage-guide.md (97%) rename website/{src => www/site/content/en}/contribute/ptransform-style-guide.md (98%) rename website/{src => www/site/content/en}/contribute/release-blocking.md (92%) rename website/{src => www/site/content/en}/contribute/release-guide.md (97%) rename website/{src => www/site/content/en}/contribute/runner-guide.md (97%) rename website/{src => www/site/content/en}/contribute/team.md (74%) rename website/{src => www/site/content/en}/contribute/testing.md (86%) create mode 100644 website/www/site/content/en/documentation/_index.md rename website/{src => www/site/content/en}/documentation/dsls/sql/calcite/aggregate-functions.md (85%) rename website/{src => www/site/content/en}/documentation/dsls/sql/calcite/data-types.md (92%) rename website/{src => www/site/content/en}/documentation/dsls/sql/calcite/lexical-structure.md (99%) rename website/{src => www/site/content/en}/documentation/dsls/sql/calcite/overview.md (71%) rename website/{src => www/site/content/en}/documentation/dsls/sql/calcite/query-syntax.md (96%) rename website/{src => www/site/content/en}/documentation/dsls/sql/calcite/scalar-functions.md (96%) rename website/{src => www/site/content/en}/documentation/dsls/sql/extensions/create-external-table.md (95%) rename website/{src => 
www/site/content/en}/documentation/dsls/sql/extensions/joins.md (94%) rename website/{src => www/site/content/en}/documentation/dsls/sql/extensions/set.md (75%) rename website/{src => www/site/content/en}/documentation/dsls/sql/extensions/user-defined-functions.md (94%) rename website/{src => www/site/content/en}/documentation/dsls/sql/extensions/windowing-and-triggering.md (92%) rename website/{src => www/site/content/en}/documentation/dsls/sql/overview.md (75%) rename website/{src => www/site/content/en}/documentation/dsls/sql/shell.md (93%) rename website/{src => www/site/content/en}/documentation/dsls/sql/walkthrough.md (89%) rename website/{src => www/site/content/en}/documentation/dsls/sql/zetasql/aggregate-functions.md (96%) rename website/{src => www/site/content/en}/documentation/dsls/sql/zetasql/conditional-expressions.md (97%) rename website/{src => www/site/content/en}/documentation/dsls/sql/zetasql/conversion-rules.md (96%) rename website/{src => www/site/content/en}/documentation/dsls/sql/zetasql/data-types.md (91%) rename website/{src => www/site/content/en}/documentation/dsls/sql/zetasql/lexical.md (95%) rename website/{src => www/site/content/en}/documentation/dsls/sql/zetasql/math-functions.md (94%) rename website/{src => www/site/content/en}/documentation/dsls/sql/zetasql/operators.md (92%) rename website/{src => www/site/content/en}/documentation/dsls/sql/zetasql/overview.md (82%) rename website/{src => www/site/content/en}/documentation/dsls/sql/zetasql/query-syntax.md (93%) rename website/{src => www/site/content/en}/documentation/dsls/sql/zetasql/string-functions.md (98%) rename website/{src => www/site/content/en}/documentation/dsls/sql/zetasql/syntax.md (91%) rename website/{src => www/site/content/en}/documentation/io/built-in.md (88%) rename website/{src/documentation/io/built-in-google-bigquery.md => www/site/content/en/documentation/io/built-in/google-bigquery.md} (76%) rename website/{src/documentation/io/built-in-hadoop.md => www/site/content/en/documentation/io/built-in/hadoop.md} (90%) rename website/{src/documentation/io/built-in-hcatalog.md => www/site/content/en/documentation/io/built-in/hcatalog.md} (97%) rename website/{src/documentation/io/built-in-parquet.md => www/site/content/en/documentation/io/built-in/parquet.md} (75%) rename website/{src => www/site/content/en}/documentation/io/developing-io-java.md (96%) rename website/{src => www/site/content/en}/documentation/io/developing-io-overview.md (87%) rename website/{src => www/site/content/en}/documentation/io/developing-io-python.md (90%) rename website/{src => www/site/content/en}/documentation/io/testing.md (97%) rename website/{src => www/site/content/en}/documentation/patterns/custom-io.md (73%) rename website/{src => www/site/content/en}/documentation/patterns/custom-windows.md (76%) rename website/{src => www/site/content/en}/documentation/patterns/file-processing.md (69%) rename website/{src => www/site/content/en}/documentation/patterns/overview.md (51%) rename website/{src => www/site/content/en}/documentation/patterns/pipeline-options.md (51%) rename website/{src => www/site/content/en}/documentation/patterns/side-inputs.md (76%) rename website/{src => www/site/content/en}/documentation/pipelines/create-your-pipeline.md (83%) rename website/{src => www/site/content/en}/documentation/pipelines/design-your-pipeline.md (87%) rename website/{src => www/site/content/en}/documentation/pipelines/test-your-pipeline.md (85%) rename website/{src => 
www/site/content/en}/documentation/programming-guide.md (90%) rename website/{src => www/site/content/en}/documentation/resources/learning-resources.md (99%) rename website/{src => www/site/content/en}/documentation/resources/videos-and-podcasts.md (93%) rename website/{src => www/site/content/en}/documentation/runners/apex.md (94%) rename website/{src => www/site/content/en}/documentation/runners/capability-matrix.md (78%) rename website/{src => www/site/content/en}/documentation/runners/dataflow.md (90%) rename website/{src => www/site/content/en}/documentation/runners/direct.md (71%) rename website/{src => www/site/content/en}/documentation/runners/flink.md (79%) rename website/{src => www/site/content/en}/documentation/runners/gearpump.md (92%) rename website/{src => www/site/content/en}/documentation/runners/jet.md (89%) rename website/{src => www/site/content/en}/documentation/runners/jstorm.md (92%) rename website/{src => www/site/content/en}/documentation/runners/mapreduce.md (86%) rename website/{src => www/site/content/en}/documentation/runners/nemo.md (96%) rename website/{src => www/site/content/en}/documentation/runners/samza.md (95%) rename website/{src => www/site/content/en}/documentation/runners/spark.md (82%) rename website/{src => www/site/content/en}/documentation/runtime/environments.md (97%) rename website/{src => www/site/content/en}/documentation/runtime/model.md (89%) rename website/{src => www/site/content/en}/documentation/runtime/sdk-harness-config.md (91%) rename website/{src => www/site/content/en}/documentation/sdks/feature-comparison.md (85%) rename website/{src => www/site/content/en}/documentation/sdks/go.md (73%) rename website/{src => www/site/content/en}/documentation/sdks/java-dependencies.md (87%) rename website/{src => www/site/content/en}/documentation/sdks/java-extensions.md (97%) rename website/{src => www/site/content/en}/documentation/sdks/java-thirdparty.md (97%) rename website/{src => www/site/content/en}/documentation/sdks/java.md (50%) rename website/{src/documentation/sdks => www/site/content/en/documentation/sdks/java}/euphoria.md (93%) rename website/{src/documentation/sdks => www/site/content/en/documentation/sdks/java/testing}/nexmark.md (97%) rename website/{src => www/site/content/en}/documentation/sdks/python-dependencies.md (72%) rename website/{src => www/site/content/en}/documentation/sdks/python-pipeline-dependencies.md (98%) rename website/{src => www/site/content/en}/documentation/sdks/python-streaming.md (89%) rename website/{src => www/site/content/en}/documentation/sdks/python-type-safety.md (87%) rename website/{src => www/site/content/en}/documentation/sdks/python.md (64%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/approximatequantiles.md (82%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/approximateunique.md (78%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/cogroupbykey.md (88%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/combine.md (87%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/combinewithcontext.md (75%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/count.md (84%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/distinct.md (81%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/groupbykey.md (79%) 
rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/groupintobatches.md (84%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/hllcount.md (90%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/latest.md (84%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/max.md (81%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/mean.md (84%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/min.md (78%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/sample.md (81%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/sum.md (87%) rename website/{src => www/site/content/en}/documentation/transforms/java/aggregation/top.md (85%) rename website/{src/documentation/transforms/java/element-wise => www/site/content/en/documentation/transforms/java/elementwise}/filter.md (83%) rename website/{src/documentation/transforms/java/element-wise => www/site/content/en/documentation/transforms/java/elementwise}/flatmapelements.md (78%) rename website/{src/documentation/transforms/java/element-wise => www/site/content/en/documentation/transforms/java/elementwise}/keys.md (70%) rename website/{src/documentation/transforms/java/element-wise => www/site/content/en/documentation/transforms/java/elementwise}/kvswap.md (70%) rename website/{src/documentation/transforms/java/element-wise => www/site/content/en/documentation/transforms/java/elementwise}/mapelements.md (79%) rename website/{src/documentation/transforms/java/element-wise => www/site/content/en/documentation/transforms/java/elementwise}/pardo.md (92%) rename website/{src/documentation/transforms/java/element-wise => www/site/content/en/documentation/transforms/java/elementwise}/partition.md (80%) rename website/{src/documentation/transforms/java/element-wise => www/site/content/en/documentation/transforms/java/elementwise}/regex.md (83%) rename website/{src/documentation/transforms/java/element-wise => www/site/content/en/documentation/transforms/java/elementwise}/reify.md (79%) rename website/{src/documentation/transforms/java/element-wise => www/site/content/en/documentation/transforms/java/elementwise}/tostring.md (83%) rename website/{src/documentation/transforms/java/element-wise => www/site/content/en/documentation/transforms/java/elementwise}/values.md (70%) rename website/{src/documentation/transforms/java/element-wise => www/site/content/en/documentation/transforms/java/elementwise}/withkeys.md (76%) rename website/{src/documentation/transforms/java/element-wise => www/site/content/en/documentation/transforms/java/elementwise}/withtimestamps.md (83%) rename website/{src => www/site/content/en}/documentation/transforms/java/other/create.md (91%) rename website/{src => www/site/content/en}/documentation/transforms/java/other/flatten.md (85%) rename website/{src => www/site/content/en}/documentation/transforms/java/other/passert.md (92%) rename website/{src => www/site/content/en}/documentation/transforms/java/other/view.md (78%) rename website/{src => www/site/content/en}/documentation/transforms/java/other/window.md (81%) create mode 100644 website/www/site/content/en/documentation/transforms/java/overview.md rename website/{src => www/site/content/en}/documentation/transforms/python/aggregation/approximatequantiles.md (83%) rename website/{src => 
www/site/content/en}/documentation/transforms/python/aggregation/approximateunique.md (83%) rename website/{src => www/site/content/en}/documentation/transforms/python/aggregation/cogroupbykey.md (74%) rename website/{src => www/site/content/en}/documentation/transforms/python/aggregation/combineglobally.md (66%) rename website/{src => www/site/content/en}/documentation/transforms/python/aggregation/combinewithcontext.md (83%) rename website/{src => www/site/content/en}/documentation/transforms/python/aggregation/count.md (89%) rename website/{src => www/site/content/en}/documentation/transforms/python/aggregation/distinct.md (80%) rename website/{src => www/site/content/en}/documentation/transforms/python/aggregation/groupbykey.md (68%) rename website/{src => www/site/content/en}/documentation/transforms/python/aggregation/groupintobatches.md (83%) create mode 100644 website/www/site/content/en/documentation/transforms/python/aggregation/latest.md rename website/{src => www/site/content/en}/documentation/transforms/python/aggregation/max.md (84%) rename website/{src => www/site/content/en}/documentation/transforms/python/aggregation/mean.md (82%) rename website/{src => www/site/content/en}/documentation/transforms/python/aggregation/min.md (84%) rename website/{src => www/site/content/en}/documentation/transforms/python/aggregation/sample.md (82%) rename website/{src => www/site/content/en}/documentation/transforms/python/aggregation/sum.md (84%) rename website/{src => www/site/content/en}/documentation/transforms/python/aggregation/top.md (81%) rename website/{src => www/site/content/en}/documentation/transforms/python/elementwise/filter.md (55%) rename website/{src => www/site/content/en}/documentation/transforms/python/elementwise/flatmap.md (53%) rename website/{src => www/site/content/en}/documentation/transforms/python/elementwise/keys.md (50%) rename website/{src => www/site/content/en}/documentation/transforms/python/elementwise/kvswap.md (51%) rename website/{src => www/site/content/en}/documentation/transforms/python/elementwise/map.md (52%) rename website/{src => www/site/content/en}/documentation/transforms/python/elementwise/pardo.md (75%) rename website/{src => www/site/content/en}/documentation/transforms/python/elementwise/partition.md (68%) rename website/{src => www/site/content/en}/documentation/transforms/python/elementwise/regex.md (64%) rename website/{src => www/site/content/en}/documentation/transforms/python/elementwise/reify.md (73%) rename website/{src => www/site/content/en}/documentation/transforms/python/elementwise/tostring.md (54%) rename website/{src => www/site/content/en}/documentation/transforms/python/elementwise/values.md (50%) rename website/{src => www/site/content/en}/documentation/transforms/python/elementwise/withkeys.md (84%) rename website/{src => www/site/content/en}/documentation/transforms/python/elementwise/withtimestamps.md (60%) rename website/{src => www/site/content/en}/documentation/transforms/python/other/create.md (90%) rename website/{src => www/site/content/en}/documentation/transforms/python/other/flatten.md (78%) rename website/{src => www/site/content/en}/documentation/transforms/python/other/passert.md (84%) rename website/{src => www/site/content/en}/documentation/transforms/python/other/reshuffle.md (91%) rename website/{src => www/site/content/en}/documentation/transforms/python/other/view.md (85%) rename website/{src => www/site/content/en}/documentation/transforms/python/other/windowinto.md (81%) create mode 100644 
website/www/site/content/en/documentation/transforms/python/overview.md rename website/{src/get-started/index.md => www/site/content/en/get-started/_index.md} (62%) rename website/{src => www/site/content/en}/get-started/beam-overview.md (50%) rename website/{src => www/site/content/en}/get-started/downloads.md (96%) rename website/{src => www/site/content/en}/get-started/mobile-gaming-example.md (77%) rename website/{src => www/site/content/en}/get-started/quickstart-go.md (69%) rename website/{src => www/site/content/en}/get-started/quickstart-java.md (71%) rename website/{src => www/site/content/en}/get-started/quickstart-py.md (77%) rename website/{src => www/site/content/en}/get-started/try-apache-beam.md (69%) rename website/{src => www/site/content/en}/get-started/wordcount-example.md (76%) rename website/{src/privacy_policy/index.md => www/site/content/en/privacy_policy/_index.md} (97%) rename website/{src/roadmap/index.md => www/site/content/en/roadmap/_index.md} (79%) rename website/{src => www/site/content/en}/roadmap/apex-runner.md (83%) rename website/{src => www/site/content/en}/roadmap/connectors-go-sdk.md (91%) rename website/{src => www/site/content/en}/roadmap/connectors-java-sdk.md (92%) rename website/{src => www/site/content/en}/roadmap/connectors-multi-sdk.md (98%) rename website/{src => www/site/content/en}/roadmap/connectors-python-sdk.md (91%) rename website/{src => www/site/content/en}/roadmap/dataflow-runner.md (82%) rename website/{src => www/site/content/en}/roadmap/euphoria.md (90%) rename website/{src => www/site/content/en}/roadmap/flink-runner.md (83%) rename website/{src => www/site/content/en}/roadmap/gearpump-runner.md (82%) rename website/{src => www/site/content/en}/roadmap/go-sdk.md (97%) rename website/{src => www/site/content/en}/roadmap/java-sdk.md (91%) rename website/{src => www/site/content/en}/roadmap/nemo-runner.md (84%) rename website/{src => www/site/content/en}/roadmap/portability.md (91%) rename website/{src => www/site/content/en}/roadmap/python-sdk.md (96%) rename website/{src => www/site/content/en}/roadmap/samza-runner.md (83%) rename website/{src => www/site/content/en}/roadmap/spark-runner.md (90%) rename website/{src => www/site/content/en}/roadmap/sql.md (82%) rename website/{src => www/site/content/en}/security/CVE-2020-1929.md (86%) rename website/{src/security/index.md => www/site/content/en/security/_index.md} (95%) create mode 100644 website/www/site/data/authors.yml create mode 100644 website/www/site/data/capability_matrix.yaml rename website/{src/_posts/2016-03-17-capability-matrix.md => www/site/data/capability_matrix_snapshot.yaml} (68%) rename website/{_config_test.yml => www/site/data/en/cards.yaml} (58%) create mode 100644 website/www/site/data/en/pillars.yaml rename website/{src/_data => www/site/data}/logos.yml (76%) create mode 100644 website/www/site/data/meetings.yml create mode 100644 website/www/site/data/works_with.yaml create mode 100644 website/www/site/i18n/blog/en.yaml create mode 100644 website/www/site/i18n/footer/en.yaml create mode 100644 website/www/site/i18n/home/en.yaml create mode 100644 website/www/site/i18n/navbar/en.yaml create mode 100644 website/www/site/layouts/_default/baseof.html create mode 100644 website/www/site/layouts/blog/baseof.html create mode 100644 website/www/site/layouts/blog/list.html create mode 100644 website/www/site/layouts/blog/single.html create mode 100644 website/www/site/layouts/community/baseof.html create mode 100644 website/www/site/layouts/contribute/baseof.html 
create mode 100644 website/www/site/layouts/documentation/baseof.html create mode 100644 website/www/site/layouts/get-started/baseof.html create mode 100644 website/www/site/layouts/index.feed.xml create mode 100644 website/www/site/layouts/index.html create mode 100644 website/www/site/layouts/languages/baseof.html create mode 100644 website/www/site/layouts/partials/footer.html create mode 100644 website/www/site/layouts/partials/head.html rename website/{src/_includes => www/site/layouts/partials}/header.html (67%) rename website/{src/_includes/section-menu => www/site/layouts/partials/section-menu/en}/community.html (59%) create mode 100644 website/www/site/layouts/partials/section-menu/en/contribute.html create mode 100644 website/www/site/layouts/partials/section-menu/en/documentation.html rename website/{src/_includes/section-menu => www/site/layouts/partials/section-menu/en}/get-started.html (52%) create mode 100644 website/www/site/layouts/partials/section-menu/en/roadmap.html create mode 100644 website/www/site/layouts/partials/section-menu/en/runners.html create mode 100644 website/www/site/layouts/partials/section-menu/en/sdks.html rename website/{src/_layouts/default.html => www/site/layouts/privacy_policy/baseof.html} (68%) create mode 100644 website/www/site/layouts/roadmap/baseof.html create mode 100644 website/www/site/layouts/runners/baseof.html create mode 100644 website/www/site/layouts/security/baseof.html create mode 100644 website/www/site/layouts/shortcodes/button-pydoc.html create mode 100644 website/www/site/layouts/shortcodes/buttons-code-snippet.html create mode 100644 website/www/site/layouts/shortcodes/capability-matrix-common.html create mode 100644 website/www/site/layouts/shortcodes/capability-matrix.html create mode 100644 website/www/site/layouts/shortcodes/colors/png.html create mode 100644 website/www/site/layouts/shortcodes/colors/svg.html rename website/{src/_includes => www/site/layouts/shortcodes}/flink_java_pipeline_options.html (99%) rename website/{src/_includes => www/site/layouts/shortcodes}/flink_python_pipeline_options.html (92%) create mode 100644 website/www/site/layouts/shortcodes/github_sample.html create mode 100644 website/www/site/layouts/shortcodes/highlight.html create mode 100644 website/www/site/layouts/shortcodes/language-switcher.html rename website/{src/_includes/icon-github.html => www/site/layouts/shortcodes/localstorage.html} (75%) rename website/{src/_layouts/page.html => www/site/layouts/shortcodes/paragraph.html} (72%) create mode 100644 website/www/site/layouts/shortcodes/table.html rename website/{src/_includes/icon-twitter.html => www/site/layouts/shortcodes/toc.html} (74%) rename website/{src => www/site/static}/.htaccess (100%) rename website/{src => www/site/static}/downloads/beam-doap.rdf (100%) rename website/{src => www/site/static}/downloads/logos.zip (100%) rename website/{src => www/site/static}/downloads/palette.pdf (100%) rename website/{src => www/site/static}/fonts/bootstrap/glyphicons-halflings-regular.eot (100%) rename website/{src => www/site/static}/fonts/bootstrap/glyphicons-halflings-regular.svg (100%) rename website/{src => www/site/static}/fonts/bootstrap/glyphicons-halflings-regular.ttf (100%) rename website/{src => www/site/static}/fonts/bootstrap/glyphicons-halflings-regular.woff (100%) rename website/{src => www/site/static}/fonts/bootstrap/glyphicons-halflings-regular.woff2 (100%) rename website/{src => www/site/static}/images/apache_logo_circle.svg (100%) rename website/{src => 
www/site/static}/images/beam_architecture.png (100%) rename website/{src => www/site/static}/images/beam_logo_circle.svg (100%) rename website/{src => www/site/static}/images/beam_logo_navbar.png (100%) rename website/{src => www/site/static}/images/beam_logo_s.png (100%) rename website/{src => www/site/static}/images/beam_sql_dsl_workflow.png (100%) rename website/{src => www/site/static}/images/blog/2017-look-back/timeline.png (100%) rename website/{src => www/site/static}/images/blog/Facebook-AD.png (100%) rename website/{src => www/site/static}/images/blog/IMG_20160927_170455.jpg (100%) rename website/{src => www/site/static}/images/blog/IMG_20160927_170956.jpg (100%) rename website/{src => www/site/static}/images/blog/SoD.png (100%) rename website/{src => www/site/static}/images/blog/beam-kata/beam-kata-intellij-edu-1.png (100%) rename website/{src => www/site/static}/images/blog/beam-kata/beam-kata-intellij-edu-2.png (100%) rename website/{src => www/site/static}/images/blog/beam-kata/beam-kata-pycharm-edu-1.png (100%) rename website/{src => www/site/static}/images/blog/beam-kata/beam-kata-pycharm-edu-2.png (100%) rename website/{src => www/site/static}/images/blog/beamsummit/beamsummit-digital.png (100%) rename website/{src => www/site/static}/images/blog/kotlin.png (100%) rename website/{src => www/site/static}/images/blog/simple-wordcount-pipeline.png (100%) rename website/{src => www/site/static}/images/blog/splittable-do-fn/blocks.png (100%) rename website/{src => www/site/static}/images/blog/splittable-do-fn/jdbcio-expansion.png (100%) rename website/{src => www/site/static}/images/blog/splittable-do-fn/kafka-splitting.png (100%) rename website/{src => www/site/static}/images/blog/splittable-do-fn/restrictions.png (100%) rename website/{src => www/site/static}/images/blog/splittable-do-fn/transform-expansion.png (100%) rename website/{src => www/site/static}/images/blog/stateful-processing/assign-indices.png (100%) rename website/{src => www/site/static}/images/blog/stateful-processing/combinefn.png (100%) rename website/{src => www/site/static}/images/blog/stateful-processing/combiner-lifting.png (100%) rename website/{src => www/site/static}/images/blog/stateful-processing/pardo-and-gbk.png (100%) rename website/{src => www/site/static}/images/blog/stateful-processing/pipeline.png (100%) rename website/{src => www/site/static}/images/blog/stateful-processing/plaid.png (100%) rename website/{src => www/site/static}/images/blog/stateful-processing/stateful-dofn.png (100%) rename website/{src => www/site/static}/images/blog/stateful-processing/stateful-pardo.png (100%) rename website/{src => www/site/static}/images/blog/test-stream/elements-all-on-time.png (100%) rename website/{src => www/site/static}/images/blog/test-stream/elements-droppably-late.png (100%) rename website/{src => www/site/static}/images/blog/test-stream/elements-observably-late.png (100%) rename website/{src => www/site/static}/images/blog/test-stream/elements-processing-speculative.png (100%) rename website/{src => www/site/static}/images/blog/test-stream/elements-unobservably-late.png (100%) rename website/{src => www/site/static}/images/blog/timely-processing/BatchedRpcExpiry.png (100%) rename website/{src => www/site/static}/images/blog/timely-processing/BatchedRpcStale.png (100%) rename website/{src => www/site/static}/images/blog/timely-processing/BatchedRpcState.png (100%) rename website/{src => www/site/static}/images/blog/timely-processing/CombinePerKey.png (100%) rename website/{src => 
www/site/static}/images/blog/timely-processing/ParDo.png (100%) rename website/{src => www/site/static}/images/blog/timely-processing/StateAndTimers.png (100%) rename website/{src => www/site/static}/images/blog/timely-processing/UnifiedModel.png (100%) rename website/{src => www/site/static}/images/blog/timely-processing/WindowingChoices.png (100%) rename website/{src => www/site/static}/images/card_border.svg (100%) rename website/{src => www/site/static}/images/cards_bg.svg (100%) rename website/{src => www/site/static}/images/contribution-diversity.png (100%) rename website/{src => www/site/static}/images/contribution-guide-1.png (100%) rename website/{src => www/site/static}/images/design-your-pipeline-additional-outputs.svg (100%) rename website/{src => www/site/static}/images/design-your-pipeline-flatten.svg (100%) rename website/{src => www/site/static}/images/design-your-pipeline-join.svg (100%) rename website/{src => www/site/static}/images/design-your-pipeline-linear.svg (100%) rename website/{src => www/site/static}/images/design-your-pipeline-multiple-pcollections.svg (100%) rename website/{src => www/site/static}/images/dofn-sequence-diagram.svg (100%) rename website/{src => www/site/static}/images/execution_model_bundling.svg (100%) rename website/{src => www/site/static}/images/execution_model_bundling_coupled_failure.svg (100%) rename website/{src => www/site/static}/images/execution_model_bundling_gantt.svg (100%) rename website/{src => www/site/static}/images/execution_model_bundling_gantt_max.svg (100%) rename website/{src => www/site/static}/images/execution_model_bundling_multi.svg (100%) rename website/{src => www/site/static}/images/execution_model_bundling_multi_gantt.svg (100%) rename website/{src => www/site/static}/images/execution_model_failure_retry.svg (100%) rename website/{src => www/site/static}/images/external-link-icon.png (100%) rename website/{src => www/site/static}/images/favicon.ico (100%) rename website/{src => www/site/static}/images/fixed-time-windows.png (100%) rename website/{src => www/site/static}/images/gaming-example-basic.png (100%) rename website/{src => www/site/static}/images/gaming-example-event-time-narrow.gif (100%) rename website/{src => www/site/static}/images/gaming-example-proc-time-narrow.gif (100%) rename website/{src => www/site/static}/images/gaming-example-session-windows.png (100%) rename website/{src => www/site/static}/images/gaming-example-team-scores-narrow.gif (100%) rename website/{src => www/site/static}/images/gaming-example.gif (100%) rename website/{src => www/site/static}/images/hero_bg.svg (100%) rename website/{src => www/site/static}/images/hero_bg_flat.svg (100%) rename website/{src => www/site/static}/images/logo_apex.png (100%) rename website/{src => www/site/static}/images/logo_flink.png (100%) rename website/{src => www/site/static}/images/logo_gearpump.png (100%) rename website/{src => www/site/static}/images/logo_google_cloud.png (100%) rename website/{src => www/site/static}/images/logo_samza.png (100%) rename website/{src => www/site/static}/images/logo_spark.png (100%) rename website/{src => www/site/static}/images/logos/3-color/name-bottom/beam-logo-3-color-name-bottom-100.png (100%) rename website/{src => www/site/static}/images/logos/3-color/name-bottom/beam-logo-3-color-name-bottom-1000.png (100%) rename website/{src => www/site/static}/images/logos/3-color/name-bottom/beam-logo-3-color-name-bottom-200.png (100%) rename website/{src => 
www/site/static}/images/logos/3-color/name-bottom/beam-logo-3-color-name-bottom-500.png (100%) rename website/{src => www/site/static}/images/logos/3-color/name-bottom/beam-logo-3-color-name-bottom.svg (100%) rename website/{src => www/site/static}/images/logos/3-color/name-right/beam-logo-3-color-name-right-100.png (100%) rename website/{src => www/site/static}/images/logos/3-color/name-right/beam-logo-3-color-name-right-1000.png (100%) rename website/{src => www/site/static}/images/logos/3-color/name-right/beam-logo-3-color-name-right-200.png (100%) rename website/{src => www/site/static}/images/logos/3-color/name-right/beam-logo-3-color-name-right-500.png (100%) rename website/{src => www/site/static}/images/logos/3-color/name-right/beam-logo-3-color-name-right.svg (100%) rename website/{src => www/site/static}/images/logos/3-color/nameless/beam-logo-3-color-nameless-100.png (100%) rename website/{src => www/site/static}/images/logos/3-color/nameless/beam-logo-3-color-nameless-1000.png (100%) rename website/{src => www/site/static}/images/logos/3-color/nameless/beam-logo-3-color-nameless-200.png (100%) rename website/{src => www/site/static}/images/logos/3-color/nameless/beam-logo-3-color-nameless-500.png (100%) rename website/{src => www/site/static}/images/logos/3-color/nameless/beam-logo-3-color-nameless.svg (100%) rename website/{src => www/site/static}/images/logos/bw/name-bottom/beam-logo-bw-name-bottom-100.png (100%) rename website/{src => www/site/static}/images/logos/bw/name-bottom/beam-logo-bw-name-bottom-1000.png (100%) rename website/{src => www/site/static}/images/logos/bw/name-bottom/beam-logo-bw-name-bottom-200.png (100%) rename website/{src => www/site/static}/images/logos/bw/name-bottom/beam-logo-bw-name-bottom-500.png (100%) rename website/{src => www/site/static}/images/logos/bw/name-bottom/beam-logo-bw-name-bottom.svg (100%) rename website/{src => www/site/static}/images/logos/bw/name-right/beam-logo-bw-name-right-100.png (100%) rename website/{src => www/site/static}/images/logos/bw/name-right/beam-logo-bw-name-right-1000.png (100%) rename website/{src => www/site/static}/images/logos/bw/name-right/beam-logo-bw-name-right-200.png (100%) rename website/{src => www/site/static}/images/logos/bw/name-right/beam-logo-bw-name-right-500.png (100%) rename website/{src => www/site/static}/images/logos/bw/name-right/beam-logo-bw-name-right.svg (100%) rename website/{src => www/site/static}/images/logos/bw/nameless/beam-logo-bw-nameless-100.png (100%) rename website/{src => www/site/static}/images/logos/bw/nameless/beam-logo-bw-nameless-1000.png (100%) rename website/{src => www/site/static}/images/logos/bw/nameless/beam-logo-bw-nameless-200.png (100%) rename website/{src => www/site/static}/images/logos/bw/nameless/beam-logo-bw-nameless-500.png (100%) rename website/{src => www/site/static}/images/logos/bw/nameless/beam-logo-bw-nameless.svg (100%) rename website/{src => www/site/static}/images/logos/full-color/name-bottom/beam-logo-full-color-name-bottom-100.png (100%) rename website/{src => www/site/static}/images/logos/full-color/name-bottom/beam-logo-full-color-name-bottom-1000.png (100%) rename website/{src => www/site/static}/images/logos/full-color/name-bottom/beam-logo-full-color-name-bottom-200.png (100%) rename website/{src => www/site/static}/images/logos/full-color/name-bottom/beam-logo-full-color-name-bottom-500.png (100%) rename website/{src => www/site/static}/images/logos/full-color/name-bottom/beam-logo-full-color-name-bottom.svg (100%) rename website/{src => 
www/site/static}/images/logos/full-color/name-right/beam-logo-full-color-name-right-100.png (100%) rename website/{src => www/site/static}/images/logos/full-color/name-right/beam-logo-full-color-name-right-1000.png (100%) rename website/{src => www/site/static}/images/logos/full-color/name-right/beam-logo-full-color-name-right-200.png (100%) rename website/{src => www/site/static}/images/logos/full-color/name-right/beam-logo-full-color-name-right-500.png (100%) rename website/{src => www/site/static}/images/logos/full-color/name-right/beam-logo-full-color-name-right.svg (100%) rename website/{src => www/site/static}/images/logos/full-color/nameless/beam-logo-full-color-nameless-100.png (100%) rename website/{src => www/site/static}/images/logos/full-color/nameless/beam-logo-full-color-nameless-1000.png (100%) rename website/{src => www/site/static}/images/logos/full-color/nameless/beam-logo-full-color-nameless-200.png (100%) rename website/{src => www/site/static}/images/logos/full-color/nameless/beam-logo-full-color-nameless-500.png (100%) rename website/{src => www/site/static}/images/logos/full-color/nameless/beam-logo-full-color-nameless.svg (100%) rename website/{src => www/site/static}/images/logos/runners/apex.png (100%) rename website/{src => www/site/static}/images/logos/runners/dataflow.png (100%) rename website/{src => www/site/static}/images/logos/runners/flink.png (100%) rename website/{src => www/site/static}/images/logos/runners/gearpump.png (100%) rename website/{src => www/site/static}/images/logos/runners/jet.png (100%) rename website/{src => www/site/static}/images/logos/runners/jstorm.png (100%) rename website/{src => www/site/static}/images/logos/runners/samza.png (100%) rename website/{src => www/site/static}/images/logos/runners/spark.png (100%) rename website/{src => www/site/static}/images/logos/sdks/go.png (100%) rename website/{src => www/site/static}/images/logos/sdks/java.png (100%) rename website/{src => www/site/static}/images/logos/sdks/python.png (100%) rename website/{src => www/site/static}/images/logos/sdks/scala.png (100%) rename website/{src => www/site/static}/images/precommit_dashboard.png (100%) rename website/{src => www/site/static}/images/precommit_graph_queuing_time.png (100%) rename website/{src => www/site/static}/images/release-guide-1.png (100%) rename website/{src => www/site/static}/images/resources/se-radio-podcast.png (100%) rename website/{src => www/site/static}/images/resources/streaming-101.png (100%) rename website/{src => www/site/static}/images/resources/streaming-102.png (100%) rename website/{src => www/site/static}/images/session-windows.png (100%) rename website/{src => www/site/static}/images/sliding-time-windows.png (100%) rename website/{src => www/site/static}/images/source-sequence-diagram.svg (100%) rename website/{src => www/site/static}/images/standard-vs-dynamic-sessions.png (100%) rename website/{src => www/site/static}/images/trigger-accumulation.png (100%) rename website/{src => www/site/static}/images/unwindowed-pipeline-bounded.svg (100%) rename website/{src => www/site/static}/images/windowing-pipeline-bounded.svg (100%) rename website/{src => www/site/static}/images/windowing-pipeline-unbounded.svg (100%) rename website/{src => www/site/static}/images/wordcount-pipeline.svg (100%) rename website/{src => www/site/static}/js/bootstrap-sprockets.js (100%) rename website/{src => www/site/static}/js/bootstrap.js (100%) rename website/{src => www/site/static}/js/bootstrap.min.js (100%) rename website/{src => 
www/site/static}/js/bootstrap/affix.js (100%) rename website/{src => www/site/static}/js/bootstrap/alert.js (100%) rename website/{src => www/site/static}/js/bootstrap/button.js (100%) rename website/{src => www/site/static}/js/bootstrap/carousel.js (100%) rename website/{src => www/site/static}/js/bootstrap/collapse.js (100%) rename website/{src => www/site/static}/js/bootstrap/dropdown.js (100%) rename website/{src => www/site/static}/js/bootstrap/modal.js (100%) rename website/{src => www/site/static}/js/bootstrap/popover.js (100%) rename website/{src => www/site/static}/js/bootstrap/scrollspy.js (100%) rename website/{src => www/site/static}/js/bootstrap/tab.js (100%) rename website/{src => www/site/static}/js/bootstrap/tooltip.js (100%) rename website/{src => www/site/static}/js/bootstrap/transition.js (100%) rename website/{src => www/site/static}/js/fix-menu.js (100%) rename website/{src => www/site/static}/js/language-switch.js (100%) rename website/{src => www/site/static}/js/page-nav.js (100%) rename website/{src => www/site/static}/js/section-nav.js (97%) create mode 160000 website/www/site/themes/docsy create mode 100644 website/www/yarn.lock diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000000..5abfbe7da83e --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "website/www/site/themes/docsy"] + path = website/www/site/themes/docsy + url = https://github.com/google/docsy.git diff --git a/build.gradle b/build.gradle index f1908dc90c71..2fddf6e71938 100644 --- a/build.gradle +++ b/build.gradle @@ -86,14 +86,17 @@ rat { // JDBC package config files "**/META-INF/services/java.sql.Driver", - // Ruby build files + // Website build files "**/Gemfile.lock", "**/Rakefile", "**/.htaccess", - "website/src/_sass/_bootstrap.scss", - "website/src/_sass/bootstrap/**/*", - "website/src/js/bootstrap*.js", - "website/src/js/bootstrap/**/*", + "website/www/site/assets/scss/_bootstrap.scss", + "website/www/site/assets/scss/bootstrap/**/*", + "website/www/site/static/js/bootstrap*.js", + "website/www/site/static/js/bootstrap/**/*", + "website/www/site/themes", + "website/www/yarn.lock", + "website/www/package.json", // Ignore ownership files "ownership/**/*", diff --git a/runners/flink/flink_runner.gradle b/runners/flink/flink_runner.gradle index 31807de75a38..c3531268cca1 100644 --- a/runners/flink/flink_runner.gradle +++ b/runners/flink/flink_runner.gradle @@ -246,7 +246,7 @@ def createPipelineOptionsTableTask(String target) { args = [target] standardOutput = new ByteArrayOutputStream() doLast { - def dest = file("${project(':website').getProjectDir()}/src/_includes/flink_${target.toLowerCase()}_pipeline_options.html") + def dest = file("${project(':website').getProjectDir()}/www/site/layouts/shortcodes/flink_${target.toLowerCase()}_pipeline_options.html") if (!dest.exists()) { throw new GradleException("Pipeline options file is not in expected location: ${dest}") } diff --git a/website/.gitignore b/website/.gitignore index 76451ba8bc88..8191c0750dc0 100644 --- a/website/.gitignore +++ b/website/.gitignore @@ -11,3 +11,13 @@ content/ *.iml *.ipr *.iws + +# Hugo +www/node_modules +www/dist +www/site/resources +www/site/github_samples +www/site/_config_branch_repo.toml +www/yarn-error.log +!www/site/content + diff --git a/website/CONTRIBUTE.md b/website/CONTRIBUTE.md new file mode 100644 index 000000000000..555fd2a3026e --- /dev/null +++ b/website/CONTRIBUTE.md @@ -0,0 +1,394 @@ + + +# Contribution Guide + +This guide consists of: + +- [Project 
structure](#project-structure)
+- [Configuration walkthrough](#configuration-walkthrough)
+- [How to add a new doc](#how-to-add-a-new-doc)
+- [How to add a new blogpost](#how-to-add-a-new-blogpost)
+- [How to add a new landing page](#how-to-add-a-new-landing-page)
+- [How to write in Hugo](#how-to-write-in-hugo)
+  - [Define TableOfContents](#define-tableofcontents)
+  - [Language switching](#language-switching)
+  - [Code highlighting](#code-highlighting)
+  - [Adding class to markdown text](#paragraph)
+  - [Table](#table)
+  - [Github sample](#github-sample)
+  - [Others](#others)
+- [What to be replaced in Jekyll](#what-to-be-replaced-in-jekyll)
+- [Translation guide](#translation-guide)
+
+## Project structure
+
+```
+www/
+├── dist                    # bundle files
+├── site
+│   ├── archetypes          # frontmatter templates
+│   ├── assets
+│   │   └── scss            # styles
+│   ├── content             # pages
+│   │   └── en
+│   │       ├── blog
+│   │       ├── community
+│   │       ├── contribute
+│   │       ├── documentation
+│   │       ├── get-started
+│   │       ├── privacy_policy
+│   │       ├── roadmap
+│   │       ├── security
+│   │       └── _index.md
+│   ├── data
+│   ├── layouts             # content templates
+│   ├── static
+│   │   ├── downloads       # downloaded files
+│   │   ├── fonts
+│   │   ├── images
+│   │   └── js
+│   └── themes
+│       └── docsy
+├── build_github_samples.sh
+├── check-links.sh          # link checker
+└── package.json
+```
+
+## Configuration walkthrough
+
+If you prefer to work locally instead of using our Gradle commands, the [Hugo installation guide](https://gohugo.io/getting-started/installing/) is a good place to start.
+
+When we mention the `config file` in this documentation, we mean the Hugo configuration file located at `/www/site/config.toml`.
+
+Most of the settings are explained by the comments in that file. Please refer to the [Hugo documentation](https://gohugo.io/getting-started/configuration/) for more details.
+
+Pay particular attention to `[params]`; its entries act as global variables. For instance, when you'd like to bump the latest release version, change `release_latest = ""` once and it is replaced everywhere in the project.
+
+## How to add a new doc
+
+Suppose you'd like to add a new doc named `new-doc` in `/www/site/content/en/documentation/runtime/`. Go to `/www/site/` and run:
+
+```
+$ hugo new documentation/runtime/new-doc.md
+```
+
+A markdown file will be created with pre-filled frontmatter:
+
+```
+---
+title: "New Doc"
+---
+```
+
+Then, put your content below the frontmatter. Do not worry about the layout: because the file lives inside `documentation`, it shares the layout defined at `/www/site/layouts/documentation/`.
+
+**Note**: to add a new doc in a language other than English, for example inside the `pl` directory, use:
+
+```
+$ hugo new -c content/pl documentation/runtime/new-doc.md
+```
+
+## How to add a new blogpost
+
+To add a new blogpost with pre-filled frontmatter, run the following in `/www/site/`:
+
+```
+$ hugo new blog/my-new-blogpost.md
+```
+
+That will create a markdown file `/www/site/content/en/blog/my-new-blogpost.md` with the following content:
+
+```
+---
+title: "My New Blogpost"
+date: "2020-04-20T14:02:57+02:00"
+categories:
+  - blog
+authors:
+  - "Your Name"
+---
+```
+
+Put your blogpost content below the frontmatter. The filename also serves as the URL of your blogpost, as `/blog/{filename}`. Don't forget to add `<!--more-->`, which is the delimiter between the summary and the main content.
+
+**Note**: to add a new blogpost in a language other than English, for example inside the `pl` directory, use:
+
+```
+$ hugo new -c content/pl blog/my-new-blogpost.md
+```
+
+## How to add a new landing page
+
+Suppose you would like to add a new `About` page.
+
+First, create a markdown file at `/www/site/content/en/about/_index.md` with the following content:
+
+```
+---
+title: "Your page title"
+---
+```
+
+Put your page content below the frontmatter. The filename also serves as the URL of your page, as `/about`.
+
+Second, define your page layout under the matching structure `/www/site/layouts/about/{your_template}`; Hugo picks up the template behind the scenes. Please refer to the [Hugo documentation](https://gohugo.io/templates/) for the usage of templates.
+
+You can also create a new page with pre-filled frontmatter; for instance, in `/www/site/` run:
+
+```
+$ hugo new about/_index.md
+```
+
+**Note**: to add a new page in a language other than English, for example inside the `pl` directory, use:
+
+```
+$ hugo new -c content/pl about/_index.md
+```
+
+## How to write in Hugo
+
+This section explains how to use Hugo shortcodes on the Apache Beam website. Please refer to the [Hugo documentation](https://gohugo.io/content-management/shortcodes/) for more details on their usage.
+
+### Define TableOfContents
+
+To automatically generate a table of contents in a markdown file, simply use:
+
+```
+{{< toc >}}
+```
+
+### Language switching
+
+To add a programming language tab switcher, for instance for java, python and go, use:
+
+```
+{{< language-switcher java py go >}}
+```
+
+Its purpose is to switch the language of code blocks.
+
+### Code highlighting
+
+To be consistent, please prefer the `{{< highlight >}}` syntax over ` ``` ` for code blocks and syntax highlighting.
+
+1. To apply code highlighting to java, python or go, use:
+
+```
+{{< highlight java >}}
+// This is java
+{{< /highlight >}}
+```
+
+2. To apply code highlighting with a wrapper class:
+
+Usage:
+
+```
+{{< highlight class="class-name" >}}
+Write some code here.
+{{< /highlight >}}
+```
+
+Render:
+
+```
+<div class="class-name">
+  <pre>
+    <code>
+    "Write some code here."
+    </code>
+  </pre>
+</div>
+```
+
+The purpose of adding classes or programming languages (java, py or go) in code highlighting is to activate the language switching feature.
+
+### Adding class to markdown text
+
+1. To add a class to a single line in markdown, use:
+
+```
+{{< paragraph class="java-language">}}
+This is an inline markdown text.
+{{< /paragraph >}}
+```
+
+2. To add a class to multiple lines in markdown, use:
+
+```
+{{< paragraph class="java-language" wrap="span">}}
+- This is the first text.
+- This is the second text.
+- This is the third text.
+{{< /paragraph >}}
+```
+
+The purpose of adding classes in markdown text is to activate the language switching feature.
+
+### Table
+
+If you would like to use the markdown table syntax but still get the Bootstrap table styles, wrap your markdown table inside:
+
+```
+{{< table >}}
+A table markdown here.
+{{< /table >}}
+```
+
+### Github sample
+
+The `github_sample` shortcode retrieves a piece of code from GitHub.
+
+Usage:
+
+```
+{{< github_sample "/path/to/file" selected_tag >}}
+```
+
+Example:
+
+```
+{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/complete/game/user_score.py" extract_and_sum_score >}}
+```
+
+### Others
+
+To get the latest release version in markdown:
+
+```
+{{< param release_latest >}}
+```
+
+To get the branch of the repository in markdown:
+
+```
+{{< param branch_repo >}}
+```
+
+To render the capability matrix, please take a look at [this example](/www/site/content/en/documentation/runners/capability-matrix/#beam-capability-matrix).
+
+## What to be replaced in Jekyll
+
+This section briefly describes which Jekyll features are replaced, and how, when writing a new blog post or documentation in Hugo.
+
+1. Redirect to:
+
+The `redirect_to` feature is no longer used, since Hugo doesn't support it. Instead, link to the external URL directly rather than going through an intermediate markdown file.
+
+Currently, there are 3 removed `redirect_to` links which were used in Jekyll:
+
+```
+/contribute/project/team/      # https://home.apache.org/phonebook.html?pmc=beam
+/contribute/team/              # https://home.apache.org/phonebook.html?pmc=beam
+/contribute/design-documents/  # https://cwiki.apache.org/confluence/display/BEAM/Design+Documents
+```
+
+2. Redirect from:
+
+Fortunately, Hugo supports `redirect_from` with `aliases` in the frontmatter.
+
+```
+aliases:
+  - /path/example.html
+```
+
+3. IALs:
+
+The IAL feature is used in Jekyll to add a class to a markdown paragraph, for example `{:.myclass}`. In Hugo, we use shortcodes instead to [add a class to inline texts](#adding-class-to-markdown-text) or [blocks](#code-highlighting).
+
+4. Filenames of blog posts:
+
+In Jekyll, blog post filenames carried the typical date prefix, which causes problems when the date needs to change later. Hugo drops the prefix and keeps the date as metadata in the frontmatter instead.
+
+5. Relative URLs:
+
+`{{ site.baseurl }}` is no longer used, since Hugo handles relative and absolute paths through the config file.
+
+6. Global variables:
+
+The `param` global variables are placed in the [config file](#configuration-walkthrough).
+
+In Jekyll:
+
+```
+{{ site.release_latest }}
+{{ site.branch_repo }}
+```
+
+In Hugo:
+
+```
+{{< param release_latest >}}
+{{< param branch_repo >}}
+```
+
+## Translation guide
+
+In order to add a new language to the Apache Beam website, please follow this guide.
You could take a look at an [example branch](https://github.com/PolideaInternal/beam/tree/example/i18n/) to see how we translated the whole website.
+
+For more details of the syntax, please refer to the [Hugo documentation](https://gohugo.io/content-management/multilingual/). Below are step-by-step instructions for translating our website into Polish, as an example.
+
+1. Configuring a new language
+
+First, we add the following params to our config file `/www/site/config.toml`.
+
+```
+[languages.pl]
+contentDir = "content/pl"
+title = "Apache Beam title"
+description = "Apache Beam description"
+languageName = "Polish"
+weight = 2
+```
+
+2. Translating markdown contents
+
+The `www/site/content/pl` directory will be your main content workspace. Translate all of the markdown files inside `/www/site/content/en` and place them into this workspace. Remember to keep the same directory structure for both, since they share the same layouts.
+
+3. Localizing our strings
+
+Some of the texts live in the layouts, which are HTML files; translate all of these phrases inside `www/site/i18n`. Afterwards, Hugo's `i18n` function does the localization job from our templates. Please follow [our example](https://github.com/PolideaInternal/beam/tree/example/i18n/website/www/site/i18n) to understand the structure.
+
+4. Data files
+
+Consider the following structure for your data directories `/www/site/data`, where `en` and `pl` are your website's respective language codes.
+
+```
+data
+  ├── en
+  │   └── people.yaml
+  └── pl
+      └── people.yaml
+```
+
+Now from your template:
+
+```
+{{ $data := index .Site.Data .Site.Language.Lang }}
+{{ range $data.people }}
+  {{ .name }}
+{{ end }}
+```
+
+5. Section menus
+
+Similar to the markdown content translation, there are separate section menus under `/www/site/layouts/partials/section-menu`, one per language. Take the section menus in the `en` directory, translate them, and place them inside your `pl` directory.
+
+**Note**: if you get stuck while adding a translation, please refer to [our example](https://github.com/PolideaInternal/beam/tree/example/i18n/).
diff --git a/website/Dockerfile b/website/Dockerfile
index 8a5974370752..4c4847481045 100644
--- a/website/Dockerfile
+++ b/website/Dockerfile
@@ -1,33 +1,72 @@
-###############################################################################
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### - -# This image contains Ruby and dependencies required to build and test the Beam +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This image contains Hugo and dependencies required to build and test the Beam # website. It is used by tasks in build.gradle. -FROM ruby:2.5 +FROM debian:stretch-slim + +SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"] + +ENV DEBIAN_FRONTEND=noninteractive \ + LANGUAGE=C.UTF-8 \ + LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 \ + LC_CTYPE=C.UTF-8 \ + LC_MESSAGES=C.UTF-8 + +# Install deps being used by sh files +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + git \ + gnupg2 \ + gosu \ + lynx \ + && apt-get autoremove -yqq --purge \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install node environment +RUN curl -sL https://deb.nodesource.com/setup_10.x | bash - \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + nodejs \ + && apt-get autoremove -yqq --purge \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* -WORKDIR /ruby -RUN gem install bundler -# Update buildDockerImage's inputs.files if you change this list. -ADD Gemfile Gemfile.lock /ruby/ -RUN bundle install --deployment --path $GEM_HOME +# Install yarn +RUN curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - \ + && echo "deb https://dl.yarnpkg.com/debian/ stable main" | tee /etc/apt/sources.list.d/yarn.list \ + && apt-get update \ + && apt-get install -y --no-install-recommends yarn \ + && apt-get autoremove -yqq --purge \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* -# Required for website testing using HTMLProofer. -ENV LC_ALL C.UTF-8 +# Install hugo extended version v0.68.3 +RUN HUGOHOME="$(mktemp -d)" \ + && export HUGOHOME \ + && curl -sL https://github.com/gohugoio/hugo/releases/download/v0.68.3/hugo_extended_0.68.3_Linux-64bit.tar.gz > "${HUGOHOME}/hugo.tar.gz" \ + && tar -xzvf "${HUGOHOME}/hugo.tar.gz" hugo \ + && mv hugo /usr/local/bin/hugo \ + && chmod +x /usr/local/bin/hugo \ + && rm -r "${HUGOHOME}" -CMD sleep 3600 +WORKDIR /opt/ diff --git a/website/Gemfile b/website/Gemfile deleted file mode 100644 index 105030350562..000000000000 --- a/website/Gemfile +++ /dev/null @@ -1,38 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Updates to this file should include a corresponding change to Gemfile.lock. -# See README.md for more info. - -source 'https://rubygems.org' - -gem 'jekyll', '3.6.3' - -# Jekyll plugins -group :jekyll_plugins do - gem 'jekyll-redirect-from' - gem 'jekyll-sass-converter' - gem 'html-proofer' - gem 'jekyll_github_sample' -end - -# Used by Travis tests. -gem 'rake' - -# Force a version lower than 5.0.0.0, which requires a newer ruby than Travis -# supports. -gem 'activesupport', '<5.0.0.0' diff --git a/website/Gemfile.lock b/website/Gemfile.lock deleted file mode 100644 index 9db2ebef047e..000000000000 --- a/website/Gemfile.lock +++ /dev/null @@ -1,94 +0,0 @@ -GEM - remote: https://rubygems.org/ - specs: - activesupport (4.2.11) - i18n (~> 0.7) - minitest (~> 5.1) - thread_safe (~> 0.3, >= 0.3.4) - tzinfo (~> 1.1) - addressable (2.5.2) - public_suffix (>= 2.0.2, < 4.0) - colorator (1.1.0) - colorize (0.8.1) - concurrent-ruby (1.1.4) - ethon (0.11.0) - ffi (>= 1.3.0) - ffi (1.11.1) - forwardable-extended (2.6.0) - html-proofer (3.9.3) - activesupport (>= 4.2, < 6.0) - addressable (~> 2.3) - colorize (~> 0.8) - mercenary (~> 0.3.2) - nokogiri (~> 1.8.1) - parallel (~> 1.3) - typhoeus (~> 1.3) - yell (~> 2.0) - i18n (0.9.5) - concurrent-ruby (~> 1.0) - jekyll (3.6.3) - addressable (~> 2.4) - colorator (~> 1.0) - jekyll-sass-converter (~> 1.0) - jekyll-watch (~> 1.1) - kramdown (~> 1.14) - liquid (~> 4.0) - mercenary (~> 0.3.3) - pathutil (~> 0.9) - rouge (>= 1.7, < 3) - safe_yaml (~> 1.0) - jekyll-redirect-from (0.11.0) - jekyll (>= 2.0) - jekyll-sass-converter (1.5.2) - sass (~> 3.4) - jekyll-watch (1.5.1) - listen (~> 3.0) - jekyll_github_sample (0.3.1) - activesupport (~> 4.0) - jekyll (~> 3.0) - kramdown (1.17.0) - liquid (4.0.3) - listen (3.2.0) - rb-fsevent (~> 0.10, >= 0.10.3) - rb-inotify (~> 0.9, >= 0.9.10) - mercenary (0.3.6) - mini_portile2 (2.3.0) - minitest (5.11.3) - nokogiri (1.8.5) - mini_portile2 (~> 2.3.0) - parallel (1.12.1) - pathutil (0.16.2) - forwardable-extended (~> 2.6) - public_suffix (3.0.3) - rake (12.3.0) - rb-fsevent (0.10.3) - rb-inotify (0.10.0) - ffi (~> 1.0) - rouge (2.2.1) - safe_yaml (1.0.5) - sass (3.7.4) - sass-listen (~> 4.0.0) - sass-listen (4.0.0) - rb-fsevent (~> 0.9, >= 0.9.4) - rb-inotify (~> 0.9, >= 0.9.7) - thread_safe (0.3.6) - typhoeus (1.3.1) - ethon (>= 0.9.0) - tzinfo (1.2.5) - thread_safe (~> 0.1) - yell (2.0.7) - -PLATFORMS - ruby - -DEPENDENCIES - activesupport (< 5.0.0.0) - html-proofer - jekyll (= 3.6.3) - jekyll-redirect-from - jekyll-sass-converter - jekyll_github_sample - rake - -BUNDLED WITH - 1.16.2 diff --git a/website/README.md b/website/README.md index 2ba273b7c0e1..8a2d7a648e72 100644 --- a/website/README.md +++ b/website/README.md @@ -17,12 +17,14 @@ under the License. --> +# Apache Beam website + These are the main sources of the website for Apache Beam, hosted at https://beam.apache.org/. -## About this site +## About -The Beam website is built using [Jekyll](https://jekyllrb.com/). Additionally, +The Beam website is built using [Hugo](https://gohugo.io/). 
Additionally, for additional formatting capabilities, this website uses [Twitter Bootstrap](https://getbootstrap.com/). @@ -30,22 +32,28 @@ Documentation generated from source code, such as Javadoc and Pydoc, is stored separately on the [beam-site repository](https://github.com/apache/beam-site/tree/release-docs). -## Active development +## Getting started Website development requires Docker installed if you wish to preview changes and run website tests. -The following command is used to build and serve the website locally. +The Docsy theme required for the site to work properly is included as a git submodule. This means that after you already cloned the repository, you need to update submodules at ``. + +`$ git submodule update --init --recursive` + +The following command is used to build and serve the website locally. Note: you should run the command at ``. - $ ./gradlew :website:serveWebsite +`$ ./gradlew :website:serveWebsite` Any changes made locally will trigger a rebuild of the website. Websites tests may be run using this command: - $ ./gradlew :website:testWebsite +`$ ./gradlew :website:testWebsite` -## Website push +For a more detailed description, please refer to the [contribution guide](CONTRIBUTE.md). + +## Deployment After a PR is merged, a background Jenkins job will automatically generate and push [website @@ -53,22 +61,6 @@ content](https://github.com/apache/beam/tree/asf-site/website/generated-content) to the asf-site branch. This content is later picked up and pushed to https://beam.apache.org/. -## Additional Information - -### Writing blog posts - -Blog posts are created in the `_posts` directory. - -If this is your first post, make sure to add yourself to `_data\authors.yml`. - -While you a working on your post before the publishing time listed in its header, -add `--future` when running Jekyll in order to view your draft on your local copy of -the site. - -### Adding Jekyll plugins - -If you modify the site to use additional Jekyll plugins, add them in `Gemfile` -and then run `bundle update`, which will regenerate the complete `Gemfile.lock`. -Make sure that the updated `Gemfile.lock` is included in your pull request. For more information, -see the Bundler [documentation](https://bundler.io/v1.3/rationale.html). +## Contribution guide +If you'd like to contribute to the Apache Beam website, read our [contribution guide](CONTRIBUTE.md) where you can find detailed instructions on how to work with the website. \ No newline at end of file diff --git a/website/Rakefile b/website/Rakefile deleted file mode 100644 index 8a0e6dea4af2..000000000000 --- a/website/Rakefile +++ /dev/null @@ -1,25 +0,0 @@ -require 'fileutils' -require 'html-proofer' -require 'etc' - -task :test do - HTMLProofer.check_directory("./generated-local-content", { - :typhoeus => { - :timeout => 120, - }, - :hydra => { - :max_concurrency => 5, - }, - :allow_hash_href => true, - :check_html => true, - :file_ignore => [/v2/], - :url_ignore => [ - /104.154.241.245/ # Preccommit dashboard is down [BEAM-6455]. - # To ignore link checking for a URL, i.e. if it is temporariliy down, - # add a URL regex below with a tracking JIRA issue. 
For example: - # /example.com/, # BEAM-1234 failing due to expired SSL cert - ], - :parallel => { :in_processes => 4 }, - :disable_external => ENV["disable_external"], - }).run -end diff --git a/website/_config.yml b/website/_config.yml deleted file mode 100644 index 1a0c1db07a45..000000000000 --- a/website/_config.yml +++ /dev/null @@ -1,70 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Welcome to Jekyll! - -# This config file is meant for settings that affect your whole blog, values -# which you are expected to set up once and rarely need to edit after that. -# For technical reasons, this file is *NOT* reloaded automatically when you use -# 'jekyll serve'. If you change this file, please restart the server process. - -# Site settings -title: Apache Beam -description: > # this means to ignore newlines until "baseurl:" - Apache Beam is an open source, unified model and set of language-specific - SDKs for defining and executing data processing workflows, and also data - ingestion and integration flows, supporting Enterprise Integration Patterns - (EIPs) and Domain Specific Languages (DSLs). Dataflow pipelines simplify the - mechanics of large-scale batch and streaming data processing and can run on a - number of runtimes like Apache Flink, Apache Spark, and Google Cloud Dataflow - (a cloud service). Beam also brings DSL in different languages, allowing - users to easily implement their data integration processes. - -# input directory -source: src - -# the base hostname & protocol for your site -url: "https://beam.apache.org" - -# The repository and branch where the files live in Github or Colab. This is used -# to serve and stage from your local branch, but publish to the master branch. -# e.g. https://github.com/{{ site.branch_repo }}/path/to/notebook.ipynb -# e.g. https://colab.sandbox.google.com/github/{{ site.branch_repo }}/path/to/notebook.ipynb -branch_repo: "apache/beam/blob/master" - -twitter_username: apachebeam - -# Build settings -markdown: kramdown - -collections: -- beam_team - -# Things to include that are subdirectories created by sphinx -include: ['_static', '_modules', '_sources', '.htaccess'] - -# Things to ignore in the build -exclude: ['README.md', 'Gemfile.lock', 'Gemfile', 'Rakefile', 'vendor/'] - -# Downloads directory -downloads: downloads - -# Don't use the page title in the table of contents -kramdown: - toc_levels: 2..6 - -# The most recent release of Beam. -release_latest: 2.20.0 - -# Plugins are configured in the Gemfile. 
-
-# Set the time zone for site generation, fixed to US Pacific Time
-timezone: America/Los_Angeles
diff --git a/website/append_index_html_to_internal_links.py b/website/append_index_html_to_internal_links.py
index feb0a62c5a2c..32c3d46163ab 100644
--- a/website/append_index_html_to_internal_links.py
+++ b/website/append_index_html_to_internal_links.py
@@ -72,6 +72,30 @@
     print('Fixing links in: ' + match)
     mf = open(match)
     soup = BeautifulSoup(mf)
+
+    # Iterates over every <meta> which is used for aliases - redirected links
+    for meta in soup.findAll('meta'):
+      try:
+        content = meta['content']
+        alias = content.replace('0; url=', '')
+        if re.match(linkMatch, alias) is not None:
+          if alias.endswith('/'):
+            # /internal/link/
+            meta['content'] = content + 'index.html'
+          else:
+            # /internal/link
+            meta['content'] = content + '/index.html'
+          mf.close()
+
+          html = unicode(soup).encode('utf-8')
+          # Write back to the file.
+          with open(match, "wb") as f:
+            print('Replacing ' + content + ' with: ' + meta['content'])
+            f.write(html)
+      except KeyError as e:
+        # Some <meta> tags don't have url.
+        continue
+
     # Iterates over every <a>
     for a in soup.findAll('a'):
       try:
diff --git a/website/build.gradle b/website/build.gradle
index a1076e3197a9..235cb8426e6d 100644
--- a/website/build.gradle
+++ b/website/build.gradle
@@ -22,8 +22,10 @@ plugins {
 }
 
 def dockerImageTag = 'beam-website'
-def dockerWorkDir = "/repo"
+def dockerWorkDir = "/opt"
 def buildDir = "${project.rootDir}/build/website"
+def dockerSourceDir = "$dockerWorkDir/website/www"
+def dockerBuildDir = "$dockerWorkDir/build/website"
 def commitedChanges = false
 
 def gitboxUrl = project.findProperty('gitPublishRemote') ?: 'https://gitbox.apache.org/repos/asf/beam.git'
@@ -53,7 +55,6 @@ task setupVirtualenv {
 }
 
 task buildDockerImage(type: Exec) {
-  inputs.files 'Gemfile', 'Gemfile.lock'
   commandLine 'docker', 'build', '-t', dockerImageTag, '.'
 }
 
@@ -66,8 +67,8 @@ task createDockerContainer(type: Exec) {
   gradle.taskGraph.whenReady {
     def extraOptions = ''
     if (gradle.taskGraph.hasTask(":${project.name}:serveWebsite")) {
-      // Publish port 4000 where Jekyll serves website from
-      extraOptions = '--publish 127.0.0.1:4000:4000'
+      // Publish port 1313 where Hugo serves website from
+      extraOptions = "--publish 127.0.0.1:1313:1313"
     }
 
     // Jenkins websites node: run as root. Files written to /repo will get
@@ -78,7 +79,7 @@ task createDockerContainer(type: Exec) {
       extraOptions += " -u \$(id -u):\$(id -g)"
     }
     commandLine '/bin/bash', '-c',
-        "docker create -v $project.rootDir:$dockerWorkDir $extraOptions $dockerImageTag"
+        "docker create -v $project.rootDir:$dockerWorkDir $extraOptions $dockerImageTag sh -c 'trap \"exit 0\" INT; while true; do sleep 30; done;'"
   }
 }
 
@@ -91,8 +92,28 @@ task startDockerContainer(type: Exec) {
      "${->createDockerContainer.containerId()}" // Lazily evaluate containerId.
 }
 
+// Clone Docsy submodule which is our Hugo theme
+task initGitSubmodules(type: Exec) {
+  commandLine 'docker', 'exec',
+    "${->startDockerContainer.containerId()}", 'git', 'submodule', 'update', '--init', '--recursive'
+}
+
+// Install yarn dependencies
+task installDependencies(type: Exec) {
+  commandLine 'docker', 'exec', '--workdir', "$dockerSourceDir",
+    "${->startDockerContainer.containerId()}", 'yarn', 'install'
+}
+
+// Run build_github_samples.sh to fetch github content
+// which is used by github_sample shortcodes to inject snippets into codeblocks
+task buildGithubSamples(type: Exec) {
+  commandLine 'docker', 'exec', '--workdir', "$dockerSourceDir",
+    "${->startDockerContainer.containerId()}", 'yarn', 'build_github_samples'
+}
+
 task setupDockerContainer(type: Exec) {
   dependsOn startDockerContainer
+  finalizedBy initGitSubmodules, installDependencies, buildGithubSamples
   ext.containerId = {
     return startDockerContainer.containerId()
   }
@@ -100,21 +121,13 @@ task setupDockerContainer(type: Exec) {
   // Create the config to point to a GitHub or Colab blob in the repo, e.g. apache/beam/blob/master
   commandLine 'docker', 'exec', '-u', 'root', "${->startDockerContainer.containerId()}",
     '/bin/bash', '-c',
-    """echo 'branch_repo: "${getBranchRepo()}"' > /tmp/_config_branch_repo.yml"""
+    """echo '[params]\n  branch_repo = "${getBranchRepo()}"' > /tmp/_config_branch_repo.toml"""
 }
 
 task stopAndRemoveDockerContainer(type: Exec) {
   commandLine 'docker', 'rm', '-f', "${->createDockerContainer.containerId()}"
 }
 
-task setupBuildDir(type: Copy) {
-  from('.') {
-    include 'Gemfile*'
-    include 'Rakefile'
-  }
-  into buildDir
-}
-
 task cleanWebsite(type: Delete) {
   delete buildDir
 }
@@ -123,35 +136,24 @@ clean.dependsOn cleanWebsite
 
 class BuildTaskConfiguration {
   String name
   boolean useTestConfig = false
-  boolean useBranchRepoConfig = false
   String baseUrl = ''
   String dockerWorkDir = ''
 }
 
 def createBuildTask = {
   BuildTaskConfiguration config = it as BuildTaskConfiguration
   task "build${config.name}Website" (type:Exec) {
-    dependsOn setupDockerContainer, setupBuildDir
+    dependsOn setupDockerContainer
     finalizedBy stopAndRemoveDockerContainer
-    inputs.files 'Gemfile.lock', '_config.yml'
-    inputs.dir 'src'
-    outputs.dir "$buildDir/.sass-cache"
-    outputs.dir buildContentDir(config.name)
-    def configs = "${config.dockerWorkDir}/website/_config.yml"
-    if (config.useTestConfig) {
-      configs += ",${config.dockerWorkDir}/website/_config_test.yml"
-    }
-    if (config.useBranchRepoConfig) {
-      configs += ",/tmp/_config_branch_repo.yml"
-    }
-    def baseUrlFlag = config.baseUrl ? "--baseurl=/${config.baseUrl}" : ""
+
+    def configs = "$dockerSourceDir/site/config.toml"
+    def baseUrlFlag = config.baseUrl ?
+      "--baseURL /${config.baseUrl}" : ""
     commandLine 'docker', 'exec', "${->setupDockerContainer.containerId()}", '/bin/bash', '-c',
-      """cd ${config.dockerWorkDir}/build/website && \
-        bundle exec jekyll build \
-        --destination generated-${config.name.toLowerCase()}-content \
-        --config ${configs} \
-        --incremental ${baseUrlFlag} \
-        --source ${config.dockerWorkDir}/website/src
+      """cd $dockerSourceDir && \
+        yarn build \
+        -d $dockerBuildDir/generated-${config.name.toLowerCase()}-content \
+        --config $configs \
+        $baseUrlFlag
      """
   }
 }
@@ -159,9 +161,6 @@ def createBuildTask = {
 // task buildLocalWebsite
 createBuildTask(
   name:'Local',
-  useTestConfig: true,
-  useBranchRepoConfig: true,
-  dockerWorkDir: dockerWorkDir,
 )
 task buildWebsite(dependsOn:buildLocalWebsite)
 build.dependsOn buildWebsite
@@ -169,16 +168,12 @@ build.dependsOn buildWebsite
 // task buildGcsWebsite
 createBuildTask(
   name:'Gcs',
-  useTestConfig: true,
-  useBranchRepoConfig: true,
   baseUrl: getBaseUrl(),
-  dockerWorkDir: dockerWorkDir,
 )
 
 // task buildApacheWebsite
 createBuildTask(
   name:'Apache',
-  dockerWorkDir: dockerWorkDir,
 )
 
 /**
@@ -236,20 +231,14 @@ def buildContentDir(name) {
 }
 
 task serveWebsite(type: Exec) {
-  dependsOn setupDockerContainer, setupBuildDir
+  dependsOn setupDockerContainer
   finalizedBy stopAndRemoveDockerContainer
-  inputs.files 'Gemfile.lock', '_config.yml'
-  inputs.dir 'src'
-  outputs.dir "$buildDir/.sass-cache"
-  outputs.dir buildContentDir('local')
   commandLine 'docker', 'exec', "${->setupDockerContainer.containerId()}", '/bin/bash', '-c',
-    """cd $dockerWorkDir/build/website && \
-      bundle exec jekyll serve \
-      --config $dockerWorkDir/website/_config.yml,/tmp/_config_branch_repo.yml \
-      --incremental \
-      --source $dockerWorkDir/website/src \
-      --host 0.0.0.0
+    """cd $dockerSourceDir && \
+      yarn develop \
+      --bind="0.0.0.0" \
+      --config $dockerSourceDir/site/config.toml,/tmp/_config_branch_repo.toml
    """
 }
 
@@ -257,12 +246,9 @@
 task testWebsite(type: Exec) {
   // dependsOn setupDockerContainer, 'buildWebsite'
   finalizedBy stopAndRemoveDockerContainer
-  inputs.files "$buildDir/Rakefile"
-  inputs.dir buildContentDir('local')
   commandLine 'docker', 'exec', "${->setupDockerContainer.containerId()}", '/bin/bash', '-c',
-    """cd $dockerWorkDir/build/website && \
-      bundle exec -- rake test disable_external=${findProperty('disableExternal') ?: true}"""
+    "$dockerSourceDir/check-links.sh $dockerBuildDir/generated-local-content"
 }
 
 testWebsite.dependsOn 'buildLocalWebsite'
diff --git a/website/src/_data/authors.yml b/website/src/_data/authors.yml
deleted file mode 100644
index f4e5fbe7f9ef..000000000000
--- a/website/src/_data/authors.yml
+++ /dev/null
@@ -1,157 +0,0 @@
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Welcome to Jekyll!
- -aizhamal: - name: Aizhamal Nurmamat kyzy - email: aizhamal@google.com - twitter: iamaijamal -aljoscha: - name: Aljoscha Krettek - email: aljoscha@apache.org - twitter: aljoscha -altay: - name: Ahmet Altay - email: altay@apache.org -angoenka: - name: Ankur Goenka - email: goenka@apache.org -anton: - name: Anton Kedin - email: anton@apache.org -ccy: - name: Charles Chen - email: ccy@apache.org -chamikara: - name: Chamikara Jayalath - email: chamikara@apache.org -davor: - name: Davor Bonaci - email: davor@apache.org - twitter: BonaciDavor -dhalperi: - name: Dan Halperin - email: dhalperi@apache.org - twitter: -fjp: - name: Frances Perry - email: frances@apache.org - twitter: francesjperry -harshithdwivedi: - name: Harshit Dwivedi - email: harshithdwivedi@gmail.com - twitter: harshithdwivedi -henryken: - name: Henry Suryawirawan - email: henry.ken@gmail.com - twitter: henry_ken -iemejia: - name: Ismaël Mejía - email: iemejia@apache.org - twitter: iemejia -jamesmalone: - name: James Malone - email: jamesmalone@apache.org - twitter: chimerasaurus -jesseanderson: - name: Jesse Anderson - twitter: jessetanderson -jphalip: - name: Julien Phalip - email: jphalip@google.com - twitter: julienphalip -klk: - name: Kenneth Knowles - email: klk@apache.org - twitter: KennKnowles -lkuligin: - name: Leonid Kuligin - email: kuligin@google.com - twitter: lkulighin -markliu: - name: Mark Liu - email: markliu@apache.org - twitter: -ardagan: - name: Mikhail Gryzykhin - email: mikhail@apache.org - twitter: -robertwb: - name: Robert Bradshaw - email: robertwb@apache.org - twitter: -takidau: - name: Tyler Akidau - email: takidau@apache.org - twitter: takidau -tgroh: - name: Thomas Groh - email: tgroh@google.com -thw: - name: Thomas Weise - email: thw@apache.org - twitter: thweise -jkff: - name: Eugene Kirpichov - email: ekirpichov@gmail.com - twitter: -jbonofre: - name: Jean-Baptiste Onofré - email: jbonofre@apache.org - twitter: jbonofre -ianand: - name: Anand Iyer - email: ianand@google.com - twitter: -aromanenko: - name: Alexey Romanenko - email: aromanenko@apache.org - twitter: alexromdev -pabloem: - name: Pablo Estrada - email: pabloem@apache.org - twitter: polecitoem -rfernand: - name: Rafael Fernández - email: rfernand@google.com -mbaetens: - name: Matthias Baetens - email: baetensmatthias@gmail.com - twitter: matthiasbaetens -rez: - name: Reza Rokni - email: rez@google.com - twitter: rarokni -ttanay: - name: Tanay Tummalapalli - email: ttanay100@gmail.com - twitter: ttanay100 -udim: - name: Udi Meiri - email: udim@apache.org - twitter: udim -boyuanzz: - name: Boyuan Zhang - email: boyuanz@apache.org - twitter: -amaliujia: - name: Rui Wang - email: amaliujia@apache.org - twitter: -mxm: - name: Maximilian Michels - email: mxm@apache.org - twitter: stadtlegende -pedro: - name: Pedro Galvan - email: pedro@sg.com.mx - twitter: pedrogk diff --git a/website/src/_data/capability-matrix.yml b/website/src/_data/capability-matrix.yml deleted file mode 100644 index 307824bed7ba..000000000000 --- a/website/src/_data/capability-matrix.yml +++ /dev/null @@ -1,1708 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# Welcome to Jekyll! - -columns: - - class: model - name: Beam Model - - class: dataflow - name: Google Cloud Dataflow - - class: flink - name: Apache Flink - - class: spark-rdd - name: Apache Spark (RDD/DStream based) - - class: spark-dataset - name: Apache Spark Structured Streaming (Dataset based) - - class: apex - name: Apache Apex - - class: gearpump - name: Apache Gearpump - - class: mapreduce - name: Apache Hadoop MapReduce - - class: jstorm - name: JStorm - - class: ibmstreams - name: IBM Streams - - class: samza - name: Apache Samza - - class: nemo - name: Apache Nemo - - class: jet - name: Hazelcast Jet - -categories: - - description: What is being computed? - anchor: what - color-b: 'ca1' - color-y: 'ec3' - color-p: 'fe5' - color-n: 'ddd' - rows: - - name: ParDo - values: - - class: model - l1: 'Yes' - l2: element-wise processing - l3: Element-wise transformation parameterized by a chunk of user code. Elements are processed in bundles, with initialization and termination hooks. Bundle size is chosen by the runner and cannot be controlled by user code. ParDo processes a main input PCollection one element at a time, but provides side input access to additional PCollections. - - class: dataflow - l1: 'Yes' - l2: fully supported - l3: Batch mode uses large bundle sizes. Streaming uses smaller bundle sizes. - - class: flink - l1: 'Yes' - l2: fully supported - l3: ParDo itself, as per-element transformation with UDFs, is fully supported by Flink for both batch and streaming. - - class: spark-rdd - l1: 'Yes' - l2: fully supported - l3: ParDo applies per-element transformations as Spark FlatMapFunction. - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: ParDo applies per-element transformations as Spark FlatMapFunction. - - class: apex - l1: 'Yes' - l2: fully supported - l3: Supported through Apex operator that wraps the function and processes data as single element bundles. - - class: gearpump - l1: 'Yes' - l2: fully supported - l3: Gearpump wraps the per-element transformation function into processor execution. - - class: mapreduce - l1: 'Yes' - l2: fully supported - l3: '' - - class: jstorm - l1: 'Yes' - l2: fully supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: fully supported - l3: '' - - class: samza - l1: 'Yes' - l2: fully supported - l3: Supported with per-element transformation. - - class: nemo - l1: 'Yes' - l2: fully supported - l3: '' - - class: jet - l1: 'Yes' - l2: fully supported - l3: '' - - name: GroupByKey - values: - - class: model - l1: 'Yes' - l2: key grouping - l3: Grouping of key-value pairs per key, window, and pane. (See also other tabs.) - - class: dataflow - l1: 'Yes' - l2: fully supported - l3: '' - - class: flink - l1: 'Yes' - l2: fully supported - l3: "Uses Flink's keyBy for key grouping. When grouping by window in streaming (creating the panes) the Flink runner uses the Beam code. This guarantees support for all windowing and triggering mechanisms." - - class: spark-rdd - l1: 'Partially' - l2: fully supported in batch mode - l3: "Using Spark's groupByKey. GroupByKey with multiple trigger firings in streaming mode is a work in progress." - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: "Using Spark's groupByKey." 
- - class: apex - l1: 'Yes' - l2: fully supported - l3: "Apex runner uses the Beam code for grouping by window and thereby has support for all windowing and triggering mechanisms. Runner does not implement partitioning yet (BEAM-838)" - - class: gearpump - l1: 'Yes' - l2: fully supported - l3: "Use Gearpump's groupBy and window for key grouping and translate Beam's windowing and triggering to Gearpump's internal implementation." - - class: mapreduce - l1: 'Yes' - l2: fully supported - l3: '' - - class: jstorm - l1: 'Yes' - l2: fully supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: fully supported - l3: '' - - class: samza - l1: 'Yes' - l2: fully supported - l3: "Uses Samza's partitionBy for key grouping and Beam's logic for window aggregation and triggering." - - class: nemo - l1: 'Yes' - l2: fully supported - l3: '' - - class: jet - l1: 'Yes' - l2: fully supported - l3: '' - - name: Flatten - values: - - class: model - l1: 'Yes' - l2: collection concatenation - l3: Concatenates multiple homogenously typed collections together. - - class: dataflow - l1: 'Yes' - l2: fully supported - l3: '' - - class: flink - l1: 'Yes' - l2: fully supported - l3: '' - - class: spark-rdd - l1: 'Yes' - l2: fully supported - l3: '' - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: Some corner cases like flatten on empty collections are not yet supported. - - class: apex - l1: 'Yes' - l2: fully supported - l3: '' - - class: gearpump - l1: 'Yes' - l2: fully supported - l3: '' - - class: mapreduce - l1: 'Yes' - l2: fully supported - l3: '' - - class: jstorm - l1: 'Yes' - l2: fully supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: fully supported - l3: '' - - class: samza - l1: 'Yes' - l2: fully supported - l3: '' - - class: nemo - l1: 'Yes' - l2: fully supported - l3: '' - - class: jet - l1: 'Yes' - l2: fully supported - l3: '' - - name: Combine - values: - - class: model - l1: 'Yes' - l2: associative & commutative aggregation - l3: 'Application of an associative, commutative operation over all values ("globally") or over all values associated with each key ("per key"). Can be implemented using ParDo, but often more efficient implementations exist.' - - class: dataflow - l1: 'Yes' - l2: 'efficient execution' - l3: '' - - class: flink - l1: 'Yes' - l2: 'fully supported' - l3: Uses a combiner for pre-aggregation for batch and streaming. - - class: spark-rdd - l1: 'Yes' - l2: fully supported - l3: "Using Spark's combineByKey and aggregate functions." - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: "Using Spark's Aggregator and agg function" - - class: apex - l1: 'Yes' - l2: 'fully supported' - l3: "Default Beam translation. Currently no efficient pre-aggregation (BEAM-935)." - - class: gearpump - l1: 'Yes' - l2: fully supported - l3: '' - - class: mapreduce - l1: 'Yes' - l2: fully supported - l3: '' - - class: jstorm - l1: 'Yes' - l2: fully supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: fully supported - l3: '' - - class: samza - l1: 'Yes' - l2: fully supported - l3: Use combiner for efficient pre-aggregation. - - class: nemo - l1: 'Yes' - l2: fully supported - l3: 'Batch mode uses pre-aggregation' - - class: jet - l1: 'Yes' - l2: fully supported - l3: 'Batch mode uses pre-aggregation' - - name: Composite Transforms - values: - - class: model - l1: 'Yes' - l2: user-defined transformation subgraphs - l3: Allows easy extensibility for library writers. 
In the near future, we expect there to be more information provided at this level -- customized metadata hooks for monitoring, additional runtime/environment hooks, etc. - - class: dataflow - l1: 'Partially' - l2: supported via inlining - l3: Currently composite transformations are inlined during execution. The structure is later recreated from the names, but other transform level information (if added to the model) will be lost. - - class: flink - l1: 'Partially' - l2: supported via inlining - l3: '' - - class: spark-rdd - l1: 'Partially' - l2: supported via inlining - l3: '' - - class: spark-dataset - l1: 'Partially' - l2: supported via inlining only in batch mode - l3: '' - - class: apex - l1: 'Partially' - l2: supported via inlining - l3: '' - - class: gearpump - l1: 'Partially' - l2: supported via inlining - l3: '' - - class: mapreduce - l1: 'Yes' - l2: fully supported - l3: '' - - class: jstorm - l1: 'Yes' - l2: fully supported - l3: '' - - class: ibmstreams - l1: 'Partially' - l2: supported via inlining - l3: '' - - class: samza - l1: 'Partially' - l2: supported via inlining - l3: '' - - class: nemo - l1: 'Yes' - l2: fully supported - l3: '' - - class: jet - l1: 'Partially' - l2: supported via inlining - l3: '' - - name: Side Inputs - values: - - class: model - l1: 'Yes' - l2: additional elements available during DoFn execution - l3: Side inputs are additional PCollections whose contents are computed during pipeline execution and then made accessible to DoFn code. The exact shape of the side input depends both on the PCollectionView used to describe the access pattern (interable, map, singleton) and the window of the element from the main input that is currently being processed. - - class: dataflow - l1: 'Yes' - l2: some size restrictions in streaming - l3: Batch mode supports a distributed implementation, but streaming mode may force some size restrictions. Neither mode is able to push lookups directly up into key-based sources. - - class: flink - l1: 'Yes' - l2: some size restrictions in streaming - l3: Batch mode supports a distributed implementation, but streaming mode may force some size restrictions. Neither mode is able to push lookups directly up into key-based sources. - - class: spark-rdd - l1: 'Yes' - l2: fully supported - l3: "Using Spark's broadcast variables. In streaming mode, side inputs may update but only between micro-batches." - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: "Using Spark's broadcast variables." - - class: apex - l1: 'Yes' - l2: size restrictions - l3: No distributed implementation and therefore size restrictions. - - class: gearpump - l1: 'Yes' - l2: fully supported - l3: Implemented by merging side input as a normal stream in Gearpump - - class: mapreduce - l1: 'Yes' - l2: fully supported - l3: '' - - class: jstorm - l1: 'Yes' - l2: some size restrictions - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: fully supported - l3: '' - - class: samza - l1: 'Yes' - l2: fully supported - l3: Uses Samza's broadcast operator to distribute the side inputs. - - class: nemo - l1: 'Yes' - l2: fully supported - l3: '' - - class: jet - l1: 'Partially' - l2: with restrictions - l3: Supported only when the side input source is bounded and windowing uses global window - - name: Source API - values: - - class: model - l1: 'Yes' - l2: user-defined sources - l3: Allows users to provide additional input sources. Supports both bounded and unbounded data. 
Includes hooks necessary to provide efficient parallelization (size estimation, progress information, dynamic splitting, etc). - - class: dataflow - l1: 'Yes' - l2: fully supported - l3: Support includes autotuning features (https://cloud.google.com/dataflow/service/dataflow-service-desc#autotuning-features). - - class: flink - l1: 'Yes' - l2: fully supported - l3: - - class: spark-rdd - l1: 'Yes' - l2: fully supported - l3: - - class: spark-dataset - l1: 'Partially' - l2: bounded source only - l3: "Using Spark's DatasourceV2 API in microbatch mode (Continuous streaming mode is tagged experimental in spark and does not support aggregation)." - - class: apex - l1: 'Yes' - l2: fully supported - l3: - - class: gearpump - l1: 'Yes' - l2: fully supported - l3: '' - - class: mapreduce - l1: 'Partially' - l2: bounded source only - l3: '' - - class: jstorm - l1: 'Yes' - l2: fully supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: fully supported - l3: '' - - class: samza - l1: 'Yes' - l2: fully supported - l3: '' - - class: nemo - l1: 'Yes' - l2: fully supported - l3: '' - - class: jet - l1: 'Yes' - l2: fully supported - l3: '' - - name: Splittable DoFn (SDF) - values: - - class: model - l1: 'Partially' - l2: DoFn where processing of each element can be split for parallelism, or suspended and resumed - l3: Allows users to develop DoFn's that process a single element in portions ("restrictions"), executed in parallel or sequentially. This supersedes the unbounded and bounded `Source` APIs by supporting all of their features on a per-element basis. See http://s.apache.org/splittable-do-fn. Design is in progress on achieving parity with Source API regarding progress signals. - - class: dataflow - l1: 'Yes' - l2: - l3: Does not yet support autotuning features of the Source API. - - class: flink - l1: 'Yes' - l2: - l3: - - class: spark-rdd - l1: 'Partially' - l2: supports bounded-per-element SDFs - l3: - - class: spark-dataset - l1: 'No' - l2: not implemented - l3: - - class: apex - l1: 'Partially' - l2: supports bounded-per-element SDFs - l3: implementation in streaming mode coming soon - - class: gearpump - l1: 'Partially' - l2: supports bounded-per-element SDFs - l3: - - class: mapreduce - l1: 'No' - l2: not implemented - l3: - - class: jstorm - l1: 'No' - l2: not implemented - l3: - - class: ibmstreams - l1: 'No' - l2: not implemented - l3: - - class: samza - l1: 'Partially' - l2: supports bounded-per-element SDFs - l3: - - class: nemo - l1: 'No' - l2: not implemented - l3: '' - - class: jet - l1: 'No' - l2: not implemented - l3: '' - - name: Metrics - values: - - class: model - l1: 'Partially' - l2: user-provided metrics - l3: Allow transforms to gather simple metrics across bundles in a PTransform. Provide a mechanism to obtain both committed and attempted metrics. Semantically similar to using an additional output, but support partial results as the transform executes, and support both committed and attempted values. Will likely want to augment Metrics to be more useful for processing unbounded data by making them windowed. - - class: dataflow - l1: 'Partially' - l2: '' - l3: Gauge metrics are not supported. All other metric types are supported. - - class: flink - l1: 'Partially' - l2: All metrics types are supported. - l3: Only attempted values are supported. No committed values for metrics. - - class: spark-rdd - l1: 'Partially' - l2: All metric types are supported. - l3: Only attempted values are supported. No committed values for metrics. 
- - class: spark-dataset - l1: 'Partially' - l2: All metric types are supported in batch mode. - l3: Only attempted values are supported. No committed values for metrics. - - class: apex - l1: 'No' - l2: Not implemented in runner. - l3: - - class: gearpump - l1: 'No' - l2: '' - l3: not implemented - - class: mapreduce - l1: 'Partially' - l2: Only attempted counters are supported - l3: '' - - class: jstorm - l1: 'Partially' - l2: Metrics are only supported in local mode. - l3: '' - - class: ibmstreams - l1: 'Partially' - l2: All metrics types are supported. - l3: Only attempted values are supported. No committed values for metrics. - - class: samza - l1: 'Partially' - l2: Counter and Gauge are supported. - l3: Only attempted values are supported. No committed values for metrics. - - class: nemo - l1: 'No' - l2: not implemented - l3: '' - - class: jet - l1: 'Partially' - l2: All metrics types supported, both in batching and streaming mode. - l3: Doesn't differentiate between committed and attempted values. - - name: Stateful Processing - values: - - class: model - l1: 'Yes' - l2: storage per key, per window - l3: Allows fine-grained access to per-key, per-window persistent state. Necessary for certain use cases (e.g. high-volume windows which store large amounts of data, but typically only access small portions of it; complex state machines; etc.) that are not easily or efficiently addressed via Combine or GroupByKey+ParDo. - - class: dataflow - l1: 'Partially' - l2: non-merging windows - l3: State is supported for non-merging windows. SetState and MapState are not yet supported. - - class: flink - l1: 'Partially' - l2: non-merging windows - l3: State is supported for non-merging windows. SetState and MapState are not yet supported. - - class: spark-rdd - l1: 'Partially' - l2: full support in batch mode - l3: - - class: spark-dataset - l1: 'No' - l2: not implemented - l3: - - class: apex - l1: 'Partially' - l2: non-merging windows - l3: State is supported for non-merging windows. SetState and MapState are not yet supported. - - class: gearpump - l1: 'No' - l2: not implemented - l3: '' - - class: mapreduce - l1: 'Partially' - l2: non-merging windows - l3: '' - - class: jstorm - l1: 'Partially' - l2: non-merging windows - l3: '' - - class: ibmstreams - l1: 'Partially' - l2: non-merging windows - l3: '' - - class: samza - l1: 'Partially' - l2: non-merging windows - l3: 'States are backed up by either rocksDb KV store or in-memory hash map, and persist using changelog.' - - class: nemo - l1: 'No' - l2: not implemented - l3: '' - - class: jet - l1: 'Partially' - l2: non-merging windows - l3: '' - - description: Where in event time? - anchor: where - color-b: '37d' - color-y: '59f' - color-p: '8cf' - color-n: 'ddd' - rows: - - name: Global windows - values: - - class: model - l1: 'Yes' - l2: all time - l3: The default window which covers all of time. (Basically how traditional batch cases fit in the model.) 
- - class: dataflow - l1: 'Yes' - l2: default - l3: '' - - class: flink - l1: 'Yes' - l2: supported - l3: '' - - class: spark-rdd - l1: 'Yes' - l2: supported - l3: '' - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: '' - - class: apex - l1: 'Yes' - l2: supported - l3: '' - - class: gearpump - l1: 'Yes' - l2: supported - l3: '' - - class: mapreduce - l1: 'Yes' - l2: supported - l3: '' - - class: jstorm - l1: 'Yes' - l2: supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: supported - l3: '' - - class: samza - l1: 'Yes' - l2: supported - l3: '' - - class: nemo - l1: 'Yes' - l2: supported - l3: '' - - class: jet - l1: 'Yes' - l2: supported - l3: '' - - name: Fixed windows - values: - - class: model - l1: 'Yes' - l2: periodic, non-overlapping - l3: Fixed-size, timestamp-based windows. (Hourly, Daily, etc) - - class: dataflow - l1: 'Yes' - l2: built-in - l3: '' - - class: flink - l1: 'Yes' - l2: supported - l3: '' - - class: spark-rdd - l1: 'Yes' - l2: supported - l3: '' - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: '' - - class: apex - l1: 'Yes' - l2: supported - l3: '' - - class: gearpump - l1: 'Yes' - l2: supported - l3: '' - - class: mapreduce - l1: 'Yes' - l2: supported - l3: '' - - class: jstorm - l1: 'Yes' - l2: supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: supported - l3: '' - - class: samza - l1: 'Yes' - l2: supported - l3: '' - - class: nemo - l1: 'Yes' - l2: supported - l3: '' - - class: jet - l1: 'Yes' - l2: supported - l3: '' - - name: Sliding windows - values: - - class: model - l1: 'Yes' - l2: periodic, overlapping - l3: Possibly overlapping fixed-size timestamp-based windows (Every minute, use the last ten minutes of data.) - - class: dataflow - l1: 'Yes' - l2: built-in - l3: '' - - class: flink - l1: 'Yes' - l2: supported - l3: '' - - class: spark-rdd - l1: 'Yes' - l2: supported - l3: '' - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: '' - - class: apex - l1: 'Yes' - l2: supported - l3: '' - - class: gearpump - l1: 'Yes' - l2: supported - l3: '' - - class: mapreduce - l1: 'Yes' - l2: supported - l3: '' - - class: jstorm - l1: 'Yes' - l2: supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: supported - l3: '' - - class: samza - l1: 'Yes' - l2: supported - l3: '' - - class: nemo - l1: 'Yes' - l2: supported - l3: '' - - class: jet - l1: 'Yes' - l2: supported - l3: '' - - name: Session windows - values: - - class: model - l1: 'Yes' - l2: activity-based - l3: Based on bursts of activity separated by a gap size. Different per key. - - class: dataflow - l1: 'Yes' - l2: built-in - l3: '' - - class: flink - l1: 'Yes' - l2: supported - l3: '' - - class: spark-rdd - l1: 'Yes' - l2: supported - l3: '' - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: '' - - class: apex - l1: 'Yes' - l2: supported - l3: '' - - class: gearpump - l1: 'Yes' - l2: supported - l3: '' - - class: mapreduce - l1: 'Yes' - l2: supported - l3: '' - - class: jstorm - l1: 'Yes' - l2: supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: supported - l3: '' - - class: samza - l1: 'Yes' - l2: supported - l3: '' - - class: nemo - l1: 'Yes' - l2: supported - l3: '' - - class: jet - l1: 'Yes' - l2: supported - l3: '' - - name: Custom windows - values: - - class: model - l1: 'Yes' - l2: user-defined windows - l3: All windows must implement BoundedWindow, which specifies a max timestamp. Each WindowFn assigns elements to an associated window. 
- - class: dataflow - l1: 'Yes' - l2: supported - l3: '' - - class: flink - l1: 'Yes' - l2: supported - l3: '' - - class: spark-rdd - l1: 'Yes' - l2: supported - l3: '' - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: '' - - class: apex - l1: 'Yes' - l2: supported - l3: '' - - class: gearpump - l1: 'Yes' - l2: supported - l3: '' - - class: mapreduce - l1: 'Yes' - l2: supported - l3: '' - - class: jstorm - l1: 'Yes' - l2: supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: supported - l3: '' - - class: samza - l1: 'Yes' - l2: supported - l3: '' - - class: nemo - l1: 'Yes' - l2: supported - l3: '' - - class: jet - l1: 'Yes' - l2: supported - l3: '' - - name: Custom merging windows - values: - - class: model - l1: 'Yes' - l2: user-defined merging windows - l3: A custom WindowFn additionally specifies whether and how to merge windows. - - class: dataflow - l1: 'Yes' - l2: supported - l3: '' - - class: flink - l1: 'Yes' - l2: supported - l3: '' - - class: spark-rdd - l1: 'Yes' - l2: supported - l3: '' - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: '' - - class: apex - l1: 'Yes' - l2: supported - l3: '' - - class: gearpump - l1: 'Yes' - l2: supported - l3: '' - - class: mapreduce - l1: 'Yes' - l2: supported - l3: '' - - class: jstorm - l1: 'Yes' - l2: supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: supported - l3: '' - - class: samza - l1: 'Yes' - l2: supported - l3: '' - - class: nemo - l1: 'Yes' - l2: supported - l3: '' - - class: jet - l1: 'Yes' - l2: supported - l3: '' - - name: Timestamp control - values: - - class: model - l1: 'Yes' - l2: output timestamp for window panes - l3: For a grouping transform, such as GBK or Combine, an OutputTimeFn specifies (1) how to combine input timestamps within a window and (2) how to merge aggregated timestamps when windows merge. - - class: dataflow - l1: 'Yes' - l2: supported - l3: '' - - class: flink - l1: 'Yes' - l2: supported - l3: '' - - class: spark-rdd - l1: 'Yes' - l2: supported - l3: '' - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: '' - - class: apex - l1: 'Yes' - l2: supported - l3: '' - - class: gearpump - l1: 'Yes' - l2: supported - l3: '' - - class: mapreduce - l1: 'Yes' - l2: supported - l3: '' - - class: jstorm - l1: 'Yes' - l2: supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: supported - l3: '' - - class: samza - l1: 'Yes' - l2: supported - l3: '' - - class: nemo - l1: 'Yes' - l2: supported - l3: '' - - class: jet - l1: 'Yes' - l2: supported - l3: '' - - - description: When in processing time? - anchor: when - color-b: '6a4' - color-y: '8c6' - color-p: 'ae8' - color-n: 'ddd' - rows: - - - name: Configurable triggering - values: - - class: model - l1: 'Yes' - l2: user customizable - l3: Triggering may be specified by the user (instead of simply driven by hardcoded defaults). - - class: dataflow - l1: 'Yes' - l2: fully supported - l3: Fully supported in streaming mode. In batch mode, intermediate trigger firings are effectively meaningless. 
- - class: flink - l1: 'Yes' - l2: fully supported - l3: '' - - class: spark-rdd - l1: 'Yes' - l2: fully supported - l3: '' - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: '' - - class: apex - l1: 'Yes' - l2: fully supported - l3: '' - - class: gearpump - l1: 'No' - l2: '' - l3: '' - - class: mapreduce - l1: 'No' - l2: batch-only runner - l3: '' - - class: jstorm - l1: 'Yes' - l2: fully supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: fully supported - l3: '' - - class: samza - l1: 'Yes' - l2: fully supported - l3: '' - - class: nemo - l1: 'Yes' - l2: fully supported - l3: '' - - class: jet - l1: 'Yes' - l2: fully supported - l3: '' - - - name: Event-time triggers - values: - - class: model - l1: 'Yes' - l2: relative to event time - l3: Triggers that fire in response to event-time completeness signals, such as watermarks progressing. - - class: dataflow - l1: 'Yes' - l2: yes in streaming, fixed granularity in batch - l3: Fully supported in streaming mode. In batch mode, currently watermark progress jumps from the beginning of time to the end of time once the input has been fully consumed, thus no additional triggering granularity is available. - - class: flink - l1: 'Yes' - l2: fully supported - l3: '' - - class: spark-rdd - l1: 'Yes' - l2: fully supported - l3: '' - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: '' - - class: apex - l1: 'Yes' - l2: fully supported - l3: '' - - class: gearpump - l1: 'Yes' - l2: fully supported - l3: '' - - class: mapreduce - l1: 'No' - l2: '' - l3: '' - - class: jstorm - l1: 'Yes' - l2: fully supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: fully supported - l3: '' - - class: samza - l1: 'Yes' - l2: fully supported - l3: '' - - class: nemo - l1: 'Yes' - l2: fully supported - l3: '' - - class: jet - l1: 'Yes' - l2: fully supported - l3: '' - - - name: Processing-time triggers - values: - - class: model - l1: 'Yes' - l2: relative to processing time - l3: Triggers that fire in response to processing-time advancing. - - class: dataflow - l1: 'Yes' - l2: yes in streaming, fixed granularity in batch - l3: Fully supported in streaming mode. In batch mode, from the perspective of triggers, processing time currently jumps from the beginning of time to the end of time once the input has been fully consumed, thus no additional triggering granularity is available. - - class: flink - l1: 'Yes' - l2: fully supported - l3: '' - - class: spark-rdd - l1: 'Yes' - l2: "This is Spark streaming's native model" - l3: "Spark processes streams in micro-batches. The micro-batch size is actually a pre-set, fixed, time interval. Currently, the runner takes the first window size in the pipeline and sets it's size as the batch interval. Any following window operations will be considered processing time windows and will affect triggering." 
- - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: - - class: apex - l1: 'Yes' - l2: fully supported - l3: '' - - class: gearpump - l1: 'No' - l2: '' - l3: '' - - class: mapreduce - l1: 'No' - l2: '' - l3: '' - - class: jstorm - l1: 'Yes' - l2: fully supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: fully supported - l3: '' - - class: samza - l1: 'Yes' - l2: fully supported - l3: '' - - class: nemo - l1: 'Yes' - l2: fully supported - l3: '' - - class: jet - l1: 'Yes' - l2: fully supported - l3: '' - - - name: Count triggers - values: - - class: model - l1: 'Yes' - l2: every N elements - l3: Triggers that fire after seeing at least N elements. - - class: dataflow - l1: 'Yes' - l2: fully supported - l3: Fully supported in streaming mode. In batch mode, elements are processed in the largest bundles possible, so count-based triggers are effectively meaningless. - - class: flink - l1: 'Yes' - l2: fully supported - l3: '' - - class: spark-rdd - l1: 'Yes' - l2: fully supported - l3: '' - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: '' - - class: apex - l1: 'Yes' - l2: fully supported - l3: '' - - class: gearpump - l1: 'No' - l2: '' - l3: '' - - class: mapreduce - l1: 'No' - l2: '' - l3: '' - - class: jstorm - l1: 'Yes' - l2: fully supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: fully supported - l3: '' - - class: samza - l1: 'Yes' - l2: fully supported - l3: '' - - class: nemo - l1: 'Yes' - l2: fully supported - l3: '' - - class: jet - l1: 'Yes' - l2: fully supported - l3: '' - - - name: '[Meta]data driven triggers' - values: - - class: model - jira: BEAM-101 - l1: 'No' - l2: in response to data - l3: Triggers that fire in response to attributes of the data being processed. - - class: dataflow - l1: 'No' - l2: pending model support - l3: - - class: flink - l1: 'No' - l2: pending model support - l3: - - class: spark-rdd - l1: 'No' - l2: pending model support - l3: - - class: spark-dataset - l1: 'No' - l2: pending model support - l3: - - class: apex - l1: 'No' - l2: pending model support - l3: - - class: gearpump - l1: 'No' - l2: pending model support - l3: - - class: mapreduce - l1: 'No' - l2: '' - l3: - - class: jstorm - l1: 'No' - l2: pending model support - l3: - - class: ibmstreams - l1: 'No' - l2: pending model support - l3: - - class: samza - l1: 'No' - l2: pending model support - l3: - - class: nemo - l1: 'No' - l2: pending model support - l3: '' - - class: jet - l1: 'No' - l2: pending model support - l3: '' - - - name: Composite triggers - values: - - class: model - l1: 'Yes' - l2: compositions of one or more sub-triggers - l3: Triggers which compose other triggers in more complex structures, such as logical AND, logical OR, early/on-time/late, etc. 
- - class: dataflow - l1: 'Yes' - l2: fully supported - l3: '' - - class: flink - l1: 'Yes' - l2: fully supported - l3: '' - - class: spark-rdd - l1: 'Yes' - l2: fully supported - l3: '' - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: '' - - class: apex - l1: 'Yes' - l2: fully supported - l3: '' - - class: gearpump - l1: 'No' - l2: '' - l3: '' - - class: mapreduce - l1: 'No' - l2: '' - l3: '' - - class: jstorm - l1: 'Yes' - l2: fully supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: fully supported - l3: '' - - class: samza - l1: 'Yes' - l2: fully supported - l3: '' - - class: nemo - l1: 'Yes' - l2: fully supported - l3: '' - - class: jet - l1: 'Yes' - l2: fully supported - l3: '' - - - name: Allowed lateness - values: - - class: model - l1: 'Yes' - l2: event-time bound on window lifetimes - l3: A way to bound the useful lifetime of a window (in event time), after which any unemitted results may be materialized, the window contents may be garbage collected, and any addtional late data that arrive for the window may be discarded. - - class: dataflow - l1: 'Yes' - l2: fully supported - l3: Fully supported in streaming mode. In batch mode no data is ever late. - - class: flink - l1: 'Yes' - l2: fully supported - l3: '' - - class: spark-rdd - l1: 'No' - l2: '' - l3: '' - - class: spark-dataset - l1: 'No' - l2: no streaming support in the runner - l3: '' - - class: apex - l1: 'Yes' - l2: fully supported - l3: '' - - class: gearpump - l1: 'Yes' - l2: fully supported - l3: '' - - class: mapreduce - l1: 'No' - l2: '' - l3: '' - - class: jstorm - l1: 'Yes' - l2: fully supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: fully supported - l3: '' - - class: samza - l1: 'Yes' - l2: fully supported - l3: '' - - class: nemo - l1: 'Yes' - l2: fully supported - l3: '' - - class: jet - l1: 'Yes' - l2: fully supported - l3: '' - - - name: Timers - values: - - class: model - l1: 'Yes' - l2: delayed processing callbacks - l3: A fine-grained mechanism for performing work at some point in the future, in either the event-time or processing-time domain. Useful for orchestrating delayed events, timeouts, etc in complex state per-key, per-window state machines. - - class: dataflow - l1: 'Partially' - l2: non-merging windows - l3: Dataflow supports timers in non-merging windows. - - class: flink - l1: 'Partially' - l2: non-merging windows - l3: The Flink Runner supports timers in non-merging windows. - - class: spark-rdd - l1: 'Partially' - l2: fully supported in batch mode - l3: '' - - class: spark-dataset - l1: 'No' - l2: not implemented - l3: '' - - class: apex - l1: 'No' - l2: not implemented - l3: '' - - class: gearpump - l1: 'No' - l2: not implemented - l3: '' - - class: mapreduce - l1: 'No' - l2: '' - l3: '' - - class: jstorm - l1: 'Partially' - l2: non-merging windows - l3: '' - - class: ibmstreams - l1: 'Partially' - l2: non-merging windows - l3: '' - - class: samza - l1: 'Partially' - l2: non-merging windows - l3: The Samza Runner supports timers in non-merging windows. - - class: nemo - l1: 'No' - l2: not implemented - l3: '' - - class: jet - l1: 'Partially' - l2: non-merging windows - l3: '' - - - description: How do refinements relate? - anchor: how - color-b: 'b55' - color-y: 'd77' - color-p: 'faa' - color-n: 'ddd' - rows: - - - name: Discarding - values: - - class: model - l1: 'Yes' - l2: panes discard elements when fired - l3: Elements are discarded from accumulated state as their pane is fired. 
- - class: dataflow - l1: 'Yes' - l2: fully supported - l3: '' - - class: flink - l1: 'Yes' - l2: fully supported - l3: '' - - class: spark-rdd - l1: 'Yes' - l2: fully supported - l3: 'Spark streaming natively discards elements after firing.' - - class: spark-dataset - l1: 'Partially' - l2: fully supported in batch mode - l3: '' - - class: apex - l1: 'Yes' - l2: fully supported - l3: '' - - class: gearpump - l1: 'Yes' - l2: fully supported - l3: '' - - class: mapreduce - l1: 'No' - l2: batch-only runner - l3: '' - - class: jstorm - l1: 'Yes' - l2: fully supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: fully supported - l3: '' - - class: samza - l1: 'Yes' - l2: fully supported - l3: '' - - class: nemo - l1: 'Yes' - l2: fully supported - l3: '' - - class: jet - l1: 'Yes' - l2: fully supported - l3: '' - - - name: Accumulating - values: - - class: model - l1: 'Yes' - l2: panes accumulate elements across firings - l3: Elements are accumulated in state across multiple pane firings for the same window. - - class: dataflow - l1: 'Yes' - l2: fully supported - l3: Requires that the accumulated pane fits in memory, after being passed through the combiner (if relevant) - - class: flink - l1: 'Yes' - l2: fully supported - l3: '' - - class: spark-rdd - l1: 'No' - l2: '' - l3: '' - - class: spark-dataset - l1: 'No' - l2: '' - l3: '' - - class: apex - l1: 'Yes' - l2: fully supported - l3: 'Size restriction, see combine support.' - - class: gearpump - l1: 'No' - l2: '' - l3: '' - - class: mapreduce - l1: 'No' - l2: '' - l3: '' - - class: jstorm - l1: 'Yes' - l2: fully supported - l3: '' - - class: ibmstreams - l1: 'Yes' - l2: fully supported - l3: '' - - class: samza - l1: 'Yes' - l2: fully supported - l3: '' - - class: nemo - l1: 'Yes' - l2: fully supported - l3: '' - - class: jet - l1: 'Yes' - l2: fully supported - l3: '' - - - name: 'Accumulating & Retracting' - values: - - class: model - jira: BEAM-91 - l1: 'No' - l2: accumulation plus retraction of old panes - l3: Elements are accumulated across multiple pane firings and old emitted values are retracted. Also known as "backsies" ;-D - - class: dataflow - l1: 'No' - l2: pending model support - l3: '' - - class: flink - l1: 'No' - l2: pending model support - l3: '' - - class: spark-rdd - l1: 'No' - l2: pending model support - l3: '' - - class: spark-dataset - l1: 'No' - l2: pending model support - l3: '' - - class: apex - l1: 'No' - l2: pending model support - l3: '' - - class: gearpump - l1: 'No' - l2: pending model support - l3: '' - - class: mapreduce - l1: 'No' - l2: '' - l3: '' - - class: jstorm - l1: 'No' - l2: pending model support - l3: '' - - class: ibmstreams - l1: 'No' - l2: pending model support - l3: '' - - class: samza - l1: 'No' - l2: pending model support - l3: '' - - class: nemo - l1: 'No' - l2: pending model support - l3: '' - - class: jet - l1: 'No' - l2: pending model support - l3: '' - - description: Additional common features not yet part of the Beam model - anchor: misc - color-b: 'aaa' - color-y: 'bbb' - color-p: 'ccc' - color-n: 'ddd' - rows: - - name: Drain - values: - - class: model - l1: 'Partially' - l2: - l3: APIs and semantics for draining a pipeline are under discussion. This would cause incomplete aggregations to be emitted regardless of trigger and tagged with metadata indicating it is incompleted. - - class: dataflow - l1: 'Partially' - l2: - l3: Dataflow has a native drain operation, but it does not work in the presence of event time timer loops. Final implemention pending model support. 
- - class: flink - l1: 'Partially' - l2: - l3: Flink supports taking a "savepoint" of the pipeline and shutting the pipeline down after its completion. - - class: spark-rdd - l1: - l2: - l3: - - class: spark-dataset - l1: - l2: - l3: - - class: apex - l1: - l2: - l3: - - class: gearpump - l1: - l2: - l3: - - class: mapreduce - l1: - l2: - l3: - - class: jstorm - l1: - l2: - l3: - - class: ibmstreams - l1: - l2: - l3: - - class: samza - l1: - l2: - l3: - - class: nemo - l1: - l2: - l3: - - name: Checkpoint - values: - - class: model - l1: 'Partially' - l2: - l3: APIs and semantics for saving a pipeline checkpoint are under discussion. This would be a runner-specific materialization of the pipeline state required to resume or duplicate the pipeline. - - class: dataflow - l1: 'No' - l2: - l3: - - class: flink - l1: 'Partially' - l2: - l3: Flink has a native savepoint capability. - - class: spark-rdd - l1: 'Partially' - l2: - l3: Spark has a native savepoint capability. - - class: spark-dataset - l1: 'No' - l2: - l3: not implemented - - class: apex - l1: - l2: - l3: - - class: gearpump - l1: - l2: - l3: - - class: mapreduce - l1: - l2: - l3: - - class: jstorm - l1: - l2: - l3: - - class: ibmstreams - l1: - l2: - l3: - - class: samza - l1: 'Partially' - l2: - l3: Samza has a native checkpoint capability. - - class: nemo - l1: - l2: - l3: - - class: jet - l1: - l2: - l3: diff --git a/website/src/_data/meetings.yml b/website/src/_data/meetings.yml deleted file mode 100644 index c207800d6939..000000000000 --- a/website/src/_data/meetings.yml +++ /dev/null @@ -1,39 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Welcome to Jekyll! - -events: -- date: 2016/04/01 - time: "9:30 - 16:00 Pacific" - location: PayPal
San Jose, CA, USA - type: Dev/PPMC Meeting - materials: - - title: Presentation - PPMC Deep Dive - link: "https://docs.google.com/presentation/d/1uTb7dx4-Y2OM_B0_3XF_whwAL2FlDTTuq2QzP9sJ4Mg/edit?usp=sharing" - - - title: Notes - PPMC Deep Dive - link: "https://docs.google.com/document/d/1SXSLj7FMIgKqj43nTcczFpJzqASeUMUCpbyklk2fBkg/edit?usp=sharing" - notes: - -- date: 2016/05/04 - time: "8:00 - 11:00 Pacific" - location: Virtual - type: Technical Deep Dive - materials: - - title: Presentation - Beam Community Meeting - link: "https://drive.google.com/open?id=17i7SHViboWtLEZw27iabdMisPl987WWxvapJaXg_dEE" - - - title: Notes - Beam Community Meeting - link: "https://drive.google.com/open?id=1szhEE_pfhEtrQye61jXAidUcMW7oebZCRc2InUe3ou0" - notes: - -last_updated: 2016/05/16 diff --git a/website/src/_includes/button-pydoc.md b/website/src/_includes/button-pydoc.md deleted file mode 100644 index c0135aa8ec1a..000000000000 --- a/website/src/_includes/button-pydoc.md +++ /dev/null @@ -1,23 +0,0 @@ - - -{% capture button_url %}https://beam.apache.org/releases/pydoc/current/{{ include.path }}.html#{{ include.path }}.{{ include.class }}{% endcapture %} - -{% include button.md - url=button_url - logo="https://beam.apache.org/images/logos/sdks/python.png" - text="Pydoc" -%} - -


diff --git a/website/src/_includes/button.md b/website/src/_includes/button.md deleted file mode 100644 index de7a414a2e8d..000000000000 --- a/website/src/_includes/button.md +++ /dev/null @@ -1,21 +0,0 @@ - - -{% if include.attrib %} -{:{{ include.attrib }}}{% endif %} - - -
- {% if include.logo %}{{ include.text }} {% endif %}{{ include.text }} -
diff --git a/website/src/_includes/buttons-code-snippet.md b/website/src/_includes/buttons-code-snippet.md deleted file mode 100644 index 54ac91486f5f..000000000000 --- a/website/src/_includes/buttons-code-snippet.md +++ /dev/null @@ -1,43 +0,0 @@ - - -{% capture colab_logo %}https://github.com/googlecolab/open_in_colab/raw/master/images/icon32.png{% endcapture %} -{% capture github_logo %}https://www.tensorflow.org/images/GitHub-Mark-32px.png{% endcapture %} - -{% capture notebook_url %}https://colab.research.google.com/github/{{ site.branch_repo }}/{{ include.notebook }}{% endcapture %} -{% capture notebook_java %}{{ notebook_url }}-java.ipynb{% endcapture %} -{% capture notebook_py %}{{ notebook_url }}-py.ipynb{% endcapture %} -{% capture notebook_go %}{{ notebook_url }}-go.ipynb{% endcapture %} - -{% capture code_url %}https://github.com/{{ site.branch_repo }}{% endcapture %} -{% capture code_java %}{{ code_url }}/{{ include.java }}{% endcapture %} -{% capture code_py %}{{ code_url }}/{{ include.py }}{% endcapture %} -{% capture code_go %}{{ code_url }}/{{ include.go }}{% endcapture %} - -{% if include.java %} -{% if include.notebook %}{% include button.md url=notebook_java logo=colab_logo text="Run code now" attrib=".language-java .notebook-skip" %}{% endif %} -{% include button.md url=code_java logo=github_logo text="View source code" attrib=".language-java" %} -{% endif %} - -{% if include.py %} -{% if include.notebook %}{% include button.md url=notebook_py logo=colab_logo text="Run code now" attrib=".language-py .notebook-skip" %}{% endif %} -{% include button.md url=code_py logo=github_logo text="View source code" attrib=".language-py" %} -{% endif %} - -{% if include.go %} -{% if include.notebook %}{% include button.md url=notebook_go logo=colab_logo text="Run code now" attrib=".language-go .notebook-skip" %}{% endif %} -{% include button.md url=code_go logo=github_logo text="View source code" attrib=".language-go" %} -{% endif %} - -


diff --git a/website/src/_includes/capability-matrix-common.md b/website/src/_includes/capability-matrix-common.md deleted file mode 100644 index a5c7f4fef7c7..000000000000 --- a/website/src/_includes/capability-matrix-common.md +++ /dev/null @@ -1,20 +0,0 @@ - - diff --git a/website/src/_includes/capability-matrix-row-summary.md b/website/src/_includes/capability-matrix-row-summary.md deleted file mode 100644 index d2e37c50fe7c..000000000000 --- a/website/src/_includes/capability-matrix-row-summary.md +++ /dev/null @@ -1,14 +0,0 @@ - -
{% if val.l1 == 'Yes' %}✓{% elsif val.l1 == 'Partially' %}~{% else %}✕{% endif %}{% if val.jira %} ({{ val.jira }}){% endif %}
diff --git a/website/src/_includes/capability-matrix.md b/website/src/_includes/capability-matrix.md deleted file mode 100644 index 369de318189a..000000000000 --- a/website/src/_includes/capability-matrix.md +++ /dev/null @@ -1,48 +0,0 @@ - -
- - {% for category in cap-data.categories %} - - - - - - {% for x in cap-data.columns %} - - {% endfor %} - - {% for row in category.rows %} - - - {% for val in row.values %} - {% capture value-markdown %}{% include capability-matrix-row-{{ cap-view }}.md %}{% endcapture %} - - - {% endfor %} - - {% endfor %} - - - - {% endfor %} -
{{ x.name }}
{{ row.name }}{{ value-markdown }}
-
diff --git a/website/src/_includes/footer.html b/website/src/_includes/footer.html deleted file mode 100644 index 2052fa922afe..000000000000 --- a/website/src/_includes/footer.html +++ /dev/null @@ -1,66 +0,0 @@ - - - diff --git a/website/src/_includes/head.html b/website/src/_includes/head.html deleted file mode 100644 index aba87c49c0e4..000000000000 --- a/website/src/_includes/head.html +++ /dev/null @@ -1,42 +0,0 @@ - - - - - - - {% if page.title %}{{ page.title | escape }}{% else %}{{ site.title | escape }}{% endif %} - - - - - - - - - - - - - - - - diff --git a/website/src/_includes/icon-github.svg b/website/src/_includes/icon-github.svg deleted file mode 100644 index 6ac23401089c..000000000000 --- a/website/src/_includes/icon-github.svg +++ /dev/null @@ -1,19 +0,0 @@ - - diff --git a/website/src/_includes/icon-twitter.svg b/website/src/_includes/icon-twitter.svg deleted file mode 100644 index 2e71d184b3b2..000000000000 --- a/website/src/_includes/icon-twitter.svg +++ /dev/null @@ -1,19 +0,0 @@ - - diff --git a/website/src/_includes/page-toc.html b/website/src/_includes/page-toc.html deleted file mode 100644 index 05c7d1f3ea56..000000000000 --- a/website/src/_includes/page-toc.html +++ /dev/null @@ -1,88 +0,0 @@ - - -{% comment %} - Taken from https://github.com/allejo/jekyll-toc -{% endcomment %} -{% capture tocWorkspace %} - {% comment %} - "...like all things liquid - where there's a will, and ~36 hours to spare, there's usually a/some way" ~jaybe - - Usage: - {% include toc.html html=content sanitize=true class="inline_toc" id="my_toc" h_min=2 h_max=3 %} - - Parameters: - * html (string) - the HTML of compiled markdown generated by kramdown in Jekyll - - Optional Parameters: - * sanitize (bool) : false - when set to true, the headers will be stripped of any HTML in the TOC - * class (string) : '' - a CSS class assigned to the TOC - * id (string) : '' - an ID to assigned to the TOC - * h_min (int) : 1 - the minimum TOC header level to use; any header lower than this value will be ignored - * h_max (int) : 6 - the maximum TOC header level to use; any header greater than this value will be ignored - - Output: - An unordered list representing the table of contents of a markdown block. 
This snippet will only generate the table of contents and will NOT output the markdown given to it - {% endcomment %} - - {% capture my_toc %}{% endcapture %} - {% assign minHeader = include.h_min | default: 1 %} - {% assign maxHeader = include.h_max | default: 6 %} - {% assign nodes = include.html | split: ' maxHeader %} - {% continue %} - {% endif %} - - {% if firstHeader %} - {% assign firstHeader = false %} - {% assign minHeader = headerLevel %} - {% endif %} - - {% assign indentAmount = headerLevel | minus: minHeader | add: 1 %} - {% assign _workspace = node | split: '{% endcapture %} - {% assign header = _workspace[0] | replace: _hAttrToStrip, '' %} - - {% assign space = '' %} - {% for i in (1..indentAmount) %} - {% assign space = space | prepend: ' ' %} - {% endfor %} - - {% capture my_toc %}{{ my_toc }} -{{ space }}- [{% if include.sanitize %}{{ header | strip_html }}{% else %}{{ header }}{% endif %}](#{{ html_id }}){% endcapture %} - - {% endfor %} - - {% if include.class %} - {% capture my_toc %}{:.{{ include.class }}} -{{ my_toc | lstrip }}{% endcapture %} - {% endif %} - - {% if include.id %} - {% capture my_toc %}{: #{{ include.id }}} -{{ my_toc | lstrip }}{% endcapture %} - {% endif %} -{% endcapture %}{% assign tocWorkspace = '' %} -{{ my_toc | markdownify }} diff --git a/website/src/_includes/section-menu/contribute.html b/website/src/_includes/section-menu/contribute.html deleted file mode 100644 index cc9bdc42e366..000000000000 --- a/website/src/_includes/section-menu/contribute.html +++ /dev/null @@ -1,45 +0,0 @@ - - -
  • Contribute
  • -
  • Get started contributing
  • -
  • Get Help
  • -
  • - Technical Docs - - -
  • -
  • - Policies - -
  • -
  • - Committers - -
  • diff --git a/website/src/_includes/section-menu/documentation.html b/website/src/_includes/section-menu/documentation.html deleted file mode 100644 index f57e146c186d..000000000000 --- a/website/src/_includes/section-menu/documentation.html +++ /dev/null @@ -1,307 +0,0 @@ - - -
  • Documentation
  • -
  • Using the Documentation
  • -
  • - Pipeline development lifecycle - - -
  • -
  • - Beam programming guide - - -
  • - -
  • - Transform catalog - - - -
  • - -
  • - Common pipeline patterns - - -
  • - -
  • - Runtime systems - - -
  • - -
  • - Learning resources - - -
  • -
  • Beam Wiki
  • diff --git a/website/src/_includes/section-menu/roadmap.html b/website/src/_includes/section-menu/roadmap.html deleted file mode 100644 index 3cc7b5fb9119..000000000000 --- a/website/src/_includes/section-menu/roadmap.html +++ /dev/null @@ -1,49 +0,0 @@ - - -
  • Roadmap
  • -
  • Roadmap Highlights
  • -
  • Portability Framework
  • -
  • - Languages - - -
  • -
  • - Runners - - -
  • -
  • - Connectors - - -
  • diff --git a/website/src/_includes/section-menu/runners.html b/website/src/_includes/section-menu/runners.html deleted file mode 100644 index 3444e0d8bb2c..000000000000 --- a/website/src/_includes/section-menu/runners.html +++ /dev/null @@ -1,23 +0,0 @@ - - -
  • Runners
  • -
  • Capability Matrix
  • -
  • Direct Runner
  • -
  • Apache Apex
  • -
  • Apache Flink
  • -
  • Apache Gearpump
  • -
  • Apache Nemo
  • -
  • Apache Samza
  • -
  • Apache Spark
  • -
  • Google Cloud Dataflow
  • -
  • Hazelcast Jet
  • diff --git a/website/src/_includes/section-menu/sdks.html b/website/src/_includes/section-menu/sdks.html deleted file mode 100644 index 15d97a9de96e..000000000000 --- a/website/src/_includes/section-menu/sdks.html +++ /dev/null @@ -1,109 +0,0 @@ - - -
  • Languages
  • - -
  • - Java - -
  • - -
  • - Python - -
  • - -
  • - Go - -
  • - -
  • - SQL - -
  • diff --git a/website/src/_layouts/post.html b/website/src/_layouts/post.html deleted file mode 100644 index 94881cb751a8..000000000000 --- a/website/src/_layouts/post.html +++ /dev/null @@ -1,32 +0,0 @@ ---- -layout: default ---- - - -{% assign authors = page.authors %} - -
    - -
    -

    {{ page.title }}

    - -
    - -
    - {{ content }} -
    - -
    diff --git a/website/src/_layouts/section.html b/website/src/_layouts/section.html deleted file mode 100644 index 951e6e3c9d5d..000000000000 --- a/website/src/_layouts/section.html +++ /dev/null @@ -1,38 +0,0 @@ - - - - - {% include head.html %} - - {% include header.html %} -
    -
    - - -
    - - - -
    - {{ content }} -
    -
    - {% include footer.html %} - - diff --git a/website/src/_layouts/v2home.html b/website/src/_layouts/v2home.html deleted file mode 100644 index 2e0a08b6e8d6..000000000000 --- a/website/src/_layouts/v2home.html +++ /dev/null @@ -1,217 +0,0 @@ - - - - - - - Apache Beam - - - - - - - - - - - -
    - - -
    -
    -
    -
    -

    Apache Beam

    -
    -
    -
    -
    -

    - A unified programming model for batch and streaming -

    - -
    -
    - Beam provides an advanced unified programming model, allowing you to implement batch and streaming data processing jobs that you can run on any execution engine. -
    -
    -
    -
    -
    -
    -
    -
    -
    -
    -
    -
    - -
    -
    -

    Unified

    - -
    - Use a single programming model for both batch and streaming use cases. -
    -
    -
    -
    -
    - -
    -
    -

    Portable

    - -
    - Execute pipelines on multiple execution environments, including Apache Flink, Apache Spark, and Google Cloud Dataflow. -
    -
    -
    -
    -
    - -
    -
    -

    Extensible

    - -
    - Write and share new SDKs, IO connectors, and transformation libraries. -
    -
    -
    -
    -
    -
    -
    - {{ content }} -
    -
    -
    All News
    -
    -
    -
    -
    -
    -
    -
    -
    Open Source
    -
    - Beam is an Apache Software Foundation project, - available under the Apache v2 license. Beam is an open source community - contributions are appreciated! If you'd like to contribute, please see the Contribute section. -
    -
    -
    - -
    -
    -
    -
    -
    -
    -
    -
    - -
    -
    -
    Overview
    -
    - Apache Beam is a unified programming model you can use to create data processing pipelines. You start by building a program that defines the pipeline using one of the open source Beam SDKs. The pipeline is then executed by one of Beam’s supported distributed processing back-ends, which include Apache Flink, Apache Spark, and Google Cloud Dataflow. -
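A minimal, hedged sketch of that workflow using the Beam Python SDK — the in-memory input is invented for illustration, and the local DirectRunner stands in for the distributed back-ends named above:

```python
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Build the pipeline with a Beam SDK; a runner (here the local DirectRunner)
# then executes it. Swapping the runner option targets Flink, Spark, or Dataflow.
with beam.Pipeline(options=PipelineOptions(runner="DirectRunner")) as pipeline:
    (pipeline
     | "Read" >> beam.Create(["to be or not to be"])      # made-up input
     | "Split" >> beam.FlatMap(str.split)                 # one word per element
     | "PairWithOne" >> beam.Map(lambda word: (word, 1))
     | "CountPerWord" >> beam.CombinePerKey(sum)
     | "Print" >> beam.Map(print))
```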
    -
    -
    -
    -
    -
    -
    -
    -
    -
    Documentation
    -
    - If you'd like to use Beam for your data processing tasks, use the Get Started section for an overview, quickstart, and examples. Then dive into the Documentation section for in-depth concepts and reference materials for the Beam Model, SDKs, and Runners. -
    -
    -
    - -
    -
    -
    -
    -
    -
    -
    -
    -
    -
    -
    -
    - © 2016 Apache Software Foundation - Privacy Policy
    - Apache Beam, Beam, Apache, the Apache feather logo, and the Apache Beam project logo are - trademarks of The Apache Software Foundation. -
    -
    -
    -
    -
    -
    - - - - - - - - - diff --git a/website/src/blog/index.md b/website/src/blog/index.md deleted file mode 100644 index 3be2a4cb37e0..000000000000 --- a/website/src/blog/index.md +++ /dev/null @@ -1,51 +0,0 @@ ---- -layout: default -title: "Beam Blog" -permalink: /blog/ ---- - - -# Apache Beam Blog - -This is the blog for the Apache Beam project. This blog contains news and updates -for the project. - -{% for post in site.posts %} -{% assign authors = post.authors %} - -### {{ post.title }} -{{ post.date | date: "%b %-d, %Y" }}{% if authors %} • -{% assign count = authors | size %}{% for name in authors %}{% if forloop.first == false and count > 2 %},{% endif %}{% if forloop.last and count > 1 %} &{% endif %}{% assign author = site.data.authors[name] %} {{ author.name }} {% if author.twitter %}[@{{ author.twitter }}]{% endif %}{% endfor %} -{% endif %} - -{{ post.excerpt }} - - -{% capture content_words %} - {{ post.content | number_of_words }} -{% endcapture %} -{% capture excerpt_words %} - {{ post.excerpt | number_of_words }} -{% endcapture %} -{% if excerpt_words != content_words %} -

    - -Read more  - -

    -{% endif %} - -
    -{% endfor %} diff --git a/website/src/coming-soon.md b/website/src/coming-soon.md deleted file mode 100644 index 88b0d87f9cac..000000000000 --- a/website/src/coming-soon.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -layout: default ---- - - -# Documentation Coming Soon - -You've reached a page that's still in draft, or otherwise being developed! Please bear with us as we improve the documentation for Apache Beam. - -[Go Back]({{ site.baseurl }}/) to the main Beam site. diff --git a/website/src/community/logos.md b/website/src/community/logos.md deleted file mode 100644 index 8b5d822fa967..000000000000 --- a/website/src/community/logos.md +++ /dev/null @@ -1,80 +0,0 @@ ---- -layout: section -title: "Beam Logos" -section_menu: section-menu/community.html -permalink: /community/logos/ -redirect_from: - - /project/logos/ - - /material/ - - /contribute/logos/ ---- - - -# Apache Beam Logos - -This page contains project material for the Apache Beam project. - -## Project logos -You can download [this archive]({{ site.baseurl }}/{{ site.downloads }}/{{ site.data.logos.archive-file }}) -containing all of the logos or download the logos individually. - -### Scalable Vector Graphics (SVG) -These [SVG files](https://en.wikipedia.org/wiki/Scalable_Vector_Graphics) can -be resized easily and are suitable for print or web use. Click on the logo to -download it. - -{% for color in site.data.logos.colors %} -#### {{ color[1] }} -
    -
    -
    -{% for type in site.data.logos.types %} -
    -
    -beam-logo-{{ color[0] }}-{{ type }}.svg -

    -
    -{% endfor %} -
    -{% endfor %} - - -### Portable Network Graphics (PNG) -These [PNG files](https://en.wikipedia.org/wiki/Portable_Network_Graphics) are -available in a number of fixed sizes and are optimized for web use. - -{% for color in site.data.logos.colors %} -#### {{ color[1] }} -
    -
    -
    -{% for type in site.data.logos.types %} -
    -
    -beam-logo-{{ color[0] }}-{{ type }} -

    -
    -{% for size in site.data.logos.sizes %} -{{ size }}x{{ size }} -{% unless forloop.last %},{% endunless %} -{% endfor %} -
    -
    -{% endfor %} -
    -{% endfor %} - -## Colors and fonts -The Apache Beam project uses predefined colors and fonts. [This document]({{ site.baseurl }}/{{ site.downloads }}/palette.pdf) has more information. diff --git a/website/src/documentation/index.md b/website/src/documentation/index.md deleted file mode 100644 index de0906807367..000000000000 --- a/website/src/documentation/index.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -layout: section -title: "Learn about Beam" -permalink: /documentation/ -section_menu: section-menu/documentation.html -redirect_from: - - /learn/ - - /docs/learn/ ---- - - -# Apache Beam Documentation - -This section provides in-depth conceptual information and reference material for the Beam Model, SDKs, and Runners: - -## Concepts - -Learn about the Beam Programming Model and the concepts common to all Beam SDKs and Runners. - -* Read the [Programming Guide]({{ site.baseurl }}/documentation/programming-guide/), which introduces all the key Beam concepts. -* Learn about Beam's [execution model]({{ site.baseurl }}/documentation/runtime/model) to better understand how pipelines execute. -* Visit [Learning Resources]({{ site.baseurl }}/documentation/resources/learning-resources) for some of our favorite articles and talks about Beam. - -## Pipeline Fundamentals - -* [Design Your Pipeline]({{ site.baseurl }}/documentation/pipelines/design-your-pipeline/) by planning your pipeline’s structure, choosing transforms to apply to your data, and determining your input and output methods. -* [Create Your Pipeline]({{ site.baseurl }}/documentation/pipelines/create-your-pipeline/) using the classes in the Beam SDKs. -* [Test Your Pipeline]({{ site.baseurl }}/documentation/pipelines/test-your-pipeline/) to minimize debugging a pipeline’s remote execution. - -## SDKs - -Find status and reference information on all of the available Beam SDKs. - -* [Java SDK]({{ site.baseurl }}/documentation/sdks/java/) -* [Python SDK]({{ site.baseurl }}/documentation/sdks/python/) -* [Go SDK]({{ site.baseurl }}/documentation/sdks/go/) - -## Runners - -A Beam Runner runs a Beam pipeline on a specific (often distributed) data processing system. - -### Available Runners - -* [DirectRunner]({{ site.baseurl }}/documentation/runners/direct/): Runs locally on your machine -- great for developing, testing, and debugging. -* [ApexRunner]({{ site.baseurl }}/documentation/runners/apex/): Runs on [Apache Apex](https://apex.apache.org). -* [FlinkRunner]({{ site.baseurl }}/documentation/runners/flink/): Runs on [Apache Flink](https://flink.apache.org). -* [SparkRunner]({{ site.baseurl }}/documentation/runners/spark/): Runs on [Apache Spark](https://spark.apache.org). -* [DataflowRunner]({{ site.baseurl }}/documentation/runners/dataflow/): Runs on [Google Cloud Dataflow](https://cloud.google.com/dataflow), a fully managed service within [Google Cloud Platform](https://cloud.google.com/). -* [GearpumpRunner]({{ site.baseurl }}/documentation/runners/gearpump/): Runs on [Apache Gearpump (incubating)](https://gearpump.apache.org). -* [SamzaRunner]({{ site.baseurl }}/documentation/runners/samza/): Runs on [Apache Samza](https://samza.apache.org). -* [NemoRunner]({{ site.baseurl }}/documentation/runners/nemo/): Runs on [Apache Nemo](https://nemo.apache.org). -* [JetRunner]({{ site.baseurl }}/documentation/runners/jet/): Runs on [Hazelcast Jet](https://jet.hazelcast.org/). - -### Choosing a Runner - -Beam is designed to enable pipelines to be portable across different runners. 
However, because every runner has different capabilities, runners also differ in how completely they implement the core concepts of the Beam model. The [Capability Matrix]({{ site.baseurl }}/documentation/runners/capability-matrix/) provides a detailed comparison of runner functionality.

Once you have chosen a runner, see that runner's page for any initial runner-specific setup, as well as any required or optional `PipelineOptions` for configuring its execution. You may also want to refer back to the Quickstart for [Java]({{ site.baseurl }}/get-started/quickstart-java), [Python]({{ site.baseurl }}/get-started/quickstart-py), or [Go]({{ site.baseurl }}/get-started/quickstart-go) for instructions on executing the sample WordCount pipeline.
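To make the `PipelineOptions` idea concrete, here is a minimal sketch in the Python SDK of selecting and configuring a runner at construction time; the Flink master address is a placeholder, and the exact set of options depends on the runner you choose:

```python
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Runner-specific configuration is passed through PipelineOptions.
# "FlinkRunner" and the flink_master address below are illustrative placeholders;
# DataflowRunner, SparkRunner, etc. take their own options instead.
options = PipelineOptions(
    runner="FlinkRunner",
    flink_master="localhost:8081",
)

with beam.Pipeline(options=options) as pipeline:
    (pipeline
     | beam.Create(["a", "b", "a"])
     | beam.Map(print))
```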
diff --git a/website/src/documentation/transforms/java/index.md b/website/src/documentation/transforms/java/index.md deleted file mode 100644 index 71b3721449f4..000000000000 --- a/website/src/documentation/transforms/java/index.md +++ /dev/null @@ -1,82 +0,0 @@ ---- -layout: section -title: "Java transform catalog overview" -permalink: /documentation/transforms/java/overview/ -section_menu: section-menu/documentation.html ----

# Java transform catalog overview

## Element-wise

| Transform | Description |
| --- | --- |
| Filter | Given a predicate, filter out all elements that don't satisfy the predicate. |
| FlatMapElements | Applies a function that returns a collection to every element in the input and outputs all resulting elements. |
| Keys | Extracts the key from each element in a collection of key-value pairs. |
| KvSwap | Swaps the key and value of each element in a collection of key-value pairs. |
| MapElements | Applies a function to every element in the input and outputs the result. |
| ParDo | The most-general mechanism for applying a user-defined DoFn to every element in the input collection. |
| Partition | Routes each input element to a specific output collection based on some partition function. |
| Regex | Filters input string elements based on a regex. May also transform them based on the matching groups. |
| Reify | Transforms for converting between explicit and implicit form of various Beam values. |
| ToString | Transforms every element in an input collection to a string. |
| WithKeys | Produces a collection containing each element from the input collection converted to a key-value pair, with a key selected by applying a function to the input element. |
| WithTimestamps | Applies a function to determine a timestamp for each element in the output collection, and updates the implicit timestamp associated with each input. Note that it is only safe to adjust timestamps forwards. |
| Values | Extracts the value from each element in a collection of key-value pairs. |

## Aggregation
| Transform | Description |
| --- | --- |
| ApproximateQuantiles | Uses an approximation algorithm to estimate the data distribution within each aggregation using a specified number of quantiles. |
| ApproximateUnique | Uses an approximation algorithm to estimate the number of unique elements within each aggregation. |
| CoGroupByKey | Takes several keyed collections of elements and produces a collection where each element consists of a key and all values associated with that key. |
| Combine | Transforms to combine elements according to a provided CombineFn. |
| CombineWithContext | An extended version of Combine which allows accessing side-inputs and other context. |
| Count | Counts the number of elements within each aggregation. |
| Distinct | Produces a collection containing distinct elements from the input collection. |
| GroupByKey | Takes a keyed collection of elements and produces a collection where each element consists of a key and all values associated with that key. |
| GroupIntoBatches | Batches values associated with keys into Iterable batches of some size. Each batch contains elements associated with a specific key. |
| HllCount | Estimates the number of distinct elements and creates re-aggregatable sketches using the HyperLogLog++ algorithm. |
| Latest | Selects the latest element within each aggregation according to the implicit timestamp. |
| Max | Outputs the maximum element within each aggregation. |
| Mean | Computes the average within each aggregation. |
| Min | Outputs the minimum element within each aggregation. |
| Sample | Randomly selects some number of elements from each aggregation. |
| Sum | Computes the sum of elements in each aggregation. |
| Top | Computes the largest element(s) in each aggregation. |

## Other
| Transform | Description |
| --- | --- |
| Create | Creates a collection from an in-memory list. |
| Flatten | Given multiple input collections, produces a single output collection containing all elements from all of the input collections. |
| PAssert | A transform to assert the contents of a PCollection, used as part of testing a pipeline either locally or with a runner. |
| View | Operations for turning a collection into a view that may be used as a side-input to a ParDo. |
| Window | Logically divides up or groups the elements of a collection into finite windows according to a provided WindowFn. |
    \ No newline at end of file diff --git a/website/src/documentation/transforms/python/aggregation/latest.md b/website/src/documentation/transforms/python/aggregation/latest.md deleted file mode 100644 index 38dc93bd8c3a..000000000000 --- a/website/src/documentation/transforms/python/aggregation/latest.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -layout: section -title: "Latest" -permalink: /documentation/transforms/python/aggregation/latest/ -section_menu: section-menu/documentation.html ---- - -# Latest - -## Examples -See [BEAM-7390](https://issues.apache.org/jira/browse/BEAM-7390) for updates. - -## Related transforms -* [Sample]({{ site.baseurl }}/documentation/transforms/python/aggregation/sample) to combine elements. takes samples of the elements in a collection. \ No newline at end of file diff --git a/website/src/documentation/transforms/python/index.md b/website/src/documentation/transforms/python/index.md deleted file mode 100644 index dad96b0ac798..000000000000 --- a/website/src/documentation/transforms/python/index.md +++ /dev/null @@ -1,86 +0,0 @@ ---- -layout: section -title: "Python transform catalog overview" -permalink: /documentation/transforms/python/overview/ -section_menu: section-menu/documentation.html ---- - - -# Python transform catalog overview - -## Element-wise - - - - - - - - - - - - - - - -
| Transform | Description |
| --- | --- |
| Filter | Given a predicate, filter out all elements that don't satisfy the predicate. |
| FlatMap | Applies a function that returns a collection to every element in the input and outputs all resulting elements. |
| Keys | Extracts the key from each element in a collection of key-value pairs. |
| KvSwap | Swaps the key and value of each element in a collection of key-value pairs. |
| Map | Applies a function to every element in the input and outputs the result. |
| ParDo | The most-general mechanism for applying a user-defined DoFn to every element in the input collection. |
| Partition | Routes each input element to a specific output collection based on some partition function. |
| Regex | Filters input string elements based on a regex. May also transform them based on the matching groups. |
| Reify | Transforms for converting between explicit and implicit form of various Beam values. |
| ToString | Transforms every element in an input collection to a string. |
| WithTimestamps | Applies a function to determine a timestamp for each element in the output collection, and updates the implicit timestamp associated with each input. Note that it is only safe to adjust timestamps forwards. |
| Values | Extracts the value from each element in a collection of key-value pairs. |
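As a rough illustration of how a few of these element-wise transforms compose — a minimal sketch with the Python SDK, using an in-memory input made up for this example:

```python
import apache_beam as beam

class SplitWords(beam.DoFn):
    """User-defined DoFn for ParDo: emits one word per input line."""
    def process(self, element):
        yield from element.split()

with beam.Pipeline() as pipeline:
    (pipeline
     | beam.Create(["beam is fun", "so is stream processing"])  # made-up input
     | "ParDo" >> beam.ParDo(SplitWords())
     | "Filter" >> beam.Filter(lambda word: len(word) > 2)      # drop short words
     | "Map" >> beam.Map(str.upper)                             # transform each element
     | beam.Map(print))
```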
## Aggregation
| Transform | Description |
| --- | --- |
| ApproximateQuantiles | Not available. See BEAM-6694 for updates. |
| ApproximateUnique | Not available. See BEAM-6693 for updates. |
| CoGroupByKey | Takes several keyed collections of elements and produces a collection where each element consists of a key and all values associated with that key. |
| CombineGlobally | Transforms to combine elements. |
| CombineWithContext | Not available. |
| Count | Counts the number of elements within each aggregation. |
| Distinct | Produces a collection containing distinct elements from the input collection. |
| GroupByKey | Takes a keyed collection of elements and produces a collection where each element consists of a key and all values associated with that key. |
| GroupIntoBatches | Not available. See BEAM-6696 for updates. |
| Latest | Not available. See BEAM-6695 for updates. |
| Max | Not available. |
| Mean | Computes the average within each aggregation. |
| Min | Not available. |
| Sample | Randomly selects some number of elements from each aggregation. |
| Sum | Not available. |
| Top | Computes the largest element(s) in each aggregation. |
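A hedged sketch of the per-key aggregation pattern in the Python SDK — GroupByKey followed by a simple reduction over the grouped values; the input pairs are invented for illustration:

```python
import apache_beam as beam

with beam.Pipeline() as pipeline:
    (pipeline
     | beam.Create([("fruit", 3), ("veg", 1), ("fruit", 4)])   # made-up keyed input
     | beam.GroupByKey()                                       # ("fruit", [3, 4]), ("veg", [1])
     | beam.MapTuple(lambda key, values: (key, sum(values)))   # combine the grouped values
     | beam.Map(print))
```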
## Other
| Transform | Description |
| --- | --- |
| Create | Creates a collection from an in-memory list. |
| Flatten | Given multiple input collections, produces a single output collection containing all elements from all of the input collections. |
| PAssert | Not available. |
| Reshuffle | Given an input collection, redistributes the elements between workers. This is most useful for adjusting parallelism or preventing coupled failures. |
| View | Not available. |
| WindowInto | Logically divides up or groups the elements of a collection into finite windows according to a function. |
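To make the windowing entry concrete — a small, hedged sketch with the Python SDK that assigns timestamped elements to fixed one-minute windows; the event names and timestamps are fabricated for the example:

```python
import apache_beam as beam

with beam.Pipeline() as pipeline:
    (pipeline
     | beam.Create([("login", 3), ("login", 70), ("logout", 75)])  # (event, seconds), made up
     | beam.MapTuple(lambda name, ts: beam.window.TimestampedValue(name, ts))
     | "FixedWindows" >> beam.WindowInto(beam.window.FixedWindows(60))  # 60-second windows
     | beam.combiners.Count.PerElement()   # counts each event name within its window
     | beam.Map(print))
```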
    - diff --git a/website/src/feed.xml b/website/src/feed.xml deleted file mode 100644 index ccb6f511ff06..000000000000 --- a/website/src/feed.xml +++ /dev/null @@ -1,39 +0,0 @@ ---- -layout: null ---- - - - - - {{ site.title | xml_escape }} - {{ site.description | xml_escape }} - {{ site.url }}{{ site.baseurl }}/ - - Jekyll v{{ jekyll.version }} - {% for post in site.posts limit:10 %} - - {{ post.title | xml_escape }} - {{ post.content | xml_escape }} - {{ post.date | date_to_rfc822 }} - {{ post.url | prepend: site.baseurl | prepend: site.url }} - {{ post.url | prepend: site.baseurl | prepend: site.url }} - {% for tag in post.tags %} - {{ tag | xml_escape }} - {% endfor %} - {% for cat in post.categories %} - {{ cat | xml_escape }} - {% endfor %} - - {% endfor %} - - diff --git a/website/src/index.md b/website/src/index.md deleted file mode 100644 index e7fd2e7ed008..000000000000 --- a/website/src/index.md +++ /dev/null @@ -1,176 +0,0 @@ ---- -layout: default -body_class: body--index - -logos: -- title: APEX - image_url: /images/logo_apex.png - url: "https://apex.apache.org" -- title: Flink - image_url: /images/logo_flink.png - url: "https://flink.apache.org" -- title: Spark - image_url: /images/logo_spark.png - url: https://spark.apache.org/ -- title: Google Cloud Dataflow - image_url: /images/logo_google_cloud.png - url: https://cloud.google.com/dataflow/ -- title: Gearpump - image_url: /images/logo_gearpump.png - url: https://gearpump.apache.org/ -- title: Samza - image_url: /images/logo_samza.png - url: https://samza.apache.org/ - -pillars: -- title: Unified - body: Use a single programming model for both batch and streaming use cases. -- title: Portable - body: Execute pipelines on multiple execution environments. -- title: Extensible - body: Write and share new SDKs, IO connectors, and transformation libraries. - -cards: -- quote: "A framework that delivers the flexibility and advanced functionality our customers need." - name: –Talend -- quote: "Apache Beam has powerful semantics that solve real-world challenges of stream processing." - name: –PayPal -- quote: "Apache Beam represents a principled approach for analyzing data streams." - name: –data Artisans ---- - -
    -
    -
    -
    -
    -
    - Apache Beam: An advanced unified programming model -
    -
    - Implement batch and streaming data processing jobs that run on any execution engine. -
    - - -
    -
    -
    -
    -
    - The latest from the blog -
    -
    - {% for post in site.posts limit:3 %} - -
    {{post.title}}
    -
    {{ post.date | date: "%b %-d, %Y" }}
    -
    - {% endfor %} -
    -
    -
    -
    -
    -
    - -
    -
    - All about Apache Beam -
    -
    - {% for pillar in page.pillars %} -
    -
    - {{pillar.title}} -
    -
    - {{pillar.body}} -
    -
    - {% endfor %} -
    -
    - -
    -
    -Beam architecture -
    -
    - -
    -
    - Works with -
    -
    - {% for logo in page.logos %} - - {% endfor %} -
    -
    - -
    -
    -
    - Testimonials -
    -
    - {% for card in page.cards %} -
    -
    - {{card.quote}} -
    -
    - -
    - {{card.name}} -
    -
    -
    - {% endfor %} -
    -
    - Beam is an open source community and contributions are greatly appreciated! - If you’d like to contribute, please see the Contribute section. -
    -
    -
    - - diff --git a/website/src/v2/index.md b/website/src/v2/index.md deleted file mode 100644 index 43c0508b5280..000000000000 --- a/website/src/v2/index.md +++ /dev/null @@ -1,51 +0,0 @@ ---- -layout: v2home ---- - -
    -
    - -
    - - - -
    -
    - -
    -
    diff --git a/website/www/build_github_samples.sh b/website/www/build_github_samples.sh new file mode 100755 index 000000000000..b47e7d1286cf --- /dev/null +++ b/website/www/build_github_samples.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -euo pipefail + +MY_DIR="$(cd "$(dirname "$0")" && pwd)" +pushd "${MY_DIR}" &>/dev/null || exit 1 + +echo "Working directory: ${MY_DIR}" + +DIST_DIR=${1:-"./site/github_samples"} +echo "Dist directory: ${DIST_DIR}" + +CONTENT_DIR=${2:-"./site/content"} + +mapfile -t github_urls < <(grep -rh "{{< github_sample" "${CONTENT_DIR}" | sed -e 's/^.*"\(.*\)".*$/\1/g' | sort | uniq | sed 's/\/blob\//\//g' | xargs -n 1 echo) + +mkdir -pv "${DIST_DIR}" + +for url in "${github_urls[@]}" +do + fileName=$(echo "$url" | sed -e 's/\//_/g') + curl -o "$DIST_DIR"/"$fileName" "https://raw.githubusercontent.com$url" +done + +popd &>/dev/null || exit 1 diff --git a/website/www/check-links.sh b/website/www/check-links.sh new file mode 100755 index 000000000000..9fac1970d1f6 --- /dev/null +++ b/website/www/check-links.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -euo pipefail + + +function redraw_progress_bar { # int barsize, int base, int current, int top + # Source: https://stackoverflow.com/a/20311674 + local barsize=$1 + local base=$2 + local current=$3 + local top=$4 + local j=0 + local progress=$(( (barsize * (current - base)) / (top - base ) )) + echo -n "[" + for ((j=0; j < progress; j++)) ; do echo -n '='; done + echo -n '=>' + for ((j=progress; j < barsize ; j++)) ; do echo -n ' '; done + echo -n "] $current / $top " $'\r' +} + +if ! command -v lynx; then + echo "This script requires lynx to work properly." + echo + echo "For more information, look at: http://lynx.browser.org/" + exit 1 +fi + +MY_DIR="$(cd "$(dirname "$0")" && pwd)" +pushd "${MY_DIR}" &>/dev/null || exit 1 + +echo "Working directory: ${MY_DIR}" + +DIST_DIR=${1:-"./dist"} +echo "Dist directory: ${DIST_DIR}" + +echo "" + +if [[ ! 
-f "${DIST_DIR}/index.html" ]]; then + echo "You should build website first." + exit 1 +fi + +mkdir -pv "${DIST_DIR}" + +readarray -d '' pages < <(find "${DIST_DIR}" -name '*.html' -print0) +echo "Found ${#pages[@]} HTML files." + +echo "Searching links." +mapfile -t links < <(printf '%s\n' "${pages[@]}" | xargs -n 1 lynx -listonly -nonumbers -dump -display_charset=iso-8859-1 | grep -v " ") +mapfile -t external_links < <(printf '%s\n' "${links[@]}" | grep "^https\?://" | grep -v "http://localhost" | grep -v "http://link/" | grep -v "http://docker.local" | grep -v "https://github.com/apache/beam/edit/master/website/www/site/content/" | sort | uniq) +echo "Found ${#links[@]} links including ${#external_links[@]} unique external links." + +echo "Checking links." +invalid_links=() +i=1 +for external_link in "${external_links[@]}" +do + redraw_progress_bar 50 1 $i ${#external_links[@]} + + if ! curl -sSfL --max-time 60 --connect-timeout 30 --retry 3 -4 "${external_link}" > /dev/null ; then + invalid_links+=("${external_link}") + echo "${external_link}" + fi + i=$((i+1)) +done +# Clear line - hide progress bar +echo -n -e "\033[2K" + + +if [[ ${#invalid_links[@]} -ne 0 ]]; then + echo "Found ${#invalid_links[@]} invalid links: " + printf '%s\n' "${invalid_links[@]}" +else + echo "All links work" +fi + +popd &>/dev/null || exit 1 diff --git a/website/www/package.json b/website/www/package.json new file mode 100644 index 000000000000..a5b1cbea637d --- /dev/null +++ b/website/www/package.json @@ -0,0 +1,19 @@ +{ + "name": "apache-beam-website", + "version": "1.0.0", + "description": "Apache Beam website", + "repository": "apache/beam", + "license": "MIT", + "scripts": { + "build_github_samples": "./build_github_samples.sh", + "develop": "cd site && hugo server", + "build": "cross-env HUGO_ENV=production hugo -d ../dist -s site -v", + "start": "hugo -d ../dist -s site -vw" + }, + "dependencies": {}, + "devDependencies": { + "autoprefixer": "^9.7.4", + "cross-env": "^7.0.2", + "postcss-cli": "^7.1.0" + } +} diff --git a/website/src/_includes/capability-matrix-row-full.md b/website/www/site/archetypes/blog.md similarity index 71% rename from website/src/_includes/capability-matrix-row-full.md rename to website/www/site/archetypes/blog.md index 898203ae7c35..d8c12fef6b7f 100644 --- a/website/src/_includes/capability-matrix-row-full.md +++ b/website/www/site/archetypes/blog.md @@ -1,3 +1,11 @@ +--- +title: "{{ replace .Name "-" " " | title }}" +date: "{{ .Date }}" +categories: + - blog +authors: + - "Your Name" +--- -
    {{ val.l1 }}{% if val.l2 != '' %}: {{ val.l2 }}{% endif %}{% if val.jira %}
    ({{ val.jira }}){% endif %}

    {{ val.l3 }} diff --git a/website/src/_includes/capability-matrix-row-blog.md b/website/www/site/archetypes/default.md similarity index 81% rename from website/src/_includes/capability-matrix-row-blog.md rename to website/www/site/archetypes/default.md index 99685946b4bc..21d1b8ff6754 100644 --- a/website/src/_includes/capability-matrix-row-blog.md +++ b/website/www/site/archetypes/default.md @@ -1,3 +1,6 @@ +--- +title: "{{ replace .Name "-" " " | title }}" +--- -
    {% if val.l1 == 'Yes' %}✓{% elsif val.l1 == 'Partially' %}~{% else %}✕{% endif %}
    diff --git a/website/src/_sass/_bootstrap.scss b/website/www/site/assets/scss/_bootstrap.scss similarity index 100% rename from website/src/_sass/_bootstrap.scss rename to website/www/site/assets/scss/_bootstrap.scss diff --git a/website/src/_sass/_breakpoints.sass b/website/www/site/assets/scss/_breakpoints.sass similarity index 100% rename from website/src/_sass/_breakpoints.sass rename to website/www/site/assets/scss/_breakpoints.sass diff --git a/website/src/_sass/_button.sass b/website/www/site/assets/scss/_button.sass similarity index 100% rename from website/src/_sass/_button.sass rename to website/www/site/assets/scss/_button.sass diff --git a/website/src/_sass/_cards.sass b/website/www/site/assets/scss/_cards.sass similarity index 100% rename from website/src/_sass/_cards.sass rename to website/www/site/assets/scss/_cards.sass diff --git a/website/src/_sass/_ctas.sass b/website/www/site/assets/scss/_ctas.sass similarity index 100% rename from website/src/_sass/_ctas.sass rename to website/www/site/assets/scss/_ctas.sass diff --git a/website/src/_sass/_footer.sass b/website/www/site/assets/scss/_footer.sass similarity index 100% rename from website/src/_sass/_footer.sass rename to website/www/site/assets/scss/_footer.sass diff --git a/website/src/_sass/_global.sass b/website/www/site/assets/scss/_global.sass similarity index 100% rename from website/src/_sass/_global.sass rename to website/www/site/assets/scss/_global.sass diff --git a/website/src/_sass/_graphic.sass b/website/www/site/assets/scss/_graphic.sass similarity index 100% rename from website/src/_sass/_graphic.sass rename to website/www/site/assets/scss/_graphic.sass diff --git a/website/src/_sass/_header.sass b/website/www/site/assets/scss/_header.sass similarity index 100% rename from website/src/_sass/_header.sass rename to website/www/site/assets/scss/_header.sass diff --git a/website/src/_sass/_hero.sass b/website/www/site/assets/scss/_hero.sass similarity index 100% rename from website/src/_sass/_hero.sass rename to website/www/site/assets/scss/_hero.sass diff --git a/website/src/_sass/_layout.scss b/website/www/site/assets/scss/_layout.scss similarity index 100% rename from website/src/_sass/_layout.scss rename to website/www/site/assets/scss/_layout.scss diff --git a/website/src/_sass/_logos.sass b/website/www/site/assets/scss/_logos.sass similarity index 100% rename from website/src/_sass/_logos.sass rename to website/www/site/assets/scss/_logos.sass diff --git a/website/src/_sass/_navbar.sass b/website/www/site/assets/scss/_navbar.sass similarity index 100% rename from website/src/_sass/_navbar.sass rename to website/www/site/assets/scss/_navbar.sass diff --git a/website/src/_sass/_page-nav.sass b/website/www/site/assets/scss/_page-nav.sass similarity index 90% rename from website/src/_sass/_page-nav.sass rename to website/www/site/assets/scss/_page-nav.sass index d3769836e9d9..542a4222b20a 100644 --- a/website/src/_sass/_page-nav.sass +++ b/website/www/site/assets/scss/_page-nav.sass @@ -51,3 +51,12 @@ padding: 0 30px position: relative width: 100% + + #TableOfContents + > ul + padding-left: 0 + margin-bottom: 0 + ul + padding-left: 20 + ul + display: none diff --git a/website/src/_sass/_pillars.sass b/website/www/site/assets/scss/_pillars.sass similarity index 100% rename from website/src/_sass/_pillars.sass rename to website/www/site/assets/scss/_pillars.sass diff --git a/website/src/_sass/_section-nav.sass b/website/www/site/assets/scss/_section-nav.sass similarity index 100% rename from 
website/src/_sass/_section-nav.sass rename to website/www/site/assets/scss/_section-nav.sass diff --git a/website/src/_sass/_syntax-highlighting.scss b/website/www/site/assets/scss/_syntax-highlighting.scss similarity index 100% rename from website/src/_sass/_syntax-highlighting.scss rename to website/www/site/assets/scss/_syntax-highlighting.scss diff --git a/website/www/site/assets/scss/_table-wrapper.sass b/website/www/site/assets/scss/_table-wrapper.sass new file mode 100644 index 000000000000..299b0019f62c --- /dev/null +++ b/website/www/site/assets/scss/_table-wrapper.sass @@ -0,0 +1,24 @@ +/*! + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +.table-wrapper + > table + @extend .table + +.table-bordered-wrapper + > table + @extend .table-bordered diff --git a/website/src/_sass/_toggler-nav.scss b/website/www/site/assets/scss/_toggler-nav.scss similarity index 100% rename from website/src/_sass/_toggler-nav.scss rename to website/www/site/assets/scss/_toggler-nav.scss diff --git a/website/src/_sass/_type.sass b/website/www/site/assets/scss/_type.sass similarity index 100% rename from website/src/_sass/_type.sass rename to website/www/site/assets/scss/_type.sass diff --git a/website/src/_sass/_vars.sass b/website/www/site/assets/scss/_vars.sass similarity index 100% rename from website/src/_sass/_vars.sass rename to website/www/site/assets/scss/_vars.sass diff --git a/website/src/_sass/bootstrap/_alerts.scss b/website/www/site/assets/scss/bootstrap/_alerts.scss similarity index 100% rename from website/src/_sass/bootstrap/_alerts.scss rename to website/www/site/assets/scss/bootstrap/_alerts.scss diff --git a/website/src/_sass/bootstrap/_badges.scss b/website/www/site/assets/scss/bootstrap/_badges.scss similarity index 100% rename from website/src/_sass/bootstrap/_badges.scss rename to website/www/site/assets/scss/bootstrap/_badges.scss diff --git a/website/src/_sass/bootstrap/_breadcrumbs.scss b/website/www/site/assets/scss/bootstrap/_breadcrumbs.scss similarity index 100% rename from website/src/_sass/bootstrap/_breadcrumbs.scss rename to website/www/site/assets/scss/bootstrap/_breadcrumbs.scss diff --git a/website/src/_sass/bootstrap/_button-groups.scss b/website/www/site/assets/scss/bootstrap/_button-groups.scss similarity index 100% rename from website/src/_sass/bootstrap/_button-groups.scss rename to website/www/site/assets/scss/bootstrap/_button-groups.scss diff --git a/website/src/_sass/bootstrap/_buttons.scss b/website/www/site/assets/scss/bootstrap/_buttons.scss similarity index 100% rename from website/src/_sass/bootstrap/_buttons.scss rename to website/www/site/assets/scss/bootstrap/_buttons.scss diff --git a/website/src/_sass/bootstrap/_carousel.scss b/website/www/site/assets/scss/bootstrap/_carousel.scss similarity index 100% 
rename from website/src/_sass/bootstrap/_carousel.scss rename to website/www/site/assets/scss/bootstrap/_carousel.scss diff --git a/website/src/_sass/bootstrap/_close.scss b/website/www/site/assets/scss/bootstrap/_close.scss similarity index 100% rename from website/src/_sass/bootstrap/_close.scss rename to website/www/site/assets/scss/bootstrap/_close.scss diff --git a/website/src/_sass/bootstrap/_code.scss b/website/www/site/assets/scss/bootstrap/_code.scss similarity index 100% rename from website/src/_sass/bootstrap/_code.scss rename to website/www/site/assets/scss/bootstrap/_code.scss diff --git a/website/src/_sass/bootstrap/_component-animations.scss b/website/www/site/assets/scss/bootstrap/_component-animations.scss similarity index 100% rename from website/src/_sass/bootstrap/_component-animations.scss rename to website/www/site/assets/scss/bootstrap/_component-animations.scss diff --git a/website/src/_sass/bootstrap/_dropdowns.scss b/website/www/site/assets/scss/bootstrap/_dropdowns.scss similarity index 100% rename from website/src/_sass/bootstrap/_dropdowns.scss rename to website/www/site/assets/scss/bootstrap/_dropdowns.scss diff --git a/website/src/_sass/bootstrap/_forms.scss b/website/www/site/assets/scss/bootstrap/_forms.scss similarity index 100% rename from website/src/_sass/bootstrap/_forms.scss rename to website/www/site/assets/scss/bootstrap/_forms.scss diff --git a/website/src/_sass/bootstrap/_glyphicons.scss b/website/www/site/assets/scss/bootstrap/_glyphicons.scss similarity index 100% rename from website/src/_sass/bootstrap/_glyphicons.scss rename to website/www/site/assets/scss/bootstrap/_glyphicons.scss diff --git a/website/src/_sass/bootstrap/_grid.scss b/website/www/site/assets/scss/bootstrap/_grid.scss similarity index 100% rename from website/src/_sass/bootstrap/_grid.scss rename to website/www/site/assets/scss/bootstrap/_grid.scss diff --git a/website/src/_sass/bootstrap/_input-groups.scss b/website/www/site/assets/scss/bootstrap/_input-groups.scss similarity index 100% rename from website/src/_sass/bootstrap/_input-groups.scss rename to website/www/site/assets/scss/bootstrap/_input-groups.scss diff --git a/website/src/_sass/bootstrap/_jumbotron.scss b/website/www/site/assets/scss/bootstrap/_jumbotron.scss similarity index 100% rename from website/src/_sass/bootstrap/_jumbotron.scss rename to website/www/site/assets/scss/bootstrap/_jumbotron.scss diff --git a/website/src/_sass/bootstrap/_labels.scss b/website/www/site/assets/scss/bootstrap/_labels.scss similarity index 100% rename from website/src/_sass/bootstrap/_labels.scss rename to website/www/site/assets/scss/bootstrap/_labels.scss diff --git a/website/src/_sass/bootstrap/_list-group.scss b/website/www/site/assets/scss/bootstrap/_list-group.scss similarity index 100% rename from website/src/_sass/bootstrap/_list-group.scss rename to website/www/site/assets/scss/bootstrap/_list-group.scss diff --git a/website/src/_sass/bootstrap/_media.scss b/website/www/site/assets/scss/bootstrap/_media.scss similarity index 100% rename from website/src/_sass/bootstrap/_media.scss rename to website/www/site/assets/scss/bootstrap/_media.scss diff --git a/website/src/_sass/bootstrap/_mixins.scss b/website/www/site/assets/scss/bootstrap/_mixins.scss similarity index 100% rename from website/src/_sass/bootstrap/_mixins.scss rename to website/www/site/assets/scss/bootstrap/_mixins.scss diff --git a/website/src/_sass/bootstrap/_modals.scss b/website/www/site/assets/scss/bootstrap/_modals.scss similarity index 100% rename from 
website/src/_sass/bootstrap/_modals.scss rename to website/www/site/assets/scss/bootstrap/_modals.scss diff --git a/website/src/_sass/bootstrap/_navbar.scss b/website/www/site/assets/scss/bootstrap/_navbar.scss similarity index 100% rename from website/src/_sass/bootstrap/_navbar.scss rename to website/www/site/assets/scss/bootstrap/_navbar.scss diff --git a/website/src/_sass/bootstrap/_navs.scss b/website/www/site/assets/scss/bootstrap/_navs.scss similarity index 100% rename from website/src/_sass/bootstrap/_navs.scss rename to website/www/site/assets/scss/bootstrap/_navs.scss diff --git a/website/src/_sass/bootstrap/_normalize.scss b/website/www/site/assets/scss/bootstrap/_normalize.scss similarity index 100% rename from website/src/_sass/bootstrap/_normalize.scss rename to website/www/site/assets/scss/bootstrap/_normalize.scss diff --git a/website/src/_sass/bootstrap/_pager.scss b/website/www/site/assets/scss/bootstrap/_pager.scss similarity index 100% rename from website/src/_sass/bootstrap/_pager.scss rename to website/www/site/assets/scss/bootstrap/_pager.scss diff --git a/website/src/_sass/bootstrap/_pagination.scss b/website/www/site/assets/scss/bootstrap/_pagination.scss similarity index 100% rename from website/src/_sass/bootstrap/_pagination.scss rename to website/www/site/assets/scss/bootstrap/_pagination.scss diff --git a/website/src/_sass/bootstrap/_panels.scss b/website/www/site/assets/scss/bootstrap/_panels.scss similarity index 100% rename from website/src/_sass/bootstrap/_panels.scss rename to website/www/site/assets/scss/bootstrap/_panels.scss diff --git a/website/src/_sass/bootstrap/_popovers.scss b/website/www/site/assets/scss/bootstrap/_popovers.scss similarity index 100% rename from website/src/_sass/bootstrap/_popovers.scss rename to website/www/site/assets/scss/bootstrap/_popovers.scss diff --git a/website/src/_sass/bootstrap/_print.scss b/website/www/site/assets/scss/bootstrap/_print.scss similarity index 100% rename from website/src/_sass/bootstrap/_print.scss rename to website/www/site/assets/scss/bootstrap/_print.scss diff --git a/website/src/_sass/bootstrap/_progress-bars.scss b/website/www/site/assets/scss/bootstrap/_progress-bars.scss similarity index 100% rename from website/src/_sass/bootstrap/_progress-bars.scss rename to website/www/site/assets/scss/bootstrap/_progress-bars.scss diff --git a/website/src/_sass/bootstrap/_responsive-embed.scss b/website/www/site/assets/scss/bootstrap/_responsive-embed.scss similarity index 100% rename from website/src/_sass/bootstrap/_responsive-embed.scss rename to website/www/site/assets/scss/bootstrap/_responsive-embed.scss diff --git a/website/src/_sass/bootstrap/_responsive-utilities.scss b/website/www/site/assets/scss/bootstrap/_responsive-utilities.scss similarity index 100% rename from website/src/_sass/bootstrap/_responsive-utilities.scss rename to website/www/site/assets/scss/bootstrap/_responsive-utilities.scss diff --git a/website/src/_sass/bootstrap/_scaffolding.scss b/website/www/site/assets/scss/bootstrap/_scaffolding.scss similarity index 100% rename from website/src/_sass/bootstrap/_scaffolding.scss rename to website/www/site/assets/scss/bootstrap/_scaffolding.scss diff --git a/website/src/_sass/bootstrap/_tables.scss b/website/www/site/assets/scss/bootstrap/_tables.scss similarity index 100% rename from website/src/_sass/bootstrap/_tables.scss rename to website/www/site/assets/scss/bootstrap/_tables.scss diff --git a/website/src/_sass/bootstrap/_theme.scss 
b/website/www/site/assets/scss/bootstrap/_theme.scss similarity index 100% rename from website/src/_sass/bootstrap/_theme.scss rename to website/www/site/assets/scss/bootstrap/_theme.scss diff --git a/website/src/_sass/bootstrap/_thumbnails.scss b/website/www/site/assets/scss/bootstrap/_thumbnails.scss similarity index 100% rename from website/src/_sass/bootstrap/_thumbnails.scss rename to website/www/site/assets/scss/bootstrap/_thumbnails.scss diff --git a/website/src/_sass/bootstrap/_tooltip.scss b/website/www/site/assets/scss/bootstrap/_tooltip.scss similarity index 100% rename from website/src/_sass/bootstrap/_tooltip.scss rename to website/www/site/assets/scss/bootstrap/_tooltip.scss diff --git a/website/src/_sass/bootstrap/_type.scss b/website/www/site/assets/scss/bootstrap/_type.scss similarity index 100% rename from website/src/_sass/bootstrap/_type.scss rename to website/www/site/assets/scss/bootstrap/_type.scss diff --git a/website/src/_sass/bootstrap/_utilities.scss b/website/www/site/assets/scss/bootstrap/_utilities.scss similarity index 100% rename from website/src/_sass/bootstrap/_utilities.scss rename to website/www/site/assets/scss/bootstrap/_utilities.scss diff --git a/website/src/_sass/bootstrap/_variables.scss b/website/www/site/assets/scss/bootstrap/_variables.scss similarity index 100% rename from website/src/_sass/bootstrap/_variables.scss rename to website/www/site/assets/scss/bootstrap/_variables.scss diff --git a/website/src/_sass/bootstrap/_wells.scss b/website/www/site/assets/scss/bootstrap/_wells.scss similarity index 100% rename from website/src/_sass/bootstrap/_wells.scss rename to website/www/site/assets/scss/bootstrap/_wells.scss diff --git a/website/src/_sass/bootstrap/mixins/_alerts.scss b/website/www/site/assets/scss/bootstrap/mixins/_alerts.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_alerts.scss rename to website/www/site/assets/scss/bootstrap/mixins/_alerts.scss diff --git a/website/src/_sass/bootstrap/mixins/_background-variant.scss b/website/www/site/assets/scss/bootstrap/mixins/_background-variant.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_background-variant.scss rename to website/www/site/assets/scss/bootstrap/mixins/_background-variant.scss diff --git a/website/src/_sass/bootstrap/mixins/_border-radius.scss b/website/www/site/assets/scss/bootstrap/mixins/_border-radius.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_border-radius.scss rename to website/www/site/assets/scss/bootstrap/mixins/_border-radius.scss diff --git a/website/src/_sass/bootstrap/mixins/_buttons.scss b/website/www/site/assets/scss/bootstrap/mixins/_buttons.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_buttons.scss rename to website/www/site/assets/scss/bootstrap/mixins/_buttons.scss diff --git a/website/src/_sass/bootstrap/mixins/_center-block.scss b/website/www/site/assets/scss/bootstrap/mixins/_center-block.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_center-block.scss rename to website/www/site/assets/scss/bootstrap/mixins/_center-block.scss diff --git a/website/src/_sass/bootstrap/mixins/_clearfix.scss b/website/www/site/assets/scss/bootstrap/mixins/_clearfix.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_clearfix.scss rename to website/www/site/assets/scss/bootstrap/mixins/_clearfix.scss diff --git a/website/src/_sass/bootstrap/mixins/_forms.scss 
b/website/www/site/assets/scss/bootstrap/mixins/_forms.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_forms.scss rename to website/www/site/assets/scss/bootstrap/mixins/_forms.scss diff --git a/website/src/_sass/bootstrap/mixins/_gradients.scss b/website/www/site/assets/scss/bootstrap/mixins/_gradients.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_gradients.scss rename to website/www/site/assets/scss/bootstrap/mixins/_gradients.scss diff --git a/website/src/_sass/bootstrap/mixins/_grid-framework.scss b/website/www/site/assets/scss/bootstrap/mixins/_grid-framework.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_grid-framework.scss rename to website/www/site/assets/scss/bootstrap/mixins/_grid-framework.scss diff --git a/website/src/_sass/bootstrap/mixins/_grid.scss b/website/www/site/assets/scss/bootstrap/mixins/_grid.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_grid.scss rename to website/www/site/assets/scss/bootstrap/mixins/_grid.scss diff --git a/website/src/_sass/bootstrap/mixins/_hide-text.scss b/website/www/site/assets/scss/bootstrap/mixins/_hide-text.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_hide-text.scss rename to website/www/site/assets/scss/bootstrap/mixins/_hide-text.scss diff --git a/website/src/_sass/bootstrap/mixins/_image.scss b/website/www/site/assets/scss/bootstrap/mixins/_image.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_image.scss rename to website/www/site/assets/scss/bootstrap/mixins/_image.scss diff --git a/website/src/_sass/bootstrap/mixins/_labels.scss b/website/www/site/assets/scss/bootstrap/mixins/_labels.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_labels.scss rename to website/www/site/assets/scss/bootstrap/mixins/_labels.scss diff --git a/website/src/_sass/bootstrap/mixins/_list-group.scss b/website/www/site/assets/scss/bootstrap/mixins/_list-group.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_list-group.scss rename to website/www/site/assets/scss/bootstrap/mixins/_list-group.scss diff --git a/website/src/_sass/bootstrap/mixins/_nav-divider.scss b/website/www/site/assets/scss/bootstrap/mixins/_nav-divider.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_nav-divider.scss rename to website/www/site/assets/scss/bootstrap/mixins/_nav-divider.scss diff --git a/website/src/_sass/bootstrap/mixins/_nav-vertical-align.scss b/website/www/site/assets/scss/bootstrap/mixins/_nav-vertical-align.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_nav-vertical-align.scss rename to website/www/site/assets/scss/bootstrap/mixins/_nav-vertical-align.scss diff --git a/website/src/_sass/bootstrap/mixins/_opacity.scss b/website/www/site/assets/scss/bootstrap/mixins/_opacity.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_opacity.scss rename to website/www/site/assets/scss/bootstrap/mixins/_opacity.scss diff --git a/website/src/_sass/bootstrap/mixins/_pagination.scss b/website/www/site/assets/scss/bootstrap/mixins/_pagination.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_pagination.scss rename to website/www/site/assets/scss/bootstrap/mixins/_pagination.scss diff --git a/website/src/_sass/bootstrap/mixins/_panels.scss b/website/www/site/assets/scss/bootstrap/mixins/_panels.scss similarity index 100% rename from 
website/src/_sass/bootstrap/mixins/_panels.scss rename to website/www/site/assets/scss/bootstrap/mixins/_panels.scss diff --git a/website/src/_sass/bootstrap/mixins/_progress-bar.scss b/website/www/site/assets/scss/bootstrap/mixins/_progress-bar.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_progress-bar.scss rename to website/www/site/assets/scss/bootstrap/mixins/_progress-bar.scss diff --git a/website/src/_sass/bootstrap/mixins/_reset-filter.scss b/website/www/site/assets/scss/bootstrap/mixins/_reset-filter.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_reset-filter.scss rename to website/www/site/assets/scss/bootstrap/mixins/_reset-filter.scss diff --git a/website/src/_sass/bootstrap/mixins/_reset-text.scss b/website/www/site/assets/scss/bootstrap/mixins/_reset-text.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_reset-text.scss rename to website/www/site/assets/scss/bootstrap/mixins/_reset-text.scss diff --git a/website/src/_sass/bootstrap/mixins/_resize.scss b/website/www/site/assets/scss/bootstrap/mixins/_resize.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_resize.scss rename to website/www/site/assets/scss/bootstrap/mixins/_resize.scss diff --git a/website/src/_sass/bootstrap/mixins/_responsive-visibility.scss b/website/www/site/assets/scss/bootstrap/mixins/_responsive-visibility.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_responsive-visibility.scss rename to website/www/site/assets/scss/bootstrap/mixins/_responsive-visibility.scss diff --git a/website/src/_sass/bootstrap/mixins/_size.scss b/website/www/site/assets/scss/bootstrap/mixins/_size.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_size.scss rename to website/www/site/assets/scss/bootstrap/mixins/_size.scss diff --git a/website/src/_sass/bootstrap/mixins/_tab-focus.scss b/website/www/site/assets/scss/bootstrap/mixins/_tab-focus.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_tab-focus.scss rename to website/www/site/assets/scss/bootstrap/mixins/_tab-focus.scss diff --git a/website/src/_sass/bootstrap/mixins/_table-row.scss b/website/www/site/assets/scss/bootstrap/mixins/_table-row.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_table-row.scss rename to website/www/site/assets/scss/bootstrap/mixins/_table-row.scss diff --git a/website/src/_sass/bootstrap/mixins/_text-emphasis.scss b/website/www/site/assets/scss/bootstrap/mixins/_text-emphasis.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_text-emphasis.scss rename to website/www/site/assets/scss/bootstrap/mixins/_text-emphasis.scss diff --git a/website/src/_sass/bootstrap/mixins/_text-overflow.scss b/website/www/site/assets/scss/bootstrap/mixins/_text-overflow.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_text-overflow.scss rename to website/www/site/assets/scss/bootstrap/mixins/_text-overflow.scss diff --git a/website/src/_sass/bootstrap/mixins/_vendor-prefixes.scss b/website/www/site/assets/scss/bootstrap/mixins/_vendor-prefixes.scss similarity index 100% rename from website/src/_sass/bootstrap/mixins/_vendor-prefixes.scss rename to website/www/site/assets/scss/bootstrap/mixins/_vendor-prefixes.scss diff --git a/website/src/_sass/capability-matrix.scss b/website/www/site/assets/scss/capability-matrix.scss similarity index 100% rename from website/src/_sass/capability-matrix.scss rename to 
website/www/site/assets/scss/capability-matrix.scss diff --git a/website/src/css/site.scss b/website/www/site/assets/scss/main.scss similarity index 97% rename from website/src/css/site.scss rename to website/www/site/assets/scss/main.scss index 84390732d01b..a2983ad59473 100644 --- a/website/src/css/site.scss +++ b/website/www/site/assets/scss/main.scss @@ -1,5 +1,3 @@ ---- ---- /** Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +11,7 @@ See the License for the specific language governing permissions and limitations under the License. */ + // Legacy. @import "bootstrap"; @import "capability-matrix"; @@ -40,3 +39,4 @@ @import "_pillars.sass"; @import "_section-nav.sass"; @import "_page-nav.sass"; +@import "_table-wrapper.sass"; diff --git a/website/www/site/config.toml b/website/www/site/config.toml new file mode 100644 index 000000000000..9c4bcd179672 --- /dev/null +++ b/website/www/site/config.toml @@ -0,0 +1,112 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +baseURL = "/" +title = "Apache Beam" + +enableRobotsTXT = true + +# Hugo allows theme composition (and inheritance). The precedence is from left to right. +theme = ["docsy"] + +# Will give values to .Lastmod etc. +enableGitInfo = true + +# Language settings +contentDir = "content/en" +defaultContentLanguage = "en" +defaultContentLanguageInSubdir = false +# Useful when translating. +enableMissingTranslationPlaceholders = true + +# Highlighting config +pygmentsCodeFences = true +pygmentsUseClasses = false +# Use the new Chroma Go highlighter in Hugo. +pygmentsUseClassic = false +#pygmentsOptions = "linenos=table" +# See https://help.farbox.com/pygments.html +pygmentsStyle = "tango" + +summaryLength = "unlimited" + +canonifyURLs = true + +[markup.goldmark.renderer] +unsafe= true + +[markup.highlight] +noClasses = false + +[markup] + [markup.tableOfContents] + endLevel = 4 + +## Configuration for BlackFriday markdown parser: https://github.com/russross/blackfriday +[blackfriday] +plainIDAnchors = true +hrefTargetBlank = true +angledQuotes = false +latexDashes = true + +# Image processing configuration. +[imaging] +resampleFilter = "CatmullRom" +quality = 75 +anchor = "smart" + +[services] +[services.googleAnalytics] +# Comment out the next line to disable GA tracking. Also disables the feature described in [params.ui.feedback]. +# id = "UA-73650088-1" + +# Language configuration + +[languages] +[languages.en] +title = "Apache Beam" +description = "Apache Beam is an open source, unified model and set of language-specific SDKs for defining and executing data processing workflows, and also data ingestion and integration flows, supporting Enterprise Integration Patterns (EIPs) and Domain Specific Languages (DSLs). Dataflow pipelines simplify the mechanics of large-scale batch and streaming data processing and can run on a number of runtimes like Apache Flink, Apache Spark, and Google Cloud Dataflow (a cloud service). 
Beam also brings DSL in different languages, allowing users to easily implement their data integration processes." +languageName ="English" +# Weight used for sorting. +weight = 1 + +# RSS template configuration +[outputs] +home = ["HTML","FEED"] + +[mediaTypes] +[mediaTypes."application/rss"] +suffixes = ["xml"] + +[outputFormats] +[outputFormats.FEED] +mediatype = "application/rss" +baseName = "feed" + +# Everything below this are Site Params + +# First one is picked as the Twitter card image if not set on page. +# images = ["images/feature-image.png"] + +# Repository configuration (URLs for in-page links to opening issues and suggesting changes) +github_repo = "https://github.com/apache/beam" +# An optional link to a related project repo. For example, the sibling repository where your product code lives. +github_project_repo = "https://github.com/apache/beam" + +[params] +description = "Apache Beam is an open source, unified model and set of language-specific SDKs for defining and executing data processing workflows, and also data ingestion and integration flows, supporting Enterprise Integration Patterns (EIPs) and Domain Specific Languages (DSLs). Dataflow pipelines simplify the mechanics of large-scale batch and streaming data processing and can run on a number of runtimes like Apache Flink, Apache Spark, and Google Cloud Dataflow (a cloud service). Beam also brings DSL in different languages, allowing users to easily implement their data integration processes." +release_latest = "2.20.0" +# The repository and branch where the files live in Github or Colab. This is used +# to serve and stage from your local branch, but publish to the master branch. +# e.g. https://github.com/{{< param branch_repo >}}/path/to/notebook.ipynb +# e.g. https://colab.sandbox.google.com/github/{{< param branch_repo >}}/path/to/notebook.ipynb +branch_repo = "apache/beam/blob/master" diff --git a/website/www/site/content/en/_index.md b/website/www/site/content/en/_index.md new file mode 100644 index 000000000000..719e5e12431c --- /dev/null +++ b/website/www/site/content/en/_index.md @@ -0,0 +1,17 @@ +--- +title: "Apache Beam" +--- + + diff --git a/website/src/_posts/2017-01-09-added-apex-runner.md b/website/www/site/content/en/blog/added-apex-runner.md similarity index 90% rename from website/src/_posts/2017-01-09-added-apex-runner.md rename to website/www/site/content/en/blog/added-apex-runner.md index a2cd2d4d1881..d0482d47210d 100644 --- a/website/src/_posts/2017-01-09-added-apex-runner.md +++ b/website/www/site/content/en/blog/added-apex-runner.md @@ -1,9 +1,10 @@ --- -layout: post title: "Release 0.4.0 adds a runner for Apache Apex" date: 2017-01-09 10:00:01 -0700 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2017/01/09/added-apex-runner.html authors: - thw --- @@ -21,11 +22,11 @@ See the License for the specific language governing permissions and limitations under the License. --> -The latest release 0.4.0 of [Apache Beam]({{ site.baseurl }}/) adds a new runner for [Apache Apex](https://apex.apache.org/). We are excited to reach this initial milestone and are looking forward to continued collaboration between the Beam and Apex communities to advance the runner. +The latest release 0.4.0 of [Apache Beam](/) adds a new runner for [Apache Apex](https://apex.apache.org/). We are excited to reach this initial milestone and are looking forward to continued collaboration between the Beam and Apex communities to advance the runner. 
-Beam evolved from the Google Dataflow SDK and as incubator project has quickly adapted the Apache way, grown the community and attracts increasing interest from users that hope to benefit from a conceptual strong unified programming model that is portable between different big data processing frameworks (see [Streaming-101](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-101) and [Streaming-102](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-102)). Multiple Apache projects already provide runners for Beam (see [runners and capabilities matrix]({{ site.baseurl }}/documentation/runners/capability-matrix/)). +Beam evolved from the Google Dataflow SDK and as incubator project has quickly adapted the Apache way, grown the community and attracts increasing interest from users that hope to benefit from a conceptual strong unified programming model that is portable between different big data processing frameworks (see [Streaming-101](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-101) and [Streaming-102](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-102)). Multiple Apache projects already provide runners for Beam (see [runners and capabilities matrix](/documentation/runners/capability-matrix/)). Apex is a stream processing framework for low-latency, high-throughput, stateful and reliable processing of complex analytics pipelines on clusters. Apex was developed since 2012 and is used in production by large companies for real-time and batch processing at scale. @@ -41,7 +42,7 @@ A Beam runner needs to implement the translation from the Beam model to the unde ## Execution and Testing -In this release, the Apex runner executes the pipelines in embedded mode, where, similar to the direct runner, everything is executed in a single JVM. See [quickstart]({{ site.baseurl }}/get-started/quickstart/) on how to run the Beam examples with the Apex runner. +In this release, the Apex runner executes the pipelines in embedded mode, where, similar to the direct runner, everything is executed in a single JVM. See [quickstart](/get-started/quickstart/) on how to run the Beam examples with the Apex runner. Embedded mode is useful for development and debugging. Apex in production runs distributed on Apache Hadoop YARN clusters. An example how a Beam pipeline can be embedded into an Apex application package to run on YARN can be found [here](https://github.com/tweise/apex-samples/tree/master/beam-apex-wordcount) and support for direct launch in the runner is currently being worked on. diff --git a/website/src/_posts/2019-05-01-adding-data-sources-to-sql.md b/website/www/site/content/en/blog/adding-data-sources-to-sql.md similarity index 96% rename from website/src/_posts/2019-05-01-adding-data-sources-to-sql.md rename to website/www/site/content/en/blog/adding-data-sources-to-sql.md index f9d90e6e2126..244dda87d884 100644 --- a/website/src/_posts/2019-05-01-adding-data-sources-to-sql.md +++ b/website/www/site/content/en/blog/adding-data-sources-to-sql.md @@ -1,11 +1,12 @@ --- -layout: post title: "Adding new Data Sources to Beam SQL CLI" date: 2019-06-04 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2019/06/04/adding-data-sources-to-sql.html authors: - - pabloem + - pabloem --- -categories: blog +categories: + - blog +aliases: + - /blog/2019/02/15/beam-2.10.0.html authors: - klk @@ -23,7 +24,7 @@ limitations under the License. --> We are happy to present the new 2.10.0 release of Beam. 
This release includes both improvements and new functionality. -See the [download page]({{ site.baseurl }}/get-started/downloads/#2100-2019-02-01) for this release. +See the [download page](/get-started/downloads/#2100-2019-02-01) for this release. For more information on changes in 2.10.0, check out the [detailed release notes](https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12319527&version=12344540). diff --git a/website/src/_posts/2019-03-05-beam-2.11.0.md b/website/www/site/content/en/blog/beam-2.11.0.md similarity index 96% rename from website/src/_posts/2019-03-05-beam-2.11.0.md rename to website/www/site/content/en/blog/beam-2.11.0.md index 9de98d180b4c..0f04370df1d3 100644 --- a/website/src/_posts/2019-03-05-beam-2.11.0.md +++ b/website/www/site/content/en/blog/beam-2.11.0.md @@ -1,9 +1,10 @@ --- -layout: post title: "Apache Beam 2.11.0" date: 2019-03-05 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2019/03/05/beam-2.11.0.html authors: - altay @@ -23,7 +24,7 @@ limitations under the License. --> We are happy to present the new 2.11.0 release of Beam. This release includes both improvements and new functionality. -See the [download page]({{ site.baseurl }}/get-started/downloads/#2110-2019-02-26) for this release. +See the [download page](/get-started/downloads/#2110-2019-02-26) for this release. For more information on changes in 2.11.0, check out the [detailed release notes](https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12319527&version=12344775). diff --git a/website/src/_posts/2019-04-25-beam-2.12.0.md b/website/www/site/content/en/blog/beam-2.12.0.md similarity index 94% rename from website/src/_posts/2019-04-25-beam-2.12.0.md rename to website/www/site/content/en/blog/beam-2.12.0.md index 1d1aafb425f7..29659d1d75c4 100644 --- a/website/src/_posts/2019-04-25-beam-2.12.0.md +++ b/website/www/site/content/en/blog/beam-2.12.0.md @@ -1,9 +1,10 @@ --- -layout: post title: "Apache Beam 2.12.0" date: 2019-04-25 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2019/04/25/beam-2.12.0.html authors: - apilloud @@ -23,7 +24,7 @@ limitations under the License. --> We are happy to present the new 2.12.0 release of Beam. This release includes both improvements and new functionality. -See the [download page]({{ site.baseurl }}/get-started/downloads/#2120-2019-04-25) for this release. +See the [download page](/get-started/downloads/#2120-2019-04-25) for this release. For more information on changes in 2.12.0, check out the [detailed release notes](https://jira.apache.org/jira/secure/ReleaseNote.jspa?projectId=12319527&version=12344944). diff --git a/website/src/_posts/2019-05-22-beam-2.13.0.md b/website/www/site/content/en/blog/beam-2.13.0.md similarity index 93% rename from website/src/_posts/2019-05-22-beam-2.13.0.md rename to website/www/site/content/en/blog/beam-2.13.0.md index c81aa2d2f9da..20f402efd3ef 100644 --- a/website/src/_posts/2019-05-22-beam-2.13.0.md +++ b/website/www/site/content/en/blog/beam-2.13.0.md @@ -1,11 +1,11 @@ --- -layout: post title: "Apache Beam 2.13.0" date: 2019-06-07 00:00:01 -0800 +categories: + - blog # Date above corrected but keep the old URL: -permalink: /blog/2019/05/22/beam-2.13.0.html -excerpt_separator: -categories: blog +aliases: + - /blog/2019/05/22/beam-2.13.0.html authors: - goenka @@ -25,7 +25,7 @@ limitations under the License. --> We are happy to present the new 2.13.0 release of Beam. 
This release includes both improvements and new functionality. -See the [download page]({{ site.baseurl }}/get-started/downloads/#2130-2019-05-21) for this release. +See the [download page](/get-started/downloads/#2130-2019-05-21) for this release. For more information on changes in 2.13.0, check out the [detailed release notes](https://jira.apache.org/jira/secure/ReleaseNote.jspa?projectId=12319527&version=12345166). diff --git a/website/src/_posts/2019-07-31-beam-2.14.0.md b/website/www/site/content/en/blog/beam-2.14.0.md similarity index 96% rename from website/src/_posts/2019-07-31-beam-2.14.0.md rename to website/www/site/content/en/blog/beam-2.14.0.md index 0f00e7195b46..3bd1292f8d1c 100644 --- a/website/src/_posts/2019-07-31-beam-2.14.0.md +++ b/website/www/site/content/en/blog/beam-2.14.0.md @@ -1,11 +1,10 @@ --- -layout: post title: "Apache Beam 2.14.0" date: 2019-07-31 00:00:01 -0800 -# If date above changes, still keep the old URL: -permalink: /blog/2019/07/31/beam-2.14.0.html -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2019/07/31/beam-2.14.0.html authors: - anton - altay @@ -26,7 +25,7 @@ limitations under the License. --> We are happy to present the new 2.14.0 release of Beam. This release includes both improvements and new functionality. -See the [download page]({{ site.baseurl }}/get-started/downloads/#2140-2019-08-01) for this release. +See the [download page](/get-started/downloads/#2140-2019-08-01) for this release. For more information on changes in 2.14.0, check out the [detailed release notes](https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12319527&version=12345431). diff --git a/website/src/_posts/2019-08-22-beam-2.15.0.md b/website/www/site/content/en/blog/beam-2.15.0.md similarity index 94% rename from website/src/_posts/2019-08-22-beam-2.15.0.md rename to website/www/site/content/en/blog/beam-2.15.0.md index 474cf131fce1..8392899f2ee5 100644 --- a/website/src/_posts/2019-08-22-beam-2.15.0.md +++ b/website/www/site/content/en/blog/beam-2.15.0.md @@ -1,11 +1,10 @@ --- -layout: post title: "Apache Beam 2.15.0" date: 2019-08-22 00:00:01 -0800 -# Date above corrected but keep the old URL: -permalink: /blog/2019/08/22/beam-2.15.0.html -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2019/08/22/beam-2.15.0.html authors: - yifanzou @@ -25,7 +24,7 @@ limitations under the License. --> We are happy to present the new 2.15.0 release of Beam. This release includes both improvements and new functionality. -See the [download page]({{ site.baseurl }}/get-started/downloads/#2150-2019-08-22) for this release. +See the [download page](/get-started/downloads/#2150-2019-08-22) for this release. For more information on changes in 2.15.0, check out the [detailed release notes](https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12319527&version=12345489). 
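The hunks above all apply the same mechanical rewrite: Jekyll-style links of the form `[text]({{ site.baseurl }}/path)` become root-relative `[text](/path)` links, which Hugo resolves against the configured `baseURL`. A minimal sketch of that rewrite in Python — illustrative only, with a hypothetical helper name, not a script shipped in this patch:

```python
import re
from pathlib import Path

# Matches "{{ site.baseurl }}" (with optional spaces inside the braces).
SITE_BASEURL = re.compile(r"\{\{\s*site\.baseurl\s*\}\}")

def strip_site_baseurl(markdown: str) -> str:
    """Turn '[text]({{ site.baseurl }}/path)' into root-relative '[text](/path)'."""
    return SITE_BASEURL.sub("", markdown)

if __name__ == "__main__":
    # Content root as laid out by this migration.
    for post in Path("website/www/site/content/en").rglob("*.md"):
        post.write_text(strip_site_baseurl(post.read_text(encoding="utf-8")), encoding="utf-8")
```

Because anchors such as `#2100-2019-02-01` sit after the path, they survive the substitution unchanged.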
diff --git a/website/src/_posts/2019-10-07-beam-2.16.0.md b/website/www/site/content/en/blog/beam-2.16.0.md similarity index 96% rename from website/src/_posts/2019-10-07-beam-2.16.0.md rename to website/www/site/content/en/blog/beam-2.16.0.md index 935604487592..e26dafe3c0cb 100644 --- a/website/src/_posts/2019-10-07-beam-2.16.0.md +++ b/website/www/site/content/en/blog/beam-2.16.0.md @@ -1,11 +1,10 @@ --- -layout: post title: "Apache Beam 2.16.0" date: 2019-10-07 00:00:01 -0800 -# Date above corrected but keep the old URL: -permalink: /blog/2019/10/07/beam-2.16.0.html -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2019/10/07/beam-2.16.0.html authors: - markliu @@ -25,7 +24,7 @@ limitations under the License. --> We are happy to present the new 2.16.0 release of Beam. This release includes both improvements and new functionality. -See the [download page]({{ site.baseurl }}/get-started/downloads/#2160-2019-10-07) for this release. +See the [download page](/get-started/downloads/#2160-2019-10-07) for this release. For more information on changes in 2.16.0, check out the [detailed release notes](https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12319527&version=12345494). diff --git a/website/src/_posts/2020-01-06-beam-2.17.0.md b/website/www/site/content/en/blog/beam-2.17.0.md similarity index 94% rename from website/src/_posts/2020-01-06-beam-2.17.0.md rename to website/www/site/content/en/blog/beam-2.17.0.md index 5c7caeb16477..d256b5e4986a 100644 --- a/website/src/_posts/2020-01-06-beam-2.17.0.md +++ b/website/www/site/content/en/blog/beam-2.17.0.md @@ -1,11 +1,10 @@ --- -layout: post title: "Apache Beam 2.17.0" date: 2020-01-06 00:00:01 -0800 -# Date above corrected but keep the old URL: -permalink: /blog/2020/01/06/beam-2.17.0.html -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2020/01/06/beam-2.17.0.html authors: - ardagan @@ -25,9 +24,9 @@ limitations under the License. --> We are happy to present the new 2.17.0 release of Beam. This release includes both improvements and new functionality. -Users of the MongoDbIO connector are encouraged to upgrade to this release to address a [security vulnerability]({{ site.baseurl }}/security/CVE-2020-1929/). +Users of the MongoDbIO connector are encouraged to upgrade to this release to address a [security vulnerability](/security/CVE-2020-1929/). -See the [download page]({{ site.baseurl }}/get-started/downloads/#2170-2020-01-06) for this release. +See the [download page](/get-started/downloads/#2170-2020-01-06) for this release. For more information on changes in 2.17.0, check out the [detailed release notes](https://issues.apache.org/jira/secure/ReleaseNote.jspa?version=12345970&projectId=12319527). diff --git a/website/src/_posts/2020-01-13-beam-2.18.0.md b/website/www/site/content/en/blog/beam-2.18.0.md similarity index 97% rename from website/src/_posts/2020-01-13-beam-2.18.0.md rename to website/www/site/content/en/blog/beam-2.18.0.md index 2f6b68aa7a3d..e9275f92c2de 100644 --- a/website/src/_posts/2020-01-13-beam-2.18.0.md +++ b/website/www/site/content/en/blog/beam-2.18.0.md @@ -1,11 +1,11 @@ --- -layout: post title: "Apache Beam 2.18.0" date: 2020-01-23 00:00:01 -0800 +categories: + - blog # Date above corrected but keep the old URL: -permalink: /blog/2020/01/13/beam-2.18.0.html -excerpt_separator: -categories: blog +aliases: + - /blog/2020/01/13/beam-2.18.0.html authors: - udim - altay @@ -26,7 +26,7 @@ limitations under the License. 
--> We are happy to present the new 2.18.0 release of Beam. This release includes both improvements and new functionality. -See the [download page]({{ site.baseurl }}/get-started/downloads/#2180-2020-01-23) for this release. +See the [download page](/get-started/downloads/#2180-2020-01-23) for this release. For more information on changes in 2.18.0, check out the [detailed release notes](https://issues.apache.org/jira/secure/ReleaseNote.jspa?version=12346383&projectId=12319527). diff --git a/website/src/_posts/2020-02-04-beam-2.19.0.md b/website/www/site/content/en/blog/beam-2.19.0.md similarity index 96% rename from website/src/_posts/2020-02-04-beam-2.19.0.md rename to website/www/site/content/en/blog/beam-2.19.0.md index 31ffb4762e0e..8686ddee5012 100644 --- a/website/src/_posts/2020-02-04-beam-2.19.0.md +++ b/website/www/site/content/en/blog/beam-2.19.0.md @@ -1,11 +1,10 @@ --- -layout: post title: "Apache Beam 2.19.0" date: 2020-02-04 00:00:01 -0800 -# Date above corrected but keep the old URL: -permalink: /blog/2020/02/04/beam-2.19.0.html -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2020/02/04/beam-2.19.0.html authors: - boyuanzz @@ -25,7 +24,7 @@ limitations under the License. --> We are happy to present the new 2.19.0 release of Beam. This release includes both improvements and new functionality. -See the [download page]({{ site.baseurl }}/get-started/downloads/#2190-2020-02-04) for this release. +See the [download page](/get-started/downloads/#2190-2020-02-04) for this release. For more information on changes in 2.19.0, check out the [detailed release notes](https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12319527&version=12346582). diff --git a/website/src/_posts/2020-04-15-beam-2.20.0.md b/website/www/site/content/en/blog/beam-2.20.0.md similarity index 96% rename from website/src/_posts/2020-04-15-beam-2.20.0.md rename to website/www/site/content/en/blog/beam-2.20.0.md index a736f29e13e8..d214484e07cd 100644 --- a/website/src/_posts/2020-04-15-beam-2.20.0.md +++ b/website/www/site/content/en/blog/beam-2.20.0.md @@ -1,14 +1,14 @@ --- -layout: post title: "Apache Beam 2.20.0" date: 2020-04-15 00:00:01 -0800 # Date above corrected but keep the old URL: -permalink: /blog/2020/04/15/beam-2.20.0.html -excerpt_separator: -categories: blog +aliases: + - /blog/2020/03/01/beam-2.20.0.html + - /blog/2020/04/15/beam-2.20.0.html +categories: + - blog authors: - amaliujia - --- We are happy to present the new 2.20.0 release of Beam. This release includes both improvements and new functionality. -See the [download page]({{ site.baseurl }}/get-started/downloads/#2190-2020-02-04) for this release. +See the [download page](/get-started/downloads/#2190-2020-02-04) for this release. For more information on changes in 2.20.0, check out the [detailed release notes](https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12319527&version=12346780). 
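Each post's front matter is converted the same way: `layout: post` and the empty `excerpt_separator:` are dropped, the space-separated `categories:` value becomes a YAML list, and `permalink:` (where present) turns into an `aliases:` entry so the previously published URL keeps resolving. A sketch of that transformation, assuming PyYAML and a hypothetical helper name — not the tooling used for this migration:

```python
import yaml  # assumed dependency, for illustration only

def convert_front_matter(front: dict) -> dict:
    """Reshape Jekyll front matter into the Hugo form used by the migrated posts."""
    front.pop("layout", None)             # Hugo infers the layout from the section
    front.pop("excerpt_separator", None)  # unused by Hugo
    categories = front.get("categories")
    if isinstance(categories, str):       # "beam update website" -> ["beam", "update", "website"]
        front["categories"] = categories.split()
    old_url = front.pop("permalink", None)
    if old_url:                           # keep the previously published URL reachable
        front.setdefault("aliases", []).append(old_url)
    return front

if __name__ == "__main__":
    jekyll = yaml.safe_load("""
layout: post
title: "Apache Beam 2.18.0"
categories: blog
permalink: /blog/2020/01/13/beam-2.18.0.html
""")
    print(yaml.safe_dump(convert_front_matter(jekyll), sort_keys=False))
```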
diff --git a/website/src/_posts/2018-02-19-beam-2.3.0.md b/website/www/site/content/en/blog/beam-2.3.0.md similarity index 98% rename from website/src/_posts/2018-02-19-beam-2.3.0.md rename to website/www/site/content/en/blog/beam-2.3.0.md index e5d9aed23d47..4f12c77a45db 100644 --- a/website/src/_posts/2018-02-19-beam-2.3.0.md +++ b/website/www/site/content/en/blog/beam-2.3.0.md @@ -1,9 +1,10 @@ --- -layout: post title: "Apache Beam 2.3.0" date: 2018-02-19 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2018/02/19/beam-2.3.0.html authors: - iemejia --- diff --git a/website/src/_posts/2018-06-26-beam-2.5.0.md b/website/www/site/content/en/blog/beam-2.5.0.md similarity index 97% rename from website/src/_posts/2018-06-26-beam-2.5.0.md rename to website/www/site/content/en/blog/beam-2.5.0.md index fe6d3baf1b73..631780302d5b 100644 --- a/website/src/_posts/2018-06-26-beam-2.5.0.md +++ b/website/www/site/content/en/blog/beam-2.5.0.md @@ -1,9 +1,10 @@ --- -layout: post title: "Apache Beam 2.5.0" date: 2018-06-26 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2018/06/26/beam-2.5.0.html authors: - aromanenko --- @@ -28,7 +29,7 @@ please check the detailed release notes. # New Features / Improvements ## Go SDK support -The Go SDK has been officially accepted into the project, after an incubation period and community effort. Go pipelines run on Dataflow runner. More details are [here]({{ site.baseurl }}/documentation/sdks/go/). +The Go SDK has been officially accepted into the project, after an incubation period and community effort. Go pipelines run on Dataflow runner. More details are [here](/documentation/sdks/go/). ## Parquet support Support for Apache Parquet format was added. It uses Parquet 1.10 release which, thanks to AvroParquerWriter's API changes, allows FileIO.Sink implementation. diff --git a/website/src/_posts/2018-08-10-beam-2.6.0.md b/website/www/site/content/en/blog/beam-2.6.0.md similarity index 97% rename from website/src/_posts/2018-08-10-beam-2.6.0.md rename to website/www/site/content/en/blog/beam-2.6.0.md index dbc2bcf0e33f..8ca7be932a5d 100644 --- a/website/src/_posts/2018-08-10-beam-2.6.0.md +++ b/website/www/site/content/en/blog/beam-2.6.0.md @@ -1,9 +1,10 @@ --- -layout: post title: "Apache Beam 2.6.0" date: 2018-08-10 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2018/08/10/beam-2.6.0.html authors: - pabloem - rfernand diff --git a/website/src/_posts/2018-10-03-beam-2.7.0.md b/website/www/site/content/en/blog/beam-2.7.0.md similarity index 90% rename from website/src/_posts/2018-10-03-beam-2.7.0.md rename to website/www/site/content/en/blog/beam-2.7.0.md index 8f386f6c548a..319e549eaf67 100644 --- a/website/src/_posts/2018-10-03-beam-2.7.0.md +++ b/website/www/site/content/en/blog/beam-2.7.0.md @@ -1,9 +1,10 @@ --- -layout: post title: "Apache Beam 2.7.0" date: 2018-10-03 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2018/10/03/beam-2.7.0.html authors: - ccy @@ -23,7 +24,7 @@ limitations under the License. --> We are happy to present the new 2.7.0 release of Beam. This release includes both improvements and new functionality. -See the [download page]({{ site.baseurl }}/get-started/downloads/#270-lts-2018-10-02) for this release. +See the [download page](/get-started/downloads/#270-lts-2018-10-02) for this release. 
For more information on changes in 2.7.0, check out the [detailed release notes](https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12319527&version=12343654). @@ -44,8 +45,7 @@ For more information on changes in 2.7.0, check out the ### Portability * Experimental support for Python on local Flink runner for simple -examples, see latest information here: -{{ site.baseurl }}/contribute/portability/#status. +examples, see latest information [here](/contribute/portability/#status). ## Miscellaneous Fixes diff --git a/website/src/_posts/2018-10-29-beam-2.8.0.md b/website/www/site/content/en/blog/beam-2.8.0.md similarity index 95% rename from website/src/_posts/2018-10-29-beam-2.8.0.md rename to website/www/site/content/en/blog/beam-2.8.0.md index 701cb78b951e..400f249df416 100644 --- a/website/src/_posts/2018-10-29-beam-2.8.0.md +++ b/website/www/site/content/en/blog/beam-2.8.0.md @@ -1,9 +1,10 @@ --- -layout: post title: "Apache Beam 2.8.0" date: 2018-10-29 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2018/10/29/beam-2.8.0.html authors: - altay @@ -23,7 +24,7 @@ limitations under the License. --> We are happy to present the new 2.8.0 release of Beam. This release includes both improvements and new functionality. -See the [download page]({{ site.baseurl }}/get-started/downloads/#280-2018-10-26) for this release. +See the [download page](/get-started/downloads/#280-2018-10-26) for this release. For more information on changes in 2.8.0, check out the [detailed release notes](https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12319527&version=12343985). diff --git a/website/src/_posts/2018-12-13-beam-2.9.0.md b/website/www/site/content/en/blog/beam-2.9.0.md similarity index 95% rename from website/src/_posts/2018-12-13-beam-2.9.0.md rename to website/www/site/content/en/blog/beam-2.9.0.md index 5f1bfc72c4da..f046b5ba31e4 100644 --- a/website/src/_posts/2018-12-13-beam-2.9.0.md +++ b/website/www/site/content/en/blog/beam-2.9.0.md @@ -1,9 +1,10 @@ --- -layout: post title: "Apache Beam 2.9.0" date: 2018-12-13 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2018/12/13/beam-2.9.0.html authors: - chamikara @@ -23,7 +24,7 @@ limitations under the License. --> We are happy to present the new 2.9.0 release of Beam. This release includes both improvements and new functionality. -See the [download page]({{ site.baseurl }}/get-started/downloads/#290-2018-12-13) for this release. +See the [download page](/get-started/downloads/#290-2018-12-13) for this release. For more information on changes in 2.9.0, check out the [detailed release notes](https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12319527&version=12344258). 
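The renames themselves follow one rule: `website/src/_posts/YYYY-MM-DD-slug.md` moves to `website/www/site/content/en/blog/slug.md`, and the old dated Jekyll URL is preserved as an `aliases:` entry. For most posts that alias can be read straight off the old filename; posts whose publication date was later corrected keep their original URL via the former `permalink:` instead. A small sketch of the filename-based case, purely illustrative and not part of the patch:

```python
import re
from pathlib import Path

JEKYLL_POST = re.compile(r"^(\d{4})-(\d{2})-(\d{2})-(.+)\.(?:md|markdown)$")

def hugo_name_and_alias(jekyll_path: str) -> tuple:
    """Map a Jekyll _posts filename to the new Hugo filename and its legacy-URL alias."""
    m = JEKYLL_POST.match(Path(jekyll_path).name)
    if m is None:
        raise ValueError(f"not a dated Jekyll post: {jekyll_path}")
    year, month, day, slug = m.groups()
    return f"{slug}.md", f"/blog/{year}/{month}/{day}/{slug}.html"

if __name__ == "__main__":
    print(hugo_name_and_alias("website/src/_posts/2018-10-03-beam-2.7.0.md"))
    # ('beam-2.7.0.md', '/blog/2018/10/03/beam-2.7.0.html')
```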
diff --git a/website/src/_posts/2018-01-09-beam-a-look-back.md b/website/www/site/content/en/blog/beam-a-look-back.md similarity index 95% rename from website/src/_posts/2018-01-09-beam-a-look-back.md rename to website/www/site/content/en/blog/beam-a-look-back.md index 5c58bc9d6502..87d1bc217ee6 100644 --- a/website/src/_posts/2018-01-09-beam-a-look-back.md +++ b/website/www/site/content/en/blog/beam-a-look-back.md @@ -1,9 +1,10 @@ --- -layout: post title: "Apache Beam: A Look Back at 2017" date: 2018-01-09 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2018/01/09/beam-a-look-back.html authors: - ianand - jbonofre @@ -22,7 +23,7 @@ See the License for the specific language governing permissions and limitations under the License. --> -On January 10, 2017, Apache Beam got [promoted]({{ site.baseurl }}/blog/2017/01/10/beam-graduates.html) +On January 10, 2017, Apache Beam got [promoted](/blog/2017/01/10/beam-graduates.html) as a Top-Level Apache Software Foundation project. It was an important milestone that validated the value of the project, legitimacy of its community, and heralded its growing adoption. In the past year, Apache Beam has been on a @@ -39,7 +40,7 @@ of diverse use cases. Here are some use cases that exemplify the versatility of Beam. Use Cases @@ -52,7 +53,7 @@ new features and fixes. The most obvious and encouraging sign of the growth of Apache Beam’s community, and validation of its core value proposition of portability, is the addition of -significant new [runners]({{ site.baseurl }}/documentation/runners/capability-matrix/) +significant new [runners](/documentation/runners/capability-matrix/) (i.e. execution engines). We entered 2017 with Apache Flink, Apache Spark 1.x, Google Cloud Dataflow, Apache Apex, and Apache Gearpump. In 2017, the following new and updated runners were developed: diff --git a/website/src/_posts/2017-05-17-beam-first-stable-release.md b/website/www/site/content/en/blog/beam-first-stable-release.md similarity index 98% rename from website/src/_posts/2017-05-17-beam-first-stable-release.md rename to website/www/site/content/en/blog/beam-first-stable-release.md index 440d0e8dd48a..9f3dc8669326 100644 --- a/website/src/_posts/2017-05-17-beam-first-stable-release.md +++ b/website/www/site/content/en/blog/beam-first-stable-release.md @@ -1,9 +1,10 @@ --- -layout: post title: "Apache Beam publishes the first stable release" date: 2017-05-17 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2017/05/17/beam-first-stable-release.html authors: - davor - dhalperi diff --git a/website/src/_posts/2017-01-10-beam-graduates.md b/website/www/site/content/en/blog/beam-graduates.md similarity index 96% rename from website/src/_posts/2017-01-10-beam-graduates.md rename to website/www/site/content/en/blog/beam-graduates.md index 3939e886ef5d..0a6a0a6ac21f 100644 --- a/website/src/_posts/2017-01-10-beam-graduates.md +++ b/website/www/site/content/en/blog/beam-graduates.md @@ -1,9 +1,10 @@ --- -layout: post title: "Apache Beam established as a new top-level project" date: 2017-01-10 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2017/01/10/beam-graduates.html authors: - davor --- @@ -73,6 +74,6 @@ and a member of the project management committee. Please consider joining us, whether as a user or a contributor, as we work towards our first release with API stability. 
If you’d like to try out Apache Beam today, check out the latest -[0.4.0 release]({{ site.baseurl }}/get-started/downloads/). We welcome +[0.4.0 release](/get-started/downloads/). We welcome contribution and participation from anyone through our mailing lists, issue tracker, pull requests, and events. diff --git a/website/src/_posts/2016-02-22-beam-has-a-logo.markdown b/website/www/site/content/en/blog/beam-has-a-logo.md similarity index 89% rename from website/src/_posts/2016-02-22-beam-has-a-logo.markdown rename to website/www/site/content/en/blog/beam-has-a-logo.md index fe15309993eb..377bbe90e246 100644 --- a/website/src/_posts/2016-02-22-beam-has-a-logo.markdown +++ b/website/www/site/content/en/blog/beam-has-a-logo.md @@ -1,11 +1,14 @@ --- -layout: post title: "Apache Beam has a logo!" date: 2016-02-22 10:21:48 -0800 -excerpt_separator: -categories: beam update website +categories: + - beam + - update + - website +aliases: + - /beam/update/website/2016/02/22/beam-has-a-logo.html authors: -- jamesmalone + - jamesmalone --- -categories: blog +categories: + - blog +aliases: + - /blog/2019/05/30/beam-kata-release.html authors: - henryken @@ -52,8 +53,8 @@ useful for people to learn more about Apache Beam, and eventually become Beam Ma
    -Beam Kata - IntelliJ Edu -Beam Kata - IntelliJ Edu +Beam Kata - IntelliJ Edu +Beam Kata - IntelliJ Edu -Beam Kata - PyCharm Edu -Beam Kata - PyCharm Edu +Beam Kata - PyCharm Edu +Beam Kata - PyCharm Edu diff --git a/website/src/_posts/2019-04-25-beam-kotlin.md b/website/www/site/content/en/blog/beam-kotlin.md similarity index 86% rename from website/src/_posts/2019-04-25-beam-kotlin.md rename to website/www/site/content/en/blog/beam-kotlin.md index 1969deb1e0e4..8b4cefaa06c7 100644 --- a/website/src/_posts/2019-04-25-beam-kotlin.md +++ b/website/www/site/content/en/blog/beam-kotlin.md @@ -1,9 +1,10 @@ --- -layout: post title: "Apache Beam + Kotlin = ❤️" date: 2019-04-25 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2019/04/25/beam-kotlin.html authors: - harshithdwivedi @@ -27,7 +28,7 @@ Apache Beam samples are now available in Kotlin! -Kotlin +Kotlin If you are someone who's been working with Java in your professional career; there's a good chance that you've also heard of [Kotlin](https://kotlinlang.org/), which is an Open Sourced, statically typed language for JVM and is mostly being favoured by Android Developers due to the many myriad features which enable more concise and cleaner code than Java without sacrificing performance or safety. @@ -41,68 +42,68 @@ Here are few brief snippets of code that show how the Kotlin Samples compare to ### Java -```java +{{< highlight java >}} String filename = String.format( "%s-%s-of-%s%s", filenamePrefixForWindow(intervalWindow), shardNumber, numShards, outputFileHints.suggestedFilenameSuffix); -``` +{{< /highlight >}} ### Kotlin -```java +{{< highlight java >}} // String templating val filename = "$filenamePrefixForWindow(intervalWindow)-$shardNumber-of-$numShards${outputFileHints.suggestedFilenameSuffix)" -``` +{{< /highlight >}} ### Java -```java +{{< highlight java >}} public static class FormatAsTextFn extends SimpleFunction, String> { @Override public String apply(KV input) { return input.getKey() + ": " + input.getValue(); } } -``` +{{< /highlight >}} ## Kotlin -```java +{{< highlight java >}} public class FormatAsTextFn : SimpleFunction, String>() { override fun apply(input: KV) = "${input.key} : ${input.value}" //Single line functions } -``` +{{< /highlight >}} ### Java -```java +{{< highlight java >}} if(tableRow != null){ formatAndInsert(tableRow); } -``` +{{< /highlight >}} ### Kotlin -```java +{{< highlight java >}} tableRow?.let{ formatAndInsert(it) // No need for null checks } -``` +{{< /highlight >}} ### Java -```java +{{< highlight java >}} String tableName = "testTable"; -``` +{{< /highlight >}} ### Kotlin -```java +{{< highlight java >}} val tableName = "testTable" // Type inferencing -``` +{{< /highlight >}} ## Contributors Welcomed! 
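The Kotlin post above also shows the other recurring rewrite: fenced code blocks (triple backticks with a language tag) become Hugo `{{< highlight java >}} … {{< /highlight >}}` shortcode pairs, and the fences' original `java` tag is kept even for the Kotlin snippets. A sketch of such a fence-to-shortcode conversion, again illustrative rather than the tooling used here:

```python
import re

FENCE = re.compile(r"^```(\w+)?\s*$")

def fences_to_highlight(markdown: str) -> str:
    """Replace paired triple-backtick fences with Hugo 'highlight' shortcodes."""
    out, inside = [], False
    for line in markdown.splitlines():
        m = FENCE.match(line)
        if m and not inside:
            out.append("{{< highlight %s >}}" % (m.group(1) or "text"))
            inside = True
        elif m and inside:
            out.append("{{< /highlight >}}")
            inside = False
        else:
            out.append(line)
    return "\n".join(out)

if __name__ == "__main__":
    sample = "```java\nString tableName = \"testTable\";\n```"
    print(fences_to_highlight(sample))
```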
diff --git a/website/src/_posts/2018-10-30-beam-summit-aftermath.md b/website/www/site/content/en/blog/beam-summit-aftermath.md similarity index 99% rename from website/src/_posts/2018-10-30-beam-summit-aftermath.md rename to website/www/site/content/en/blog/beam-summit-aftermath.md index 7c3087655b12..89c0e3d28d3f 100644 --- a/website/src/_posts/2018-10-30-beam-summit-aftermath.md +++ b/website/www/site/content/en/blog/beam-summit-aftermath.md @@ -1,9 +1,10 @@ --- -layout: post title: "Inaugural edition of the Beam Summit Europe 2018 - aftermath" date: 2018-10-31 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2018/10/31/beam-summit-aftermath.html authors: - mbaetens --- @@ -108,4 +109,4 @@ We are also gathering feedback and thoughts on the Summit - please add your thou Overall, we hope our attendees enjoyed this first edition of our summit and want to thank **our sponsors Google, Datatonic, Vente-Exclusive** to make this possible. - \ No newline at end of file + diff --git a/website/src/_posts/2020-05-08-beam-summit-digital-2020.md b/website/www/site/content/en/blog/beam-summit-digital-2020.md similarity index 96% rename from website/src/_posts/2020-05-08-beam-summit-digital-2020.md rename to website/www/site/content/en/blog/beam-summit-digital-2020.md index 468758db7ce5..ed0853c58d58 100644 --- a/website/src/_posts/2020-05-08-beam-summit-digital-2020.md +++ b/website/www/site/content/en/blog/beam-summit-digital-2020.md @@ -2,13 +2,14 @@ layout: post title: "Beam Digital Summit is Coming, and it's Coming Fast!" date: 2020-05-08 00:00:01 -0800 -excerpt_separator: -categories: blog +aliases: + - /blog/2020/05/08/beam-summit-digital-2020.html +categories: + - blog authors: - pedro - mbaetens - mxm - --- Beam Summit Digital 2020 While we would have loved to see all of you in person, we have to accept that 2020 will not be the year for that. So, we are looking at this as an opportunity to have a bigger and more inclusive event, where people who would normally not be able to travel to the summit will now be able to join, learn and share with the rest of the community. diff --git a/website/src/_posts/2019-05-11-beam-summit-europe-2019.md b/website/www/site/content/en/blog/beam-summit-europe-2019.md similarity index 92% rename from website/src/_posts/2019-05-11-beam-summit-europe-2019.md rename to website/www/site/content/en/blog/beam-summit-europe-2019.md index ff36c2666ff6..59d2b4cd4d12 100644 --- a/website/src/_posts/2019-05-11-beam-summit-europe-2019.md +++ b/website/www/site/content/en/blog/beam-summit-europe-2019.md @@ -1,9 +1,10 @@ --- -layout: post title: "Beam community update!" date: 2019-05-11 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2019/05/11/beam-summit-europe-2019.html authors: - mbaetens --- @@ -38,13 +39,13 @@ In that spirit, let's have an overview of the things that have happened, what th ## Meetups We've had a flurry of activity, with several meetups in the planning process and more popping up globally over time. As diversity of contributors is a core ASF value, this geographic spread is exciting for the community. Here's a picture from the latest Apache Beam meetup organized at Lyft in San Francisco: -Beam Meetup Bay Area +Beam Meetup Bay Area We have more [Bay Area meetups](https://www.meetup.com/San-Francisco-Apache-Beam) coming soon, and the community is looking into kicking off a meetup in Toronto! 
[London](https://www.meetup.com/London-Apache-Beam-Meetup) had its first meetup of 2019 at the start of April: -Beam Meetup London +Beam Meetup London and [Stockholm](https://www.meetup.com/Apache-Beam-Stockholm/events/260634514) had its second meetup at the start of May: @@ -60,7 +61,7 @@ Even if you can’t travel to these meetups, you can stay informed on the happen ## Summits The first summit of the year will be held in Berlin: -Beam Summit Europe Banner +Beam Summit Europe Banner You can find more info on the [website](https://beamsummit.org) and read about the inaugural edition of the Beam Summit Europe [here](https://beam.apache.org/blog/2018/10/31/beam-summit-aftermath.html). At these summits, you have the opportunity to meet with other Apache Beam creators and users, get expert advice, learn from the speaker sessions, and participate in workshops. @@ -86,3 +87,5 @@ Why are we organizing these summits: * We'd like to give folks a place to meet, congregate, and share ideas. * We know that offline interactions often changes the nature of the online ones in a positive manner. * Building an active and diverse community is part of the Apache Way. These summits provide an opportunity for us to engage people from different locations, companies, and backgrounds. + + diff --git a/website/src/_posts/2018-08-21-beam-summit-europe.md b/website/www/site/content/en/blog/beam-summit-europe.md similarity index 93% rename from website/src/_posts/2018-08-21-beam-summit-europe.md rename to website/www/site/content/en/blog/beam-summit-europe.md index c867c8c75126..5e0a38f9b5fe 100644 --- a/website/src/_posts/2018-08-21-beam-summit-europe.md +++ b/website/www/site/content/en/blog/beam-summit-europe.md @@ -1,9 +1,10 @@ --- -layout: post title: "Beam Summit Europe 2018" date: 2018-08-21 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2018/08/21/beam-summit-europe.html authors: - mbaetens --- @@ -25,7 +26,7 @@ With a growing community of contributors and users, the Apache Beam project is o We are happy to invite you to this event, which will take place in **London** on **October 1st and 2nd of 2018**. -Beam Summit Europe 2018 flyer +Beam Summit Europe 2018 flyer ### What is the Beam Summit 2018? The summit is a 2 day, multi-track event. diff --git a/website/src/_posts/2019-03-18-beam-summit-site.md b/website/www/site/content/en/blog/beam-summit-site.md similarity index 95% rename from website/src/_posts/2019-03-18-beam-summit-site.md rename to website/www/site/content/en/blog/beam-summit-site.md index 8b6dcfa27c64..5a82b2d3796c 100644 --- a/website/src/_posts/2019-03-18-beam-summit-site.md +++ b/website/www/site/content/en/blog/beam-summit-site.md @@ -1,9 +1,10 @@ --- -layout: post title: "Announcing Beam Summit Site" date: 2019-03-18 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2019/03/18/beam-summit-site.html authors: - aizhamal @@ -36,3 +37,4 @@ See you in Berlin! #beamsummit2019. 
+ diff --git a/website/www/site/content/en/blog/capability-matrix.md b/website/www/site/content/en/blog/capability-matrix.md new file mode 100644 index 000000000000..a3e65734eb2b --- /dev/null +++ b/website/www/site/content/en/blog/capability-matrix.md @@ -0,0 +1,40 @@ +--- +title: "Clarifying & Formalizing Runner Capabilities" +date: 2016-03-17 11:00:00 -0700 +categories: + - beam + - capability +aliases: + - /beam/capability/2016/03/17/capability-matrix.html +authors: + - fjp + - takidau +--- + + +With initial code drops complete ([Dataflow SDK and Runner](https://github.com/apache/beam/pull/1), [Flink Runner](https://github.com/apache/beam/pull/12), [Spark Runner](https://github.com/apache/beam/pull/42)) and expressed interest in runner implementations for [Storm](https://issues.apache.org/jira/browse/BEAM-9), [Hadoop](https://issues.apache.org/jira/browse/BEAM-19), and [Gearpump](https://issues.apache.org/jira/browse/BEAM-79) (amongst others), we wanted to start addressing a big question in the Apache Beam (incubating) community: what capabilities will each runner be able to support? + + + +While we’d love to have a world where all runners support the full suite of semantics included in the Beam Model (formerly referred to as the [Dataflow Model](https://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf)), practically speaking, there will always be certain features that some runners can’t provide. For example, a Hadoop-based runner would be inherently batch-based and may be unable to (easily) implement support for unbounded collections. However, that doesn’t prevent it from being extremely useful for a large set of uses. In other cases, the implementations provided by one runner may have slightly different semantics that those provided by another (e.g. even though the current suite of runners all support exactly-once delivery guarantees, an [Apache Samza](https://samza.apache.org/) runner, which would be a welcome addition, would currently only support at-least-once). + +To help clarify things, we’ve been working on enumerating the key features of the Beam model in a [capability matrix](/documentation/runners/capability-matrix/) for all existing runners, categorized around the four key questions addressed by the model: What / Where / When / How (if you’re not familiar with those questions, you might want to read through [Streaming 102](https://oreilly.com/ideas/the-world-beyond-batch-streaming-102) for an overview). This table will be maintained over time as the model evolves, our understanding grows, and runners are created or features added. + +Included below is a summary snapshot of our current understanding of the capabilities of the existing runners (see the [live version](/documentation/runners/capability-matrix/) for full details, descriptions, and Jira links); since integration is still under way, the system as whole isn’t yet in a completely stable, usable state. But that should be changing in the near future, and we’ll be updating loud and clear on this blog when the first supported Beam 1.0 release happens. + +In the meantime, these tables should help clarify where we expect to be in the very near term, and help guide expectations about what existing runners are capable of, and what features runner implementers will be tackling next. 
+ +{{< capability-matrix-common >}} +{{< capability-matrix cap-data="capability-matrix-snapshot" cap-style="cap-summary" cap-view="blog" cap-other-view="full" cap-toggle-details=1 cap-display="block" >}} diff --git a/website/src/_posts/2016-06-15-first-release.md b/website/www/site/content/en/blog/first-release.md similarity index 89% rename from website/src/_posts/2016-06-15-first-release.md rename to website/www/site/content/en/blog/first-release.md index 4efd0491e04f..38f18132ba55 100644 --- a/website/src/_posts/2016-06-15-first-release.md +++ b/website/www/site/content/en/blog/first-release.md @@ -1,9 +1,11 @@ --- -layout: post title: "The first release of Apache Beam!" date: 2016-06-15 00:00:01 -0700 -excerpt_separator: -categories: beam release +categories: + - beam + - release +aliases: + - /beam/release/2016/06/15/first-release.html authors: - davor --- @@ -33,7 +35,7 @@ making them readily available for our users. The initial release includes the SDK for Java, along with three runners: Apache Flink, Apache Spark and Google Cloud Dataflow, a fully-managed cloud service. The release is available both in the [Maven Central Repository](https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.beam%22), -as well as a download from the [project’s website]({{ site.baseurl }}/get-started/downloads/). +as well as a download from the [project’s website](/get-started/downloads/). The goal of this release was process-oriented. In particular, the Beam community wanted to release existing functionality to our users, build and @@ -48,5 +50,5 @@ anticipated, perhaps one every 1-2 months. As always, the Beam community welcomes feedback. Stabilization, usability and the developer experience will be our focus for the next several months. If you have any comments or discover any issues, I’d like to invite you to reach out -to us via [user’s mailing list]({{ site.baseurl }}/get-started/support/) or the +to us via [user’s mailing list](/get-started/support/) or the [Apache JIRA issue tracker](https://issues.apache.org/jira/browse/BEAM/). 
diff --git a/website/src/_posts/2016-06-13-flink-batch-runner-milestone.md b/website/www/site/content/en/blog/flink-batch-runner-milestone.md similarity index 98% rename from website/src/_posts/2016-06-13-flink-batch-runner-milestone.md rename to website/www/site/content/en/blog/flink-batch-runner-milestone.md index 4156d6eae761..ef3ea15ebf20 100644 --- a/website/src/_posts/2016-06-13-flink-batch-runner-milestone.md +++ b/website/www/site/content/en/blog/flink-batch-runner-milestone.md @@ -1,9 +1,10 @@ --- -layout: post title: "How We Added Windowing to the Apache Flink Batch Runner" date: 2016-06-13 09:00:00 -0700 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2016/06/13/flink-batch-runner-milestone.html authors: - aljoscha --- diff --git a/website/src/_posts/2017-02-01-graduation-media-recap.md b/website/www/site/content/en/blog/graduation-media-recap.md similarity index 95% rename from website/src/_posts/2017-02-01-graduation-media-recap.md rename to website/www/site/content/en/blog/graduation-media-recap.md index 482e86c29989..8a3ae8177633 100644 --- a/website/src/_posts/2017-02-01-graduation-media-recap.md +++ b/website/www/site/content/en/blog/graduation-media-recap.md @@ -1,9 +1,10 @@ --- -layout: post title: "Media recap of the Apache Beam graduation" date: 2017-02-01 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2017/02/01/graduation-media-recap.html authors: - davor --- @@ -23,7 +24,7 @@ limitations under the License. One year ago today Apache Beam was accepted into incubation at the Apache Software Foundation. The community's work over the past year culminated, just -over three weeks ago, with an [announcement]({{ site.baseurl }}/blog/2017/01/10/beam-graduates.html) +over three weeks ago, with an [announcement](/blog/2017/01/10/beam-graduates.html) that Apache Beam has successfully graduated as a new Top-Level Project at the foundation. Graduation sparked an additional interest in the project, from corporate endorsements, news articles, interviews, to the volume of traffic to @@ -60,6 +61,6 @@ several times larger than before graduation. Hopefully these perspectives entice you to join us on this exciting ride, either as a user or a contributor, as we work towards our first release with API stability. If you’d like to try out Apache Beam today, check out the latest -[0.4.0 release]({{ site.baseurl }}/get-started/downloads/). We welcome +[0.4.0 release](/get-started/downloads/). We welcome contribution and participation from anyone through our mailing lists, issue tracker, pull requests, and events. diff --git a/website/src/_posts/2019-09-04-gsoc-19.md b/website/www/site/content/en/blog/gsoc-19.md similarity index 97% rename from website/src/_posts/2019-09-04-gsoc-19.md rename to website/www/site/content/en/blog/gsoc-19.md index 8fa16c6c37d7..930e3dbf7017 100644 --- a/website/src/_posts/2019-09-04-gsoc-19.md +++ b/website/www/site/content/en/blog/gsoc-19.md @@ -1,12 +1,13 @@ --- -layout: post title: "Google Summer of Code '19" date: 2019-09-04 00:00:01 -0800 -permalink: /blog/2019/09/04/gsoc-19.html -excerpt_separator: -categories: blog gsoc +categories: + - blog + - gsoc +aliases: + - /blog/2019/09/04/gsoc-19.html authors: -- ttanay + - ttanay --- -categories: blog +categories: + - blog +aliases: + - /blog/2019/06/11/looping-timers.html authors: - rez - klk @@ -27,7 +28,7 @@ variety of use cases. 
One specific use case is the analysis of time series data in which continuous sequences across window boundaries are important. A few fun challenges arise as you tackle this type of data and in this blog we will explore one of those in more detail and make use of the Timer API -([blog post]({{ site.baseurl }}/blog/2017/08/28/timely-processing.html)) +([blog post](/blog/2017/08/28/timely-processing.html)) using the "looping timer" pattern. @@ -172,7 +173,7 @@ So how do timers help? Well let's have a look at a new transform: Edit: Looping Timer State changed from Boolean to Long to allow for min value check. -```java +{{< highlight java >}} public static class LoopingStatefulTimer extends DoFn, KV> { Instant stopTimerTime; @@ -236,7 +237,7 @@ public static class LoopingStatefulTimer extends DoFn, KV}} There are two data values that the state API needs to keep: @@ -275,7 +276,7 @@ In the @OnTimer block, the following occurs: And that's it, let's add our transform back into the pipeline: -```java +{{< highlight java >}} // Apply a fixed window of duration 1 min and Sum the results p.apply(Create.timestamped(time_1, time_2, time_3)).apply( Window.>into(FixedWindows.of(Duration.standardMinutes(1)))) @@ -294,7 +295,7 @@ And that's it, let's add our transform back into the pipeline: } })); -``` +{{< /highlight >}} 1. In the first part of the pipeline we create FixedWindows and reduce the value per key down to a single Sum. @@ -342,7 +343,7 @@ feature sets. You can experiment with this pattern today using the DirectRunner. For other runners, please look out for their release notes on support for dealing with this use case in production. -([Capability Matrix]({{ site.baseurl }}/documentation/runners/capability-matrix/)) +([Capability Matrix](/documentation/runners/capability-matrix/)) Runner specific notes: diff --git a/website/src/_posts/2016-04-03-presentation-materials.md b/website/www/site/content/en/blog/presentation-materials.md similarity index 79% rename from website/src/_posts/2016-04-03-presentation-materials.md rename to website/www/site/content/en/blog/presentation-materials.md index b5f37a54e468..23bde0bbfafe 100644 --- a/website/src/_posts/2016-04-03-presentation-materials.md +++ b/website/www/site/content/en/blog/presentation-materials.md @@ -1,9 +1,11 @@ --- -layout: post title: "Apache Beam Presentation Materials" date: 2016-04-03 11:00:00 -0700 -excerpt_separator: -categories: beam capability +categories: + - beam + - capability +aliases: + - /beam/capability/2016/04/03/presentation-materials.html authors: - fjp - takidau @@ -22,7 +24,7 @@ See the License for the specific language governing permissions and limitations under the License. --> -Are you interested in giving a presentation about Apache Beam? Perhaps you want to talk about Apache Beam at a local Meetup or a convention. Excellent! The Apache Beam community is excited to expand and grow the community. To help kickstart this process, we are excited to announce an initial set of [Apache Beam presentation materials]({{ site.baseurl }}/contribute/presentation-materials/) which anyone can use to give a presentation about Apache Beam. +Are you interested in giving a presentation about Apache Beam? Perhaps you want to talk about Apache Beam at a local Meetup or a convention. Excellent! The Apache Beam community is excited to expand and grow the community. 
To help kickstart this process, we are excited to announce an initial set of [Apache Beam presentation materials](/contribute/presentation-materials/) which anyone can use to give a presentation about Apache Beam. @@ -34,4 +36,4 @@ As a community, we want to build a shared collection of high quality presentatio * Using Apache Beam with runners * [Google Cloud Dataflow](https://goo.gl/2ay8mi) -As Apache Beam grows, so will this repository of presentation materials. We are excited to add new materials as the Apache Beam ecosystem grows with new runners, SDKs, and so on. If you are interested in contributing content or have a request, please see the [Apache Beam presentation materials]({{ site.baseurl }}/contribute/presentation-materials/) page or email the [`user@beam.apache.org`](mailto:user@beam.apache.org) mailing list with your ideas or questions. +As Apache Beam grows, so will this repository of presentation materials. We are excited to add new materials as the Apache Beam ecosystem grows with new runners, SDKs, and so on. If you are interested in contributing content or have a request, please see the [Apache Beam presentation materials](/contribute/presentation-materials/) page or email the [`user@beam.apache.org`](mailto:user@beam.apache.org) mailing list with your ideas or questions. diff --git a/website/src/_posts/2016-02-25-python-sdk-now-public.markdown b/website/www/site/content/en/blog/python-sdk-now-public.md similarity index 93% rename from website/src/_posts/2016-02-25-python-sdk-now-public.markdown rename to website/www/site/content/en/blog/python-sdk-now-public.md index a6129b8f5958..76924f5b8d26 100644 --- a/website/src/_posts/2016-02-25-python-sdk-now-public.markdown +++ b/website/www/site/content/en/blog/python-sdk-now-public.md @@ -1,11 +1,14 @@ --- -layout: post title: "Dataflow Python SDK is now public!" date: 2016-02-25 13:00:00 -0800 -excerpt_separator: -categories: beam python sdk +categories: + - beam + - python + - sdk +aliases: + - /beam/python/sdk/2016/02/25/python-sdk-now-public.html authors: -- jamesmalone + - jamesmalone --- -categories: blog +categories: + - blog +aliases: + - /blog/2017/03/16/python-sdk-release.html authors: - altay --- @@ -21,17 +22,17 @@ See the License for the specific language governing permissions and limitations under the License. --> -Apache Beam’s latest release, version [0.6.0]({{ site.baseurl }}/get-started/downloads/), introduces a new SDK -- this time, for the Python programming language. The Python SDK joins the Java SDK as the second implementation of the Beam programming model. +Apache Beam’s latest release, version [0.6.0](/get-started/downloads/), introduces a new SDK -- this time, for the Python programming language. The Python SDK joins the Java SDK as the second implementation of the Beam programming model. The Python SDK incorporates all of the main concepts of the Beam model, including ParDo, GroupByKey, Windowing, and others. It features extensible IO APIs for writing bounded sources and sinks, and provides built-in implementation for reading and writing Text, Avro, and TensorFlow record files, as well as connectors to Google BigQuery and Google Cloud Datastore. -There are two runners capable of executing pipelines written with the Python SDK today: [Direct Runner]({{ site.baseurl }}/documentation/runners/direct/) and [Dataflow Runner]({{ site.baseurl }}/documentation/runners/dataflow/), both of which are currently limited to batch execution only. 
Upcoming features will shortly bring the benefits of the Python SDK to additional runners. +There are two runners capable of executing pipelines written with the Python SDK today: [Direct Runner](/documentation/runners/direct/) and [Dataflow Runner](/documentation/runners/dataflow/), both of which are currently limited to batch execution only. Upcoming features will shortly bring the benefits of the Python SDK to additional runners. #### Try the Apache Beam Python SDK -If you would like to try out the Python SDK, a good place to start is the [Quickstart]({{ site.baseurl }}/get-started/quickstart-py/). After that, you can take a look at additional [examples](https://github.com/apache/beam/tree/v0.6.0/sdks/python/apache_beam/examples), and deep dive into the [API reference](https://beam.apache.org/releases/pydoc/). +If you would like to try out the Python SDK, a good place to start is the [Quickstart](/get-started/quickstart-py/). After that, you can take a look at additional [examples](https://github.com/apache/beam/tree/v0.6.0/sdks/python/apache_beam/examples), and deep dive into the [API reference](https://beam.apache.org/releases/pydoc/). Let’s take a look at a quick example together. First, install the `apache-beam` package from PyPI and start your Python interpreter. @@ -82,4 +83,4 @@ Both of these improvements will enable the Python SDK to fulfill the mission of #### Join us! -Please consider joining us, whether as a user or a contributor, as we work towards our first release with API stability. If you’d like to try out Apache Beam today, check out the latest [0.6.0]({{ site.baseurl }}/get-started/downloads/) release. We welcome contributions and participation from anyone through our mailing lists, issue tracker, pull requests, and events. +Please consider joining us, whether as a user or a contributor, as we work towards our first release with API stability. If you’d like to try out Apache Beam today, check out the latest [0.6.0](/get-started/downloads/) release. We welcome contributions and participation from anyone through our mailing lists, issue tracker, pull requests, and events. diff --git a/website/src/_posts/2018-08-20-review-input-streaming-connectors.md b/website/www/site/content/en/blog/review-input-streaming-connectors.md similarity index 70% rename from website/src/_posts/2018-08-20-review-input-streaming-connectors.md rename to website/www/site/content/en/blog/review-input-streaming-connectors.md index f2305e11dbbf..7c4f7a912c75 100644 --- a/website/src/_posts/2018-08-20-review-input-streaming-connectors.md +++ b/website/www/site/content/en/blog/review-input-streaming-connectors.md @@ -1,9 +1,10 @@ --- -layout: post title: "A review of input streaming connectors" date: 2018-08-20 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2018/08/20/review-input-streaming-connectors.html authors: - lkuligin - jphalip @@ -22,13 +23,13 @@ See the License for the specific language governing permissions and limitations under the License. --> -In this post, you'll learn about the current state of support for input streaming connectors in [Apache Beam]({{ site.baseurl }}/). For more context, you'll also learn about the corresponding state of support in [Apache Spark](https://spark.apache.org/). +In this post, you'll learn about the current state of support for input streaming connectors in [Apache Beam](/). For more context, you'll also learn about the corresponding state of support in [Apache Spark](https://spark.apache.org/). 
With batch processing, you might load data from any source, including a database system. Even if there are no specific SDKs available for those database systems, you can often resort to using a [JDBC](https://en.wikipedia.org/wiki/Java_Database_Connectivity) driver. With streaming, implementing a proper data pipeline is arguably more challenging as generally fewer source types are available. For that reason, this article particularly focuses on the streaming use case. ## Connectors for Java -Beam has an official [Java SDK]({{ site.baseurl }}/documentation/sdks/java/) and has several execution engines, called [runners]({{ site.baseurl }}/documentation/runners/capability-matrix/). In most cases it is fairly easy to transfer existing Beam pipelines written in Java or Scala to a Spark environment by using the [Spark Runner]({{ site.baseurl }}/documentation/runners/spark/). +Beam has an official [Java SDK](/documentation/sdks/java/) and has several execution engines, called [runners](/documentation/runners/capability-matrix/). In most cases it is fairly easy to transfer existing Beam pipelines written in Java or Scala to a Spark environment by using the [Spark Runner](/documentation/runners/spark/). Spark is written in Scala and has a [Java API](https://spark.apache.org/docs/latest/api/java/). Spark's source code compiles to [Java bytecode](https://en.wikipedia.org/wiki/Java_(programming_language)#Java_JVM_and_Bytecode) and the binaries are run by a [Java Virtual Machine](https://en.wikipedia.org/wiki/Java_virtual_machine). Scala code is interoperable with Java and therefore has native compatibility with Java libraries (and vice versa). @@ -54,7 +55,7 @@ Below are the main streaming input connectors for available for Beam and Spark D Local
    (Using the file:// URI) - TextIO + TextIO textFileStream
    (Spark treats most Unix systems as HDFS-compatible, but the location should be accessible from all nodes) @@ -62,7 +63,7 @@ Below are the main streaming input connectors for available for Beam and Spark D HDFS
    (Using the hdfs:// URI) - FileIO + HadoopFileSystemOptions + FileIO + HadoopFileSystemOptions HdfsUtils @@ -72,7 +73,7 @@ Below are the main streaming input connectors for available for Beam and Spark D Cloud Storage
    (Using the gs:// URI) - FileIO + GcsOptions + FileIO + GcsOptions hadoopConfiguration and textFileStream @@ -81,7 +82,7 @@ and FileIO + S3Options + FileIO + S3Options @@ -89,7 +90,7 @@ and KafkaIO + KafkaIO spark-streaming-kafka @@ -97,7 +98,7 @@ and KinesisIO + KinesisIO spark-streaming-kinesis @@ -105,7 +106,7 @@ and PubsubIO + PubsubIO spark-streaming-pubsub from Apache Bahir @@ -115,7 +116,7 @@ and Read Transforms + Read Transforms receiverStream @@ -124,7 +125,7 @@ and io.textio + io.textio textFileStream @@ -154,7 +155,7 @@ Below are the main streaming input connectors for available for Beam and Spark D HDFS - io.hadoopfilesystem + io.hadoopfilesystem hadoopConfiguration (Access through sc._jsc with Py4J) and textFileStream @@ -165,7 +166,7 @@ and io.gcp.gcsio + io.gcp.gcsio textFileStream @@ -197,7 +198,7 @@ and io.gcp.pubsub + io.gcp.pubsub N/A @@ -207,7 +208,7 @@ and BoundedSource and RangeTracker + BoundedSource and RangeTracker N/A @@ -222,7 +223,7 @@ Since Scala code is interoperable with Java and therefore has native compatibili ### Go -A [Go SDK]({{ site.baseurl }}/documentation/sdks/go/) for Apache Beam is under active development. It is currently experimental and is not recommended for production. Spark does not have an official Go SDK. +A [Go SDK](/documentation/sdks/go/) for Apache Beam is under active development. It is currently experimental and is not recommended for production. Spark does not have an official Go SDK. ### R @@ -234,5 +235,5 @@ We hope this article inspired you to try new and interesting ways of connecting Check out the following links for further information: -* See a full list of all built-in and in-progress [I/O Transforms]({{ site.baseurl }}/documentation/io/built-in/) for Apache Beam. -* Learn about some Apache Beam mobile gaming pipeline [examples]({{ site.baseurl }}/get-started/mobile-gaming-example/). +* See a full list of all built-in and in-progress [I/O Transforms](/documentation/io/built-in/) for Apache Beam. +* Learn about some Apache Beam mobile gaming pipeline [examples](/get-started/mobile-gaming-example/). diff --git a/website/src/_posts/2019-04-19-season-of-docs.md b/website/www/site/content/en/blog/season-of-docs.md similarity index 95% rename from website/src/_posts/2019-04-19-season-of-docs.md rename to website/www/site/content/en/blog/season-of-docs.md index 2d72c94a6567..906228cada95 100644 --- a/website/src/_posts/2019-04-19-season-of-docs.md +++ b/website/www/site/content/en/blog/season-of-docs.md @@ -1,9 +1,10 @@ --- -layout: post title: "Apache Beam is applying to Season of Docs" date: 2019-04-19 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2019/04/19/season-of-docs.html authors: - aizhamal @@ -27,7 +28,7 @@ The Apache Beam community is thrilled to announce its application to the first e -Season of Docs 2019 flyer +Season of Docs 2019 flyer [Season of Docs](https://developers.google.com/season-of-docs/) is a unique program that pairs technical writers with open source mentors to contribute to open source. This creates an opportunity to introduce the technical writer to an open source community and provide guidance while the writer works on a real world open source project. We, in the Apache Beam community, would love to take this chance and invite technical writers to collaborate with us, and help us improve our documentation in many ways. 
diff --git a/website/src/_posts/2016-08-03-six-months.md b/website/www/site/content/en/blog/six-months.md similarity index 88% rename from website/src/_posts/2016-08-03-six-months.md rename to website/www/site/content/en/blog/six-months.md index 599bcb6e4be7..24b773c5ad17 100644 --- a/website/src/_posts/2016-08-03-six-months.md +++ b/website/www/site/content/en/blog/six-months.md @@ -1,9 +1,10 @@ --- -layout: post title: "Apache Beam: Six Months in Incubation" date: 2016-08-03 00:00:01 -0700 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2016/08/03/six-months.html authors: - fjp --- @@ -52,5 +53,5 @@ But perhaps most importantly, we’re committed to building an involved, welcomi * Had multiple talks on Beam at venues including ApacheCon, Hadoop Summit, Kafka Summit, JBCN Barcelona, and Strata. * Presented at multiple existing meetups and are starting to organize some of our own. -While it’s nice to reflect back on all we’ve done, we’re working full _stream_ ahead towards a stable release and graduation from incubator. And we’d love your help -- join the [mailing lists]({{ site.baseurl }}/get-started/support/), check out the [contribution guide]({{ site.baseurl }}/contribute/contribution-guide/), and grab a [starter task](https://issues.apache.org/jira/browse/BEAM-520?jql=project%20%3D%20BEAM%20AND%20resolution%20%3D%20Unresolved%20AND%20labels%20in%20(newbie%2C%20starter)) from Jira! +While it’s nice to reflect back on all we’ve done, we’re working full _stream_ ahead towards a stable release and graduation from incubator. And we’d love your help -- join the [mailing lists](/get-started/support/), check out the [contribution guide](/contribute/contribution-guide/), and grab a [starter task](https://issues.apache.org/jira/browse/BEAM-520?jql=project%20%3D%20BEAM%20AND%20resolution%20%3D%20Unresolved%20AND%20labels%20in%20(newbie%2C%20starter)) from Jira! diff --git a/website/src/_posts/2016-05-18-splitAtFraction-method.md b/website/www/site/content/en/blog/splitAtFraction-method.md similarity index 96% rename from website/src/_posts/2016-05-18-splitAtFraction-method.md rename to website/www/site/content/en/blog/splitAtFraction-method.md index 3d92a9b93c23..0ae5b7346933 100644 --- a/website/src/_posts/2016-05-18-splitAtFraction-method.md +++ b/website/www/site/content/en/blog/splitAtFraction-method.md @@ -1,9 +1,10 @@ --- -layout: post title: "Dynamic work rebalancing for Beam" date: 2016-05-18 11:00:00 -0700 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2016/05/18/splitAtFraction-method.html authors: - dhalperi --- diff --git a/website/src/_posts/2017-08-04-splittable-do-fn.md b/website/www/site/content/en/blog/splittable-do-fn.md similarity index 94% rename from website/src/_posts/2017-08-04-splittable-do-fn.md rename to website/www/site/content/en/blog/splittable-do-fn.md index 9df9005325b4..89a39879f366 100644 --- a/website/src/_posts/2017-08-04-splittable-do-fn.md +++ b/website/www/site/content/en/blog/splittable-do-fn.md @@ -1,9 +1,10 @@ --- -layout: post title: "Powerful and modular IO connectors with Splittable DoFn in Apache Beam" date: 2017-08-16 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2017/08/16/splittable-do-fn.html authors: - jkff --- @@ -24,7 +25,7 @@ limitations under the License. 
One of the most important parts of the Apache Beam ecosystem is its quickly growing set of connectors that allow Beam pipelines to read and write data to various data storage systems ("IOs"). Currently, Beam ships [over 20 IO -connectors]({{ site.baseurl }}/documentation/io/built-in/) with many more in +connectors](/documentation/io/built-in/) with many more in active development. As user demands for IO connectors grew, our work on improving the related Beam APIs (in particular, the Source API) produced an unexpected result: a generalization of Beam's most basic primitive, `DoFn`. @@ -48,7 +49,7 @@ and `ParDo(execute sub-query)`. Some IOs considerably more complicated pipelines. Expansion of the JdbcIO.read() composite transform @@ -78,32 +79,28 @@ result, the pipeline can suffer from poor performance due to stragglers. * In the Kafka example, implementing the second `ParDo` is *simply impossible* with a regular `DoFn`, because it would need to output an infinite number of -records per each input element `topic, partition` *([stateful processing]({{ -site.baseurl }}/blog/2017/02/13/stateful-processing.html) comes close, but it +records per each input element `topic, partition` *([stateful processing](/blog/2017/02/13/stateful-processing.html) comes close, but it has other limitations that make it insufficient for this task*). ## Beam Source API Apache Beam historically provides a Source API -([BoundedSource](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/org/apache/beam/sdk/io/BoundedSource.html) +([BoundedSource](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/org/apache/beam/sdk/io/BoundedSource.html) and -[UnboundedSource](https://beam.apache.org/releases/javadoc/{{ -site.release_latest }}/org/apache/beam/sdk/io/UnboundedSource.html)) which does +[UnboundedSource](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/org/apache/beam/sdk/io/UnboundedSource.html)) which does not have these limitations and allows development of efficient data sources for batch and streaming systems. Pipelines use this API via the -[`Read.from(Source)`](https://beam.apache.org/releases/javadoc/{{ -site.release_latest }}/org/apache/beam/sdk/io/Read.html) built-in `PTransform`. +[`Read.from(Source)`](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/org/apache/beam/sdk/io/Read.html) built-in `PTransform`. The Source API is largely similar to that of most other data processing frameworks, and allows the system to read data in parallel using multiple workers, as well as checkpoint and resume reading from an unbounded data source. 
Additionally, the Beam -[`BoundedSource`](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/org/apache/beam/sdk/io/BoundedSource.html) +[`BoundedSource`](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/org/apache/beam/sdk/io/BoundedSource.html) API provides advanced features such as progress reporting and [dynamic -rebalancing]({{ site.baseurl }}/blog/2016/05/18/splitAtFraction-method.html) +rebalancing](/blog/2016/05/18/splitAtFraction-method.html) (which together enable autoscaling), and -[`UnboundedSource`](https://beam.apache.org/releases/javadoc/{{ -site.release_latest }}/org/apache/beam/sdk/io/UnboundedSource.html) supports +[`UnboundedSource`](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/org/apache/beam/sdk/io/UnboundedSource.html) supports reporting the source's watermark and backlog *(until SDF, we believed that "batch" and "streaming" data sources are fundamentally different and thus require fundamentally different APIs)*. @@ -215,7 +212,7 @@ offset, and `ReadFn` may interpret it as *read records whose starting offsets are in the given range*. Specifying parts of work for an element using restrictions @@ -245,7 +242,7 @@ inf)* to be processed later, effectively checkpointing and resuming the call; this can be repeated forever. Splitting an infinite restriction into a finite primary and infinite residual @@ -261,7 +258,7 @@ following diagram, where "magic" stands for the runner-specific ability to split the restrictions and schedule processing of residuals. Execution of an SDF - pairing with a restriction, splitting
     restrictions, processing element/restriction pairs @@ -314,12 +311,11 @@ If a block is claimed successfully, then the call outputs all records in this data block, otherwise, it terminates. Processing a restriction by claiming blocks inside it -For more details, see [Restrictions, blocks and -positions](https://s.apache.org/splittable-do-fn#heading=h.vjs7pzbb7kw) in the +For more details, see [Restrictions, blocks and positions](https://s.apache.org/splittable-do-fn#heading=h.vjs7pzbb7kw) in the design proposal document. ### Code example @@ -349,7 +345,7 @@ smaller restrictions, and a few others. The "Hello World" of SDF is a counter, which takes pairs *(x, N)* as input and produces pairs *(x, 0), (x, 1), …, (x, N-1)* as output. -```java +{{< highlight java >}} class CountFn extends DoFn, KV> { @ProcessElement public void process(ProcessContext c, OffsetRangeTracker tracker) { @@ -367,9 +363,9 @@ class CountFn extends DoFn, KV> { PCollection> input = …; PCollection> output = input.apply( ParDo.of(new CountFn()); -``` +{{< /highlight >}} -```py +{{< highlight py >}} class CountFn(DoFn): def process(element, tracker=DoFn.RestrictionTrackerParam) for i in xrange(*tracker.current_restriction()): @@ -379,7 +375,7 @@ class CountFn(DoFn): def get_initial_restriction(element): return (0, element[1]) -``` +{{< /highlight >}} This short `DoFn` subsumes the functionality of [CountingSource](https://github.com/apache/beam/blob/master/sdks/java/core/src/main/java/org/apache/beam/sdk/io/CountingSource.java), @@ -401,7 +397,7 @@ A slightly more complex example is the `ReadFn` considered above, which reads data from Avro files and illustrates the idea of *blocks*: we provide pseudocode to illustrate the approach. -```java +{{< highlight java >}} class ReadFn extends DoFn { @ProcessElement void process(ProcessContext c, OffsetRangeTracker tracker) { @@ -427,9 +423,9 @@ class ReadFn extends DoFn { return new OffsetRange(0, new File(filename).getSize()); } } -``` +{{< /highlight >}} -```py +{{< highlight py >}} class AvroReader(DoFn): def process(filename, tracker=DoFn.RestrictionTrackerParam) with fileio.ChannelFactory.open(filename) as file: @@ -449,7 +445,7 @@ class AvroReader(DoFn): def get_initial_restriction(self, filename): return (0, fileio.ChannelFactory.size_in_bytes(filename)) -``` +{{< /highlight >}} This hypothetical `DoFn` reads records from a single Avro file. Notably missing is the code for expanding a filepattern: it no longer needs to be part of this @@ -475,8 +471,7 @@ IO connectors. However, a large amount of work is in progress or planned. As of August 2017, SDF is available for use in the Beam Java Direct runner and Dataflow Streaming runner, and implementation is in progress in the Flink and -Apex runners; see [capability matrix]({{ site.baseurl -}}/documentation/runners/capability-matrix/) for the current status. Support +Apex runners; see [capability matrix](/documentation/runners/capability-matrix/) for the current status. Support for SDF in the Python SDK is [in active development](https://s.apache.org/splittable-do-fn-python). 
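To see where such an SDF would sit in a pipeline, here is a minimal sketch in the Python SDK (illustrative only, given that Python SDF support was still in development when this post was written). It shows the point made above about filepattern expansion: expanding the pattern is now an ordinary transform placed in front of the reading `DoFn`, rather than logic baked into a `Source`. The `AvroReader` name refers to the hypothetical `DoFn` sketched earlier in the post, and the filepattern is a placeholder.

```py
import apache_beam as beam
from apache_beam.io.filesystems import FileSystems

def expand_pattern(pattern):
    # Expand a filepattern into individual file names using Beam's
    # FileSystems abstraction.
    for match_result in FileSystems.match([pattern]):
        for metadata in match_result.metadata_list:
            yield metadata.path

with beam.Pipeline() as p:
    files = (p
        | 'Patterns' >> beam.Create(['/tmp/logs/*.avro'])  # placeholder pattern
        | 'Expand' >> beam.FlatMap(expand_pattern)
        | 'Reshuffle' >> beam.Reshuffle())
    # records = files | 'Read' >> beam.ParDo(AvroReader())  # the SDF from the post
```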
diff --git a/website/src/_posts/2017-02-13-stateful-processing.md b/website/www/site/content/en/blog/stateful-processing.md similarity index 95% rename from website/src/_posts/2017-02-13-stateful-processing.md rename to website/www/site/content/en/blog/stateful-processing.md index 1105b7ce9173..897d29f11f1f 100644 --- a/website/src/_posts/2017-02-13-stateful-processing.md +++ b/website/www/site/content/en/blog/stateful-processing.md @@ -1,9 +1,10 @@ --- -layout: post title: "Stateful processing with Apache Beam" date: 2017-02-13 00:00:01 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2017/02/13/stateful-processing.html authors: - klk --- @@ -36,7 +37,7 @@ snippets!** > **Warning: new features ahead!**: This is a very new aspect of the Beam > model. Runners are still adding support. You can try it out today on multiple > runners, but do check the [runner capability -> matrix]({{ site.baseurl }}/documentation/runners/capability-matrix/) for +> matrix](/documentation/runners/capability-matrix/) for > the current status in each runner. First, a quick recap: In Beam, a big data processing _pipeline_ is a directed, @@ -44,7 +45,7 @@ acyclic graph of parallel operations called _`PTransforms`_ processing data from _`PCollections`_. I'll expand on that by walking through this illustration: A Beam Pipeline - PTransforms are boxes - PCollections are arrows @@ -66,7 +67,7 @@ key of the element. Thus the `GroupByKey`/`CombinePerKey` transform gathers all green squares to produce a single output element. ParDo and GroupByKey/CombinePerKey: 
         Elementwise versus aggregating computations @@ -77,7 +78,7 @@ extension to the Beam programming model: **per-element operation augmented with mutable state**. Stateful ParDo - sequential per-key processing with persistent state @@ -85,7 +86,7 @@ In the illustration above, ParDo now has a bit of durable, consistent state on the side, which can be read and written during the processing of each element. The state is partitioned by key, so it is drawn as having disjoint sections for each color. It is also partitioned per window, but I thought plaid -A plaid storage cylinder would be a bit much :-). I'll talk about why state is partitioned this way a bit later, via my first example. @@ -106,7 +107,7 @@ persistent mutable state while processing each input element. Consider this illustration: Stateful DoFn - 
         the runner controls input but the DoFn controls storage and output @@ -144,7 +145,7 @@ that a runner might invoke it on a per-key basis to build an accumulator and extract an output from the final accumulator: CombineFn - the runner controls input, storage, and output @@ -178,7 +179,7 @@ of a `CombineFn` on a number of inputs and later combine them in a classic divide-and-conquer architecture, as in this picture: Divide-and-conquer aggregation with a CombineFn @@ -210,7 +211,7 @@ SDK, I'll go over this example from the level of the model. In pictures, you want to write a transform that maps input to output like this: Assigning arbitrary but unique indices to each element @@ -243,12 +244,13 @@ the full state of your transform as a table, where the rows are named according to names you use in your program, like `"index"`, and the columns are key+window pairs, like this: +{{< table >}} | | (key, window)1 | (key, window)2 | (key, window)3 | ... | |---------------|---------------------------|---------------------------|---------------------------|-----| | `"index"` | `3` | `7` | `15` | ... | | `"fizzOrBuzz?"` | `"fizz"` | `"7"` | `"fizzbuzz"` | ... | | ... | ... | ... | ... | ... | -{:.table} +{{< /table >}} (if you have a superb spatial sense, feel free to imagine this as a cube where keys and windows are independent dimensions) @@ -276,7 +278,7 @@ write stateful processing code using Beam's Java SDK. Here is the code for a stateful `DoFn` that assigns an arbitrary-but-consistent index to each element on a per key-and-window basis: -```java +{{< highlight java >}} new DoFn, KV>>() { // A state cell holding a single Integer per key+window @@ -293,9 +295,9 @@ new DoFn, KV>>() { index.write(current+1); } } -``` +{{< /highlight >}} -```py +{{< highlight py >}} class IndexAssigningStatefulDoFn(DoFn): INDEX_STATE = CombiningStateSpec('index', sum) @@ -304,7 +306,7 @@ class IndexAssigningStatefulDoFn(DoFn): current_index = index.read() yield (value, current_index) index.add(1) -``` +{{< /highlight >}} Let's dissect this: @@ -356,7 +358,7 @@ If you try to express the building of your model as a `CombineFn`, you may have trouble with `mergeAccumulators`. Assuming you could express that, it might look something like this: -```java +{{< highlight java >}} class ModelFromEventsFn extends CombineFn { @Override public abstract Model createAccumulator() { @@ -377,9 +379,9 @@ class ModelFromEventsFn extends CombineFn { public abstract Model extractOutput(Model accumulator) { return accumulator; } } -``` +{{< /highlight >}} -```py +{{< highlight py >}} class ModelFromEventsFn(apache_beam.core.CombineFn): def create_accumulator(self): @@ -394,7 +396,7 @@ class ModelFromEventsFn(apache_beam.core.CombineFn): def extract_output(self, model): return model -``` +{{< /highlight >}} Now you have a way to compute the model of a particular user for a window as `Combine.perKey(new ModelFromEventsFn())`. How would you apply this model to @@ -404,7 +406,7 @@ elements of a `PCollection` is to read it as a side input to a `ParDo` transform. So you could side input the model and check the stream of events against it, outputting the prediction, like so: -```java +{{< highlight java >}} PCollection> events = ... final PCollectionView> userModels = events @@ -425,9 +427,9 @@ PCollection> predictions = events … c.output(KV.of(userId, model.prediction(event))) … } })); -``` +{{< /highlight >}} -```py +{{< highlight py >}} # Events is a collection of (user, event) pairs. 
events = (p | ReadFromEventSource() | beam.WindowInto(....)) @@ -446,7 +448,7 @@ def event_prediction(user_event, models): # Predictions is a collection of (user, prediction) pairs. predictions = events | beam.Map(event_prediction, user_models) -``` +{{< /highlight >}} In this pipeline, there is just one model emitted by the `Combine.perKey(...)` per user, per window, which is then prepared for side input by the `View.asMap()` @@ -464,7 +466,7 @@ generic Beam feature for managing completeness versus latency tradeoffs. So here is the same pipeline with an added trigger that outputs a new model one second after input arrives: -```java +{{< highlight java >}} PCollection> events = ... PCollectionView> userModels = events @@ -475,9 +477,9 @@ PCollectionView> userModels = events .apply(Combine.perKey(new ModelFromEventsFn())) .apply(View.asMap()); -``` +{{< /highlight >}} -```py +{{< highlight py >}} events = ... user_models = beam.pvalue.AsDict( @@ -487,7 +489,7 @@ user_models = beam.pvalue.AsDict( trigger.AfterCount(1), trigger.AfterProcessingTime(1))) | beam.CombinePerKey(ModelFromEventsFn())) -``` +{{< /highlight >}} This is often a pretty nice tradeoff between latency and cost: If a huge flood of events comes in a second, then you will only emit one new model, so you @@ -509,7 +511,7 @@ Stateful processing lets you address both the latency problem of side inputs and the cost problem of excessive uninteresting output. Here is the code, using only features I have already introduced: -```java +{{< highlight java >}} new DoFn, KV>() { @StateId("model") @@ -540,9 +542,9 @@ new DoFn, KV>() { } } }; -``` +{{< /highlight >}} -```py +{{< highlight py >}} class ModelStatefulFn(beam.DoFn): PREVIOUS_PREDICTION = BagStateSpec('previous_pred_state', PredictionCoder()) @@ -568,7 +570,7 @@ class ModelStatefulFn(beam.DoFn): previous_pred_state.clear() previous_pred_state.add(new_prediction) yield (user, new_prediction) -``` +{{< /highlight >}} Let's walk through it, @@ -627,10 +629,10 @@ If you are new to Beam, I hope you are now interested in seeing if Beam with stateful processing addresses your use case. If you are already using Beam, I hope this new addition to the model unlocks new use cases for you. Do check the [capability -matrix]({{ site.baseurl }}/documentation/runners/capability-matrix/) to +matrix](/documentation/runners/capability-matrix/) to see the level of support for this new model feature on your favorite backend(s). And please do join the community at -[user@beam.apache.org]({{ site.baseurl }}/get-started/support). We'd love to +[user@beam.apache.org](/get-started/support). We'd love to hear from you. 
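For completeness, here is a minimal, self-contained sketch of wiring an index-assigning stateful `DoFn` like the one above into a pipeline with the Python SDK. The sample keyed input is a placeholder, and the explicit `VarIntCoder`/`SumCombineFn` spelling of the state cell is one possible way to declare it, not the only one.

```py
import apache_beam as beam
from apache_beam.coders import VarIntCoder
from apache_beam.transforms import combiners
from apache_beam.transforms.userstate import CombiningValueStateSpec

class IndexAssigningStatefulDoFn(beam.DoFn):
    # One integer state cell per key and window.
    INDEX_STATE = CombiningValueStateSpec('index', VarIntCoder(),
                                          combiners.SumCombineFn())

    def process(self, element, index=beam.DoFn.StateParam(INDEX_STATE)):
        unused_key, value = element
        current_index = index.read()
        yield (value, current_index)
        index.add(1)

# Stateful DoFns operate on keyed input; the sample data is a placeholder.
with beam.Pipeline() as p:
    (p
     | beam.Create([('user1', 'a'), ('user1', 'b'), ('user2', 'c')])
     | beam.ParDo(IndexAssigningStatefulDoFn())
     | beam.Map(print))
```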
diff --git a/website/src/_posts/2016-10-12-strata-hadoop-world-and-beam.md b/website/www/site/content/en/blog/strata-hadoop-world-and-beam.md similarity index 87% rename from website/src/_posts/2016-10-12-strata-hadoop-world-and-beam.md rename to website/www/site/content/en/blog/strata-hadoop-world-and-beam.md index 2bc1560762bb..80ee8696c1aa 100644 --- a/website/src/_posts/2016-10-12-strata-hadoop-world-and-beam.md +++ b/website/www/site/content/en/blog/strata-hadoop-world-and-beam.md @@ -1,11 +1,13 @@ --- -layout: post title: "Strata+Hadoop World and Beam" date: 2016-10-11 09:00:00 -0800 -excerpt_separator: -categories: beam update +categories: + - beam + - update +aliases: + - /beam/update/2016/10/11/strata-hadoop-world-and-beam.html authors: -- jesseanderson + - jesseanderson --- -Exercise time +Exercise time If you want to take a look at the tutorial materials, we’ve put them up [on GitHub](https://github.com/eljefe6a/beamexample). This includes the [actual slides](https://github.com/eljefe6a/beamexample/blob/master/BeamTutorial/slides.pdf) as well as the [exercises](https://github.com/eljefe6a/beamexample/tree/master/BeamTutorial/src/main/java/org/apache/beam/examples/tutorial/game) that we covered. If you’re looking to learn a little about Beam, this is a good way to start. The exercises are based on an imaginary mobile game where data needs processing and are based on code in the [Beam examples directory](https://github.com/apache/beam/tree/master/examples/java/src/main/java/org/apache/beam/examples/complete/game). The code has TODOs for where you need to fill in code or there are full sample solutions to look over our code. You can run these examples on your own machine or on a cluster using a runner that Beam supports. @@ -37,9 +39,9 @@ We heard [loud and clear](https://twitter.com/jessetanderson/status/781124173108 On management and thought leader side, Beam went from “what’s Beam?” at previous conferences to “I’m interested in Beam.” or “I’ve formed an informed opinion on Beam.” at this conference. This is one of the metrics I look for in early technology adoption. -So much brainpower answering questions +So much brainpower answering questions We rounded out the tutorial with live demonstrations of Beam running on Apache Spark, Apache Flink, the local runner, and DataFlow runner. Then, we brought in the big brainpower and had a Q and A session. -If you’re attending a conference, we encourage you to look for a Beam session. If you want to use these materials to give your own Beam talk or tutorial, we’re happy to help you. In addition to this tutorial, we have [other presentation materials]({{ site.baseurl }}/contribute/presentation-materials/). You can reach out to us on the [user mailing list]({{ site.baseurl }}/get-started/support/). +If you’re attending a conference, we encourage you to look for a Beam session. If you want to use these materials to give your own Beam talk or tutorial, we’re happy to help you. In addition to this tutorial, we have [other presentation materials](/contribute/presentation-materials/). You can reach out to us on the [user mailing list](/get-started/support/). 
diff --git a/website/src/_posts/2016-10-20-test-stream.md b/website/www/site/content/en/blog/test-stream.md similarity index 89% rename from website/src/_posts/2016-10-20-test-stream.md rename to website/www/site/content/en/blog/test-stream.md index be940e98ab19..c21fb2fdc7d8 100644 --- a/website/src/_posts/2016-10-20-test-stream.md +++ b/website/www/site/content/en/blog/test-stream.md @@ -1,11 +1,12 @@ --- -layout: post title: "Testing Unbounded Pipelines in Apache Beam" date: 2016-10-20 10:00:00 -0800 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2016/10/20/test-stream.html authors: -- tgroh + - tgroh --- -categories: blog +categories: + - blog +aliases: + - /blog/2017/08/28/timely-processing.html authors: - klk --- @@ -22,7 +23,7 @@ limitations under the License. --> In a [prior blog -post]({{ site.baseurl }}/blog/2017/02/13/stateful-processing.html), I +post](/blog/2017/02/13/stateful-processing.html), I introduced the basics of stateful processing in Apache Beam, focusing on the addition of state to per-element processing. So-called _timely_ processing complements stateful processing in Beam by letting you set timers to request a @@ -76,7 +77,7 @@ distributed across computers in any way, yielding essentially limitless parallelism. ParDo offers limitless parallelism @@ -104,7 +105,7 @@ green squares, etc. In a real application, you may have millions of keys, so the parallelism is still massive. Gathering elements per key then combining them @@ -120,8 +121,7 @@ and timers. However, _your_ code is just a declarative expression of the aggregation operator. The runner can choose a variety of ways to execute your operator. -I went over this in detail in [my prior post focused on state alone]({{ -site.baseurl }}/blog/2017/02/13/stateful-processing.html). Since you do not +I went over this in detail in [my prior post focused on state alone](/blog/2017/02/13/stateful-processing.html). Since you do not observe elements in any defined order, nor manipulate mutable state or timers directly, I call this neither stateful nor timely processing. @@ -158,7 +158,7 @@ access to state (the color-partitioned cylinder on the right) and can set timers to receive callbacks (the colorful clocks on the left). Gathering elements per key then timely, stateful processing @@ -184,7 +184,7 @@ Let's set up the state we need to track batches of elements. As each element comes in, we will write the element to a buffer while tracking the number of elements we have buffered. Here are the state cells in code: -```java +{{< highlight java >}} new DoFn() { @StateId("buffer") @@ -195,9 +195,9 @@ new DoFn() { … TBD … } -``` +{{< /highlight >}} -```py +{{< highlight py >}} class StatefulBufferingFn(beam.DoFn): BUFFER_STATE = BagStateSpec('buffer', EventCoder()) @@ -205,7 +205,7 @@ class StatefulBufferingFn(beam.DoFn): COUNT_STATE = CombiningValueStateSpec('count', VarIntCoder(), combiners.SumCombineFn()) -``` +{{< /highlight >}} Walking through the code, we have: @@ -217,7 +217,7 @@ method. We will choose a limit on the size of the buffer, `MAX_BUFFER_SIZE`. If our buffer reaches this size, we will perform a single RPC to enrich all the events, and output. 
-```java +{{< highlight java >}} new DoFn() { private static final int MAX_BUFFER_SIZE = 500; @@ -250,9 +250,9 @@ new DoFn() { … TBD … } -``` +{{< /highlight >}} -```py +{{< highlight py >}} class StatefulBufferingFn(beam.DoFn): MAX_BUFFER_SIZE = 500; @@ -277,12 +277,12 @@ class StatefulBufferingFn(beam.DoFn): yield event count_state.clear() buffer_state.clear() -``` +{{< /highlight >}} Here is an illustration to accompany the code: Batching elements in state, then performing RPCs @@ -319,7 +319,7 @@ completeness for a `PCollection` - such as when a window expires. For our example, let us add an event time timer so that when the window expires, any events remaining in the buffer are processed. -```java +{{< highlight java >}} new DoFn() { … @@ -351,9 +351,9 @@ new DoFn() { } } } -``` +{{< /highlight >}} -```py +{{< highlight py >}} class StatefulBufferingFn(beam.DoFn): … @@ -380,7 +380,7 @@ class StatefulBufferingFn(beam.DoFn): buffer_state.clear() count_state.clear() -``` +{{< /highlight >}} Let's unpack the pieces of this snippet: @@ -404,7 +404,7 @@ Let's unpack the pieces of this snippet: Illustrating this logic, we have the diagram below: Batched RPCs with window expiration @@ -435,7 +435,7 @@ timer has not been set, then we set it for the current moment plus `MAX_BUFFER_DURATION`. After the allotted processing time has passed, a callback will fire and enrich and emit any buffered elements. -```java +{{< highlight java >}} new DoFn() { … @@ -477,9 +477,9 @@ new DoFn() { … same expiry as above … } -``` +{{< /highlight >}} -```py +{{< highlight py >}} class StatefulBufferingFn(beam.DoFn): … @@ -512,12 +512,12 @@ class StatefulBufferingFn(beam.DoFn): buffer_state.clear() count_state.clear() -``` +{{< /highlight >}} Here is an illustration of the final code: Batching elements in state, then performing RPCs @@ -561,7 +561,7 @@ minutes sliding by 10 minutes, the stateful, timely transform should transparently work correctly. Two windowing strategies for the same stateful and timely transform @@ -589,7 +589,7 @@ processing logic be applicable to archived events just as easily as incoming near-real-time data. Unified stateful processing over streams and file archives @@ -603,10 +603,9 @@ falls within documented allowances. I'll end this post in the same way I ended the last. I hope you will go try out Beam with stateful, timely processing. If it opens up new possibilities for you, then great! If not, we want to hear about it. Since this is a new feature, -please check the [capability matrix]({{ site.baseurl -}}/documentation/runners/capability-matrix/) to see the level of support for +please check the [capability matrix](/documentation/runners/capability-matrix/) to see the level of support for your preferred Beam backend(s). And please do join the Beam community at -[user@beam.apache.org]({{ site.baseurl }}/get-started/support) and follow +[user@beam.apache.org](/get-started/support) and follow [@ApacheBeam](https://twitter.com/ApacheBeam) on Twitter. 
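As a condensed, self-contained variant of the buffering pattern above, the following Python sketch keeps only the bag state and a single event-time timer that flushes at window expiry; the size threshold and processing-time timer from the full example are omitted, the input data and coder are placeholders, and runner support for timers should be checked against the capability matrix.

```py
import apache_beam as beam
from apache_beam.coders import StrUtf8Coder
from apache_beam.transforms import window
from apache_beam.transforms.timeutil import TimeDomain
from apache_beam.transforms.userstate import BagStateSpec, TimerSpec, on_timer

class SimpleBufferingFn(beam.DoFn):
    BUFFER = BagStateSpec('buffer', StrUtf8Coder())
    FLUSH = TimerSpec('flush', TimeDomain.WATERMARK)

    def process(self, element,
                w=beam.DoFn.WindowParam,
                buffer=beam.DoFn.StateParam(BUFFER),
                flush=beam.DoFn.TimerParam(FLUSH)):
        unused_key, value = element
        buffer.add(value)
        # Flush when the window expires; a fuller version would also flush
        # on a size threshold and on a processing-time timer.
        flush.set(w.end)

    @on_timer(FLUSH)
    def flush_buffer(self, buffer=beam.DoFn.StateParam(BUFFER)):
        for value in buffer.read():
            yield value  # a real pipeline would enrich these via one RPC
        buffer.clear()

with beam.Pipeline() as p:
    (p
     | beam.Create([('user1', 'click'), ('user1', 'view'), ('user2', 'click')])
     | beam.Map(lambda kv: window.TimestampedValue(kv, 0))  # placeholder timestamps
     | beam.WindowInto(window.FixedWindows(60))
     | beam.ParDo(SimpleBufferingFn())
     | beam.Map(print))
```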
diff --git a/website/src/_posts/2016-05-20-where-is-my-pcollection-dot-map.md b/website/www/site/content/en/blog/where-is-my-pcollection-dot-map.md similarity index 97% rename from website/src/_posts/2016-05-20-where-is-my-pcollection-dot-map.md rename to website/www/site/content/en/blog/where-is-my-pcollection-dot-map.md index 7de8c390b7de..b7f608649863 100644 --- a/website/src/_posts/2016-05-20-where-is-my-pcollection-dot-map.md +++ b/website/www/site/content/en/blog/where-is-my-pcollection-dot-map.md @@ -1,9 +1,10 @@ --- -layout: post title: "Where's my PCollection.map()?" date: 2016-05-27 09:00:00 -0700 -excerpt_separator: -categories: blog +categories: + - blog +aliases: + - /blog/2016/05/27/where-is-my-pcollection-dot-map.html authors: - robertwb --- @@ -95,7 +96,7 @@ Though PTransforms are generally constructed at the site at which they're used, As pipelines grow and evolve, it is useful to structure your pipeline into modular, often reusable components, and PTransforms allow one to do this nicely in a data-processing pipeline. In addition, modular PTransforms also expose the logical structure of your code to the system (e.g. for monitoring). Of the three different representations of the WordCount pipeline below, only the structured view captures the high-level intent of the pipeline. Letting even the simple operations be PTransforms means there's less of an abrupt edge to packaging things up into composite operations. -Three different visualizations of a simple WordCount pipeline +Three different visualizations of a simple WordCount pipeline
    Three different visualizations of a simple WordCount pipeline which computes the number of occurrences of every word in a set of text files. The flat view gives the full DAG of all operations performed. The execution view groups operations according to how they're executed, e.g. after performing runner-specific optimizations like function composition. The structured view nests operations according to their grouping in PTransforms. diff --git a/website/src/community/contact-us.md b/website/www/site/content/en/community/contact-us.md similarity index 95% rename from website/src/community/contact-us.md rename to website/www/site/content/en/community/contact-us.md index 77056c51900b..cca25cda1224 100644 --- a/website/src/community/contact-us.md +++ b/website/www/site/content/en/community/contact-us.md @@ -1,9 +1,7 @@ --- -layout: section title: "Contact Us" -permalink: /community/contact-us/ -section_menu: section-menu/community.html -redirect_from: +aliases: + - /community/ - /use/issue-tracking/ - /use/mailing-lists/ - /get-started/support/ @@ -27,6 +25,9 @@ limitations under the License. There are many ways to reach the Beam user and developer communities - use whichever one seems best. + +
    + | How to contact us | When to use it | | ----------------- | ---------------| | [user@](https://lists.apache.org/list.html?user@beam.apache.org) mailing list | User support and questions ([Subscribe](mailto:user-subscribe@beam.apache.org)[^1], [Unsubscribe](mailto:user-unsubscribe@beam.apache.org)[^1], [Archives](https://lists.apache.org/list.html?user@beam.apache.org)) | @@ -36,7 +37,8 @@ whichever one seems best. | [JIRA bug tracker](https://issues.apache.org/jira/browse/BEAM) | Report bugs / discover known issues | | [StackOverflow](https://stackoverflow.com/questions/tagged/apache-beam) | Ask and answer user support questions | | [Slack](https://s.apache.org/beam-slack-channel) | Chat with users and developers in the ASF Slack. Note: Please [join the #beam channel](https://s.apache.org/beam-slack-channel) after you [created an account](https://s.apache.org/slack-invite). Please do not ask Beam questions in #general. | -{:.table} + +
    If you have questions about how to use Apache Beam, we recommend you try out the [user@](https://lists.apache.org/list.html?user@beam.apache.org) mailing list, and [StackOverflow](https://stackoverflow.com/questions/tagged/apache-beam). diff --git a/website/src/community/in-person.md b/website/www/site/content/en/community/in-person.md similarity index 95% rename from website/src/community/in-person.md rename to website/www/site/content/en/community/in-person.md index bd8e6fa6e2f1..729428033293 100644 --- a/website/src/community/in-person.md +++ b/website/www/site/content/en/community/in-person.md @@ -1,9 +1,6 @@ --- -layout: section title: "Contact Us" -permalink: /community/in-person/ -section_menu: section-menu/community.html -redirect_from: +aliases: - /use/issue-tracking/ - /use/mailing-lists/ - /get-started/support/ @@ -30,6 +27,7 @@ While the official communication happens on the mailing list, and you can find u We occasionally meet up in various locations around the globe. Active or to-be-started meetups include: +{{< table >}} | Meetup City | Name | | ----------------- | ---------------| | Stockholm | [Apache Beam Stockholm](https://www.meetup.com/Apache-Beam-Stockholm/) | @@ -42,7 +40,7 @@ We occasionally meet up in various locations around the globe. Active or to-be-s | Austin, TX | [Austin Apache Beam Meetup](https://www.meetup.com/Austin-Beam-Meetup/) | | Bangalore | [Bangalore Apache Beam Meetup](https://www.meetup.com/Bangalore-Apache-Beam/) | | Paris | [Paris Apache Beam](https://www.meetup.com/Paris-Apache-Beam-Meetup/) |: -{:.table} +{{< /table >}} The above are the meetups that are already known to the community (please add if you are organizing one!). For Meetups that are tagged with 'Apache Beam', see the [list](https://www.meetup.com/topics/apache-beam/). diff --git a/website/src/community/integrations.md b/website/www/site/content/en/community/integrations.md similarity index 90% rename from website/src/community/integrations.md rename to website/www/site/content/en/community/integrations.md index a4f501b3bbc0..17331a3be53d 100644 --- a/website/src/community/integrations.md +++ b/website/www/site/content/en/community/integrations.md @@ -1,9 +1,5 @@ --- -layout: section title: "Integrations" -permalink: /community/integrations/ -section_menu: section-menu/community.html -redirect_from: --- + +# Apache Beam Logos + +This page contains project material for the Apache Beam project. + +## Project logos + +You can download [this archive](/downloads/logos.zip) +containing all of the logos or download the logos individually. + +### Scalable Vector Graphics (SVG) +These [SVG files](https://en.wikipedia.org/wiki/Scalable_Vector_Graphics) can +be resized easily and are suitable for print or web use. Click on the logo to +download it. + +{{< colors/svg >}} + +### Portable Network Graphics (PNG) +These [PNG files](https://en.wikipedia.org/wiki/Portable_Network_Graphics) are +available in a number of fixed sizes and are optimized for web use. + +{{< colors/png >}} + +## Colors and fonts +The Apache Beam project uses predefined colors and fonts. [This document](/downloads/palette.pdf) has more information. 
diff --git a/website/src/community/policies.md b/website/www/site/content/en/community/policies.md similarity index 94% rename from website/src/community/policies.md rename to website/www/site/content/en/community/policies.md index 525749d0bd4b..83688ca23212 100644 --- a/website/src/community/policies.md +++ b/website/www/site/content/en/community/policies.md @@ -1,8 +1,5 @@ --- -layout: section title: "Policies" -permalink: /community/policies/ -section_menu: section-menu/community.html --- + + diff --git a/website/src/contribute/feature-branches.md b/website/www/site/content/en/contribute/feature-branches.md similarity index 93% rename from website/src/contribute/feature-branches.md rename to website/www/site/content/en/contribute/feature-branches.md index d4735834459f..24853e162160 100644 --- a/website/src/contribute/feature-branches.md +++ b/website/www/site/content/en/contribute/feature-branches.md @@ -1,8 +1,5 @@ --- -layout: section title: "Beam Feature Branches" -permalink: /contribute/feature-branches/ -section_menu: section-menu/contribute.html --- + See the [download page](/get-started/downloads/{$DOWNLOAD_ANCHOR}) for this release. For more information on changes in {$RELEASE_VERSION}, check out the [detailed release notes]({$JIRA_RELEASE_NOTES}). @@ -891,7 +886,7 @@ Template: 1. Maven artifacts deployed to the staging repository of [repository.apache.org](https://repository.apache.org/content/repositories/) 1. Source distribution deployed to the dev repository of [dist.apache.org](https://dist.apache.org/repos/dist/dev/beam/) -1. Website pull request proposed to list the [release]({{ site.baseurl }}/get-started/downloads/), publish the [Java API reference manual](https://beam.apache.org/releases/javadoc/), and publish the [Python API reference manual](https://beam.apache.org/releases/pydoc/). +1. Website pull request proposed to list the [release](/get-started/downloads/), publish the [Java API reference manual](https://beam.apache.org/releases/javadoc/), and publish the [Python API reference manual](https://beam.apache.org/releases/pydoc/). 1. Docker images are published to [DockerHub](https://hub.docker.com/search?q=apache%2Fbeam&type=image) with tags: {RELEASE}_rc{RC_NUM}. You can (optionally) also do additional verification by: @@ -1208,7 +1203,7 @@ _Note_: -Prepourl and -Pver can be found in the RC vote email sent by Release Ma * Run GameStats with Dataflow Runner ``` python -m apache_beam.examples.complete.game.game_stats \ - --project=${YOUR_PROJECT} \ + --project=${YOUR_PROJECT} \ --region=${GCE_REGION} \ --topic projects/${YOUR_PROJECT}/topics/${YOUR_PUBSUB_TOPIC} \ --dataset ${USER}_test \ @@ -1293,7 +1288,7 @@ Create and push a new signed tag for the released version by copying the tag for ### Merge website pull request -Merge the website pull request to [list the release]({{ site.baseurl }}/get-started/downloads/), publish the [Python API reference manual](https://beam.apache.org/releases/pydoc/), the [Java API reference manual](https://beam.apache.org/releases/javadoc/) and Blogpost created earlier. +Merge the website pull request to [list the release](/get-started/downloads/), publish the [Python API reference manual](https://beam.apache.org/releases/pydoc/), the [Java API reference manual](https://beam.apache.org/releases/javadoc/) and Blogpost created earlier. 
### Mark the version as released in JIRA @@ -1312,7 +1307,7 @@ __NOTE__: Only PMC members have permissions to do it, ping [dev@](mailto:dev@bea * Maven artifacts released and indexed in the [Maven Central Repository](https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.beam%22) * Source distribution available in the release repository of [dist.apache.org](https://dist.apache.org/repos/dist/release/beam/) * Source distribution removed from the dev repository of [dist.apache.org](https://dist.apache.org/repos/dist/dev/beam/) -* Website pull request to [list the release]({{ site.baseurl }}/get-started/downloads/) and publish the [API reference manual](https://beam.apache.org/releases/javadoc/) merged +* Website pull request to [list the release](/get-started/downloads/) and publish the [API reference manual](https://beam.apache.org/releases/javadoc/) merged * Release tagged in the source code repository * Release version finalized in JIRA. (Note: Not all committers have administrator access to JIRA. If you end up getting permissions errors ask on the mailing list for assistance.) * Release version is listed at reporter.apache.org diff --git a/website/src/contribute/runner-guide.md b/website/www/site/content/en/contribute/runner-guide.md similarity index 97% rename from website/src/contribute/runner-guide.md rename to website/www/site/content/en/contribute/runner-guide.md index c0f6d573b870..a50277d1b554 100644 --- a/website/src/contribute/runner-guide.md +++ b/website/www/site/content/en/contribute/runner-guide.md @@ -1,8 +1,5 @@ --- -layout: section title: "Runner Authoring Guide" -section_menu: section-menu/contribute.html -permalink: /contribute/runner-guide/ --- + + diff --git a/website/src/contribute/testing.md b/website/www/site/content/en/contribute/testing.md similarity index 86% rename from website/src/contribute/testing.md rename to website/www/site/content/en/contribute/testing.md index 37d21e942682..ccd5d73fc42f 100644 --- a/website/src/contribute/testing.md +++ b/website/www/site/content/en/contribute/testing.md @@ -15,4 +15,8 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ---> \ No newline at end of file +--> + + diff --git a/website/www/site/content/en/documentation/_index.md b/website/www/site/content/en/documentation/_index.md new file mode 100644 index 000000000000..66b0a014a47c --- /dev/null +++ b/website/www/site/content/en/documentation/_index.md @@ -0,0 +1,67 @@ +--- +title: "Learn about Beam" +aliases: + - /learn/ + - /docs/learn/ +--- + + +# Apache Beam Documentation + +This section provides in-depth conceptual information and reference material for the Beam Model, SDKs, and Runners: + +## Concepts + +Learn about the Beam Programming Model and the concepts common to all Beam SDKs and Runners. + +* Read the [Programming Guide](/documentation/programming-guide/), which introduces all the key Beam concepts. +* Learn about Beam's [execution model](/documentation/runtime/model) to better understand how pipelines execute. +* Visit [Learning Resources](/documentation/resources/learning-resources) for some of our favorite articles and talks about Beam. 
+ +## Pipeline Fundamentals + +* [Design Your Pipeline](/documentation/pipelines/design-your-pipeline/) by planning your pipeline’s structure, choosing transforms to apply to your data, and determining your input and output methods. +* [Create Your Pipeline](/documentation/pipelines/create-your-pipeline/) using the classes in the Beam SDKs. +* [Test Your Pipeline](/documentation/pipelines/test-your-pipeline/) to minimize debugging a pipeline’s remote execution. + +## SDKs + +Find status and reference information on all of the available Beam SDKs. + +* [Java SDK](/documentation/sdks/java/) +* [Python SDK](/documentation/sdks/python/) +* [Go SDK](/documentation/sdks/go/) + +## Runners + +A Beam Runner runs a Beam pipeline on a specific (often distributed) data processing system. + +### Available Runners + +* [DirectRunner](/documentation/runners/direct/): Runs locally on your machine -- great for developing, testing, and debugging. +* [ApexRunner](/documentation/runners/apex/): Runs on [Apache Apex](https://apex.apache.org). +* [FlinkRunner](/documentation/runners/flink/): Runs on [Apache Flink](https://flink.apache.org). +* [SparkRunner](/documentation/runners/spark/): Runs on [Apache Spark](https://spark.apache.org). +* [DataflowRunner](/documentation/runners/dataflow/): Runs on [Google Cloud Dataflow](https://cloud.google.com/dataflow), a fully managed service within [Google Cloud Platform](https://cloud.google.com/). +* [GearpumpRunner](/documentation/runners/gearpump/): Runs on [Apache Gearpump (incubating)](https://gearpump.apache.org). +* [SamzaRunner](/documentation/runners/samza/): Runs on [Apache Samza](https://samza.apache.org). +* [NemoRunner](/documentation/runners/nemo/): Runs on [Apache Nemo](https://nemo.apache.org). +* [JetRunner](/documentation/runners/jet/): Runs on [Hazelcast Jet](https://jet.hazelcast.org/). + +### Choosing a Runner + +Beam is designed to enable pipelines to be portable across different runners. However, given every runner has different capabilities, they also have different abilities to implement the core concepts in the Beam model. The [Capability Matrix](/documentation/runners/capability-matrix/) provides a detailed comparison of runner functionality. + +Once you have chosen which runner to use, see that runner's page for more information about any initial runner-specific setup as well as any required or optional `PipelineOptions` for configuring its execution. You may also want to refer back to the Quickstart for [Java](/get-started/quickstart-java), [Python](/get-started/quickstart-py) or [Go](/get-started/quickstart-go) for instructions on executing the sample WordCount pipeline. 
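To make the runner choice concrete, here is a minimal sketch in the Python SDK: the runner is selected through `PipelineOptions`, so retargeting the same pipeline to another backend is mostly a matter of changing `--runner` plus supplying any runner-specific options. The tiny word count is only a placeholder.

```py
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# DirectRunner runs locally; another backend is selected by changing
# '--runner' and adding that runner's required options.
options = PipelineOptions(['--runner=DirectRunner'])

with beam.Pipeline(options=options) as p:
    (p
     | 'Create' >> beam.Create(['to be or not to be'])
     | 'Split' >> beam.FlatMap(lambda line: line.split())
     | 'PairWithOne' >> beam.Map(lambda word: (word, 1))
     | 'Count' >> beam.CombinePerKey(sum)
     | 'Print' >> beam.Map(print))
```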
diff --git a/website/src/documentation/dsls/sql/calcite/aggregate-functions.md b/website/www/site/content/en/documentation/dsls/sql/calcite/aggregate-functions.md similarity index 85% rename from website/src/documentation/dsls/sql/calcite/aggregate-functions.md rename to website/www/site/content/en/documentation/dsls/sql/calcite/aggregate-functions.md index 1416ad956fac..b6a9b9a97ebb 100644 --- a/website/src/documentation/dsls/sql/calcite/aggregate-functions.md +++ b/website/www/site/content/en/documentation/dsls/sql/calcite/aggregate-functions.md @@ -1,9 +1,7 @@ --- -layout: section +type: languages title: "Beam Calcite SQL aggregate functions" -section_menu: section-menu/sdks.html -permalink: /documentation/dsls/sql/calcite/aggregate-functions/ -redirect_from: /documentation/dsls/sql/aggregate-functions/ +aliases: /documentation/dsls/sql/aggregate-functions/ --- -[Built-in I/O Transforms]({{site.baseurl}}/documentation/io/built-in/) +[Built-in I/O Transforms](/documentation/io/built-in/) # Google BigQuery I/O connector - +{{< language-switcher java py >}} The Beam SDKs include built-in transforms that can read data from and write data to [Google BigQuery](https://cloud.google.com/bigquery) tables. @@ -37,39 +28,45 @@ to [Google BigQuery](https://cloud.google.com/bigquery) tables. -{:.language-java} +{{< paragraph class="language-java" >}} To use BigQueryIO, add the Maven artifact dependency to your `pom.xml` file. +{{< /paragraph >}} -```java +{{< highlight java >}} org.apache.beam beam-sdks-java-io-google-cloud-platform - {{ site.release_latest }} + {{< param release_latest >}} -``` +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} Additional resources: +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" wrap="span" >}} * [BigQueryIO source code](https://github.com/apache/beam/tree/master/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery) -* [BigQueryIO Javadoc](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.html) +* [BigQueryIO Javadoc](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.html) * [Google BigQuery documentation](https://cloud.google.com/bigquery/docs) +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} To use BigQueryIO, you must install the Google Cloud Platform dependencies by running `pip install apache-beam[gcp]`. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} Additional resources: +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" wrap="span" >}} * [BigQueryIO source code](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/io/gcp/bigquery.py) -* [BigQueryIO Pydoc](https://beam.apache.org/releases/pydoc/{{ site.release_latest }}/apache_beam.io.gcp.bigquery.html) +* [BigQueryIO Pydoc](https://beam.apache.org/releases/pydoc/{{< param release_latest >}}/apache_beam.io.gcp.bigquery.html) * [Google BigQuery documentation](https://cloud.google.com/bigquery/docs) +{{< /paragraph >}} ## BigQuery basics @@ -105,50 +102,53 @@ To specify a table with a string, use the format `[project_id]:[dataset_id].[table_id]` to specify the fully-qualified BigQuery table name. 
-```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:BigQueryTableSpec -%}``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_bigqueryio_table_spec -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" BigQueryTableSpec >}} +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_bigqueryio_table_spec >}} +{{< /highlight >}} You can also omit `project_id` and use the `[dataset_id].[table_id]` format. If you omit the project ID, Beam uses the default project ID from your - [pipeline options](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/org/apache/beam/sdk/extensions/gcp/options/GcpOptions.html). + [pipeline options](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/org/apache/beam/sdk/extensions/gcp/options/GcpOptions.html). - [pipeline options](https://beam.apache.org/releases/pydoc/{{ site.release_latest }}/apache_beam.options.pipeline_options.html#apache_beam.options.pipeline_options.GoogleCloudOptions). + [pipeline options](https://beam.apache.org/releases/pydoc/{{< param release_latest >}}/apache_beam.options.pipeline_options.html#apache_beam.options.pipeline_options.GoogleCloudOptions). -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:BigQueryTableSpecWithoutProject -%}``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_bigqueryio_table_spec_without_project -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" BigQueryTableSpecWithoutProject >}} +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_bigqueryio_table_spec_without_project >}} +{{< /highlight >}} #### Using a TableReference To specify a table with a `TableReference`, create a new `TableReference` using the three parts of the BigQuery table name. 
-```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:BigQueryTableSpecObject -%}``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_bigqueryio_table_spec_object -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" BigQueryTableSpecObject >}} +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_bigqueryio_table_spec_object >}} +{{< /highlight >}} -{:.language-java} -The Beam SDK for Java also provides the [`parseTableSpec`](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/org/apache/beam/sdk/io/gcp/bigquery/BigQueryHelpers.html) +{{< paragraph class="language-java" >}} +The Beam SDK for Java also provides the [`parseTableSpec`](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/org/apache/beam/sdk/io/gcp/bigquery/BigQueryHelpers.html) helper method, which constructs a `TableReference` object from a String that contains the fully-qualified BigQuery table name. However, the static factory methods for BigQueryIO transforms accept the table name as a String and construct a `TableReference` object for you. - +{{< /paragraph >}} ### Table rows @@ -172,16 +172,17 @@ BigQueryIO allows you to use all of these data types. The following example shows the correct format for data types used when reading from and writing to BigQuery: -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:BigQueryDataTypes -%}``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_bigqueryio_data_types -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" BigQueryDataTypes >}} +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_bigqueryio_data_types >}} +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} As of Beam 2.7.0, the NUMERIC data type is supported. This data type supports high-precision decimal numbers (precision of 38 digits, scale of 9 digits). The GEOGRAPHY data type works with Well-Known Text (See [https://en.wikipedia.org/wiki/Well-known_text](https://en.wikipedia.org/wiki/Well-known_text) @@ -189,10 +190,11 @@ format for reading and writing to BigQuery. BigQuery IO requires values of BYTES datatype to be encoded using base64 encoding when writing to BigQuery. When bytes are read from BigQuery they are returned as base64-encoded strings. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} As of Beam 2.7.0, the NUMERIC data type is supported. This data type supports high-precision decimal numbers (precision of 38 digits, scale of 9 digits). The GEOGRAPHY data type works with Well-Known Text (See [https://en.wikipedia.org/wiki/Well-known_text](https://en.wikipedia.org/wiki/Well-known_text) @@ -200,6 +202,7 @@ format for reading and writing to BigQuery. BigQuery IO requires values of BYTES datatype to be encoded using base64 encoding when writing to BigQuery. When bytes are read from BigQuery they are returned as base64-encoded bytes. 
+{{< /paragraph >}} ## Reading from BigQuery @@ -218,11 +221,12 @@ list of limitations. -{:.language-java} +{{< paragraph class="language-java" >}} The Beam SDK for Java has two BigQueryIO read methods. Both of these methods allow you to read from a table, or read fields using a query string. +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" wrap="span" >}} 1. `read(SerializableFunction)` reads Avro-formatted records and uses a specified parsing function to parse them into a `PCollection` of custom typed objects. Each element in the `PCollection` represents a single row in the @@ -236,73 +240,83 @@ allow you to read from a table, or read fields using a query string. 2-3 times slower in performance compared to `read(SerializableFunction)`. The [example code](#reading-from-a-table) for reading from a table shows how to use `readTableRows`. +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} ***Note:*** `BigQueryIO.read()` is deprecated as of Beam SDK 2.2.0. Instead, use `read(SerializableFunction)` to parse BigQuery rows from Avro `GenericRecord` into your custom type, or use `readTableRows()` to parse them into JSON `TableRow` objects. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} To read from a BigQuery table using the Beam SDK for Python, apply a `Read` transform on a `BigQuerySource`. Read returns a `PCollection` of dictionaries, where each element in the `PCollection` represents a single row in the table. Integer values in the `TableRow` objects are encoded as strings to match BigQuery's exported JSON format. +{{< /paragraph >}} ### Reading from a table -{:.language-java} +{{< paragraph class="language-java" >}} To read an entire BigQuery table, use the `from` method with a BigQuery table name. This example uses `readTableRows`. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} To read an entire BigQuery table, use the `table` parameter with the BigQuery table name. +{{< /paragraph >}} The following code reads an entire table that contains weather station data and then extracts the `max_temperature` column. -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:BigQueryReadTable -%}``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_bigqueryio_read_table -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" BigQueryReadTable >}} +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_bigqueryio_read_table >}} +{{< /highlight >}} ### Reading with a query string -{:.language-java} +{{< paragraph class="language-java" >}} If you don't want to read an entire table, you can supply a query string with the `fromQuery` method. This example uses `read(SerializableFunction)`. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} If you don't want to read an entire table, you can supply a query string to `BigQuerySource` by specifying the `query` parameter. +{{< /paragraph >}} The following code uses a SQL query to only read the `max_temperature` column. 
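The referenced snippet is pulled in by the `github_sample` shortcode at build time, so its code is not shown in this patch. A minimal sketch of such a query-based read using `read(SerializableFunction)` might look like the following; the query text, source table, and coder choice are illustrative assumptions rather than the canonical sample:

{{< highlight java >}}
// Illustrative sketch only -- the canonical BigQueryReadQuery sample is referenced below.
PCollection<Double> maxTemperatures =
    p.apply(
        BigQueryIO.read(
                // Parse each Avro record into the single column we care about.
                (SchemaAndRecord elem) -> (Double) elem.getRecord().get("max_temperature"))
            .fromQuery(
                "SELECT max_temperature FROM [clouddataflow-readonly:samples.weather_stations]")
            .withCoder(DoubleCoder.of()));
{{< /highlight >}}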
-```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:BigQueryReadQuery -%}``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_bigqueryio_read_query -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" BigQueryReadQuery >}} +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_bigqueryio_read_query >}} +{{< /highlight >}} You can also use BigQuery's standard SQL dialect with a query string, as shown in the following example: -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:BigQueryReadQueryStdSQL -%}``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_bigqueryio_read_query_std_sql -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" BigQueryReadQueryStdSQL >}} +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_bigqueryio_read_query_std_sql >}} +{{< /highlight >}} ### Using the BigQuery Storage API {#storage-api} @@ -337,26 +351,28 @@ the BigQuery Storage API and column projection to read public samples of weather data from a BigQuery table. You can view the [full source code on GitHub](https://github.com/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/cookbook/BigQueryTornadoes.java). -```java +{{< highlight java >}} rowsFromBigQuery = p.apply( BigQueryIO.readTableRows() .from(options.getInput()) .withMethod(Method.DIRECT_READ) .withSelectedFields(Lists.newArrayList("month", "tornado")); -``` -```py +{{< /highlight >}} + +{{< highlight py >}} # The SDK for Python does not support the BigQuery Storage API. -``` +{{< /highlight >}} The following code snippet reads with a query string. -```java +{{< highlight java >}} // Snippet not yet available (BEAM-7034). -``` -```py +{{< /highlight >}} + +{{< highlight py >}} # The SDK for Python does not support the BigQuery Storage API. -``` +{{< /highlight >}} ## Writing to BigQuery @@ -389,11 +405,12 @@ should create a table if the destination table does not exist. -{:.language-java} +{{< paragraph class="language-java" >}} Use `.withCreateDisposition` to specify the create disposition. Valid enum values are: +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" wrap="span" >}} * `Write.CreateDisposition.CREATE_IF_NEEDED`: Specifies that the write operation should create a new table if one does not exist. If you use this value, you must provide a table schema with the `withSchema` method. @@ -402,21 +419,23 @@ values are: * `Write.CreateDisposition.CREATE_NEVER`: Specifies that a table should never be created. If the destination table does not exist, the write operation fails. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} Use the `create_disposition` parameter to specify the create disposition. 
Valid enum values are: +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" wrap="span" >}} * `BigQueryDisposition.CREATE_IF_NEEDED`: Specifies that the write operation should create a new table if one does not exist. If you use this value, you must provide a table schema. `CREATE_IF_NEEDED` is the default behavior. * `BigQueryDisposition.CREATE_NEVER`: Specifies that a table should never be created. If the destination table does not exist, the write operation fails. - +{{< /paragraph >}} If you specify `CREATE_IF_NEEDED` as the create disposition and you don't supply a table schema, the transform might fail at runtime if the destination table does @@ -430,11 +449,12 @@ existing table. -{:.language-java} +{{< paragraph class="language-java" >}} Use `.withWriteDisposition` to specify the write disposition. Valid enum values are: +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" wrap="span" >}} * `Write.WriteDisposition.WRITE_EMPTY`: Specifies that the write operation should fail at runtime if the destination table is not empty. `WRITE_EMPTY` is the default behavior. @@ -445,14 +465,16 @@ are: * `Write.WriteDisposition.WRITE_APPEND`: Specifies that the write operation should append the rows to the end of the existing table. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} Use the `write_disposition` parameter to specify the write disposition. Valid enum values are: +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" wrap="span" >}} * `BigQueryDisposition.WRITE_EMPTY`: Specifies that the write operation should fail at runtime if the destination table is not empty. `WRITE_EMPTY` is the default behavior. @@ -463,7 +485,7 @@ enum values are: * `BigQueryDisposition.WRITE_APPEND`: Specifies that the write operation should append the rows to the end of the existing table. - +{{< /paragraph >}} When you use `WRITE_EMPTY`, the check for whether or not the destination table is empty can occur before the actual write operation. This check doesn't @@ -478,16 +500,17 @@ fail later when the write attempts happen. If your BigQuery write operation creates a new table, you must provide schema information. The schema contains information about each field in the table. -{:.language-java} +{{< paragraph class="language-java" >}} To create a table schema in Java, you can either use a `TableSchema` object, or use a string that contains a JSON-serialized `TableSchema` object. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} To create a table schema in Python, you can either use a `TableSchema` object, or use a string that defines a list of fields. Single string based schemas do not support nested fields, repeated fields, or specifying a BigQuery mode for fields (the mode will always be set to `NULLABLE`). - +{{< /paragraph >}} #### Using a TableSchema @@ -495,7 +518,7 @@ To create and use a table schema as a `TableSchema` object, follow these steps. -{:.language-java} +{{< paragraph class="language-java" wrap="span" >}} 1. Create a list of `TableFieldSchema` objects. Each `TableFieldSchema` object represents a field in the table. @@ -504,79 +527,84 @@ To create and use a table schema as a `TableSchema` object, follow these steps. 3. Use the `withSchema` method to provide your table schema when you apply a write transform. - +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" wrap="span" >}} 1. Create a `TableSchema` object. 2. 
Create and append a `TableFieldSchema` object for each field in your table. 3. Next, use the `schema` parameter to provide your table schema when you apply a write transform. Set the parameter’s value to the `TableSchema` object. - +{{< /paragraph >}} The following example code shows how to create a `TableSchema` for a table with two fields (source and quote) of type string. -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:BigQuerySchemaObject -%}``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_bigqueryio_schema_object -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" BigQuerySchemaObject >}} +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_bigqueryio_schema_object >}} +{{< /highlight >}} #### Using a string -{:.language-java} +{{< paragraph class="language-java" >}} To create and use a table schema as a string that contains JSON-serialized `TableSchema` object, follow these steps. +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" wrap="span" >}} 1. Create a string that contains a JSON-serialized `TableSchema` object. 2. Use the `withJsonSchema` method to provide your table schema when you apply a write transform. - +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} To create and use a table schema as a string, follow these steps. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" wrap="span" >}} 1. Create a single comma separated string of the form "field1:type1,field2:type2,field3:type3" that defines a list of fields. The type should specify the field’s BigQuery type. 2. Use the `schema` parameter to provide your table schema when you apply a write transform. Set the parameter’s value to the string. - +{{< /paragraph >}} The following example shows how to use a string to specify the same table schema as the previous example. -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:BigQuerySchemaJson -%}``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_bigqueryio_schema -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" BigQuerySchemaJson >}} +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_bigqueryio_schema >}} +{{< /highlight >}} ### Setting the insertion method -{:.language-py} +{{< paragraph class="language-py" >}} > The Beam SDK for Python does not currently support specifying the insertion method. +{{< /paragraph >}} BigQueryIO supports two methods of inserting data into BigQuery: load jobs and streaming inserts. Each insertion method provides different tradeoffs of cost, @@ -587,112 +615,128 @@ for more information about these tradeoffs. BigQueryIO chooses a default insertion method based on the input `PCollection`. -{:.language-py} +{{< paragraph class="language-py" >}} BigQueryIO uses load jobs when you apply a BigQueryIO write transform to a bounded `PCollection`. 
+{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} BigQueryIO uses load jobs in the following situations: +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" wrap="span" >}} * When you apply a BigQueryIO write transform to a bounded `PCollection`. * When you apply a BigQueryIO write transform to an unbounded `PCollection` and use `BigQueryIO.write().withTriggeringFrequency()` to set the triggering frequency. * When you specify load jobs as the insertion method using `BigQueryIO.write().withMethod(FILE_LOADS)`. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} BigQueryIO uses streaming inserts when you apply a BigQueryIO write transform to an unbounded `PCollection`. +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} BigQueryIO uses streaming inserts in the following situations: +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" wrap="span" >}} * When you apply a BigQueryIO write transform to an unbounded `PCollection` and do not set the triggering frequency. * When you specify streaming inserts as the insertion method using `BigQueryIO.write().withMethod(STREAMING_INSERTS)`. +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} You can use `withMethod` to specify the desired insertion method. See -[Write.Method](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.Write.Method.html) +[Write.Method](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.Write.Method.html) for the list of the available methods and their restrictions. +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} ***Note:*** If you use batch loads in a streaming pipeline, you must use `withTriggeringFrequency` to specify a triggering frequency and `withNumFileShards` to specify number of file shards written. - +{{< /paragraph >}} ### Writing to a table -{:.language-java} +{{< paragraph class="language-java" >}} To write to a BigQuery table, apply either a `writeTableRows` or `write` transform. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} To write to a BigQuery table, apply the `WriteToBigQuery` transform. `WriteToBigQuery` supports both batch mode and streaming mode. You must apply the transform to a `PCollection` of dictionaries. In general, you'll need to use another transform, such as `ParDo`, to format your output data into a collection. +{{< /paragraph >}} The following examples use this `PCollection` that contains quotes. -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:BigQueryWriteInput -%}``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_bigqueryio_write_input -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" BigQueryWriteInput >}} +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_bigqueryio_write_input >}} +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} The `writeTableRows` method writes a `PCollection` of BigQuery `TableRow` objects to a BigQuery table. 
Each element in the `PCollection` represents a single row in the table. This example uses `writeTableRows` to write quotes to a `PCollection`. The write operation creates a table if needed; if the table already exists, it will be replaced. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} The following example code shows how to apply a `WriteToBigQuery` transform to write a `PCollection` of dictionaries to a BigQuery table. The write operation creates a table if needed; if the table already exists, it will be replaced. +{{< /paragraph >}} -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:BigQueryWriteTable -%}``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_bigqueryio_write -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" BigQueryWriteTable >}} +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_bigqueryio_write >}} +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} The `write` transform writes a `PCollection` of custom typed objects to a BigQuery table. Use `.withFormatFunction(SerializableFunction)` to provide a formatting function that converts each input element in the `PCollection` into a `TableRow`. This example uses `write` to write a `PCollection`. The write operation creates a table if needed; if the table already exists, it will be replaced. +{{< /paragraph >}} -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:BigQueryWriteFunction -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" BigQueryWriteFunction >}} +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} When you use streaming inserts, you can decide what to do with failed records. You can either keep retrying, or return the failed records in a separate `PCollection` using the `WriteResult.getFailedInserts()` method. +{{< /paragraph >}} ### Using dynamic destinations -{:.language-py} +{{< paragraph class="language-py" >}} > The Beam SDK for Python does not currently support dynamic destinations. +{{< /paragraph >}} You can use the dynamic destinations feature to write elements in a `PCollection` to different BigQuery tables, possibly with different schemas. @@ -706,11 +750,12 @@ In addition, you can also write your own types that have a mapping function to -{:.language-java} +{{< paragraph class="language-java" >}} To use dynamic destinations, you must create a `DynamicDestinations` object and implement the following methods: +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" wrap="span" >}} * `getDestination`: Returns an object that `getTable` and `getSchema` can use as the destination key to compute the destination table and/or schema. @@ -720,23 +765,27 @@ implement the following methods: * `getSchema`: Returns the table schema (as a `TableSchema` object) for the destination key. +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} Then, use `write().to` with your `DynamicDestinations` object. 
This example uses a `PCollection` that contains weather data and writes the data into a different table for each year. +{{< /paragraph >}} + +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" BigQueryWriteDynamicDestinations >}} +{{< /highlight >}} -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:BigQueryWriteDynamicDestinations -%}``` -```py +{{< highlight py >}} # The Beam SDK for Python does not currently support dynamic destinations. -``` +{{< /highlight >}} ### Using time partitioning -{:.language-py} +{{< paragraph class="language-py" >}} > The Beam SDK for Python does not currently support time partitioning. +{{< /paragraph >}} BigQuery time partitioning divides your table into smaller partitions, which is called a [partitioned table](https://cloud.google.com/bigquery/docs/partitioned-tables). @@ -744,25 +793,29 @@ Partitioned tables make it easier for you to manage and query your data. -{:.language-java} +{{< paragraph class="language-java" >}} To use BigQuery time partitioning, use one of these two methods: +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" wrap="span" >}} * `withTimePartitioning`: This method takes a `TimePartitioning` class, and is only usable if you are writing to a single table. * `withJsonTimePartitioning`: This method is the same as `withTimePartitioning`, but takes a JSON-serialized String object. +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} This example generates one partition per day. +{{< /paragraph >}} + +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" BigQueryTimePartitioning >}} +{{< /highlight >}} -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:BigQueryTimePartitioning -%}``` -```py +{{< highlight py >}} # The Beam SDK for Python does not currently support time partitioning. -``` +{{< /highlight >}} ## Limitations diff --git a/website/src/documentation/io/built-in-hadoop.md b/website/www/site/content/en/documentation/io/built-in/hadoop.md similarity index 90% rename from website/src/documentation/io/built-in-hadoop.md rename to website/www/site/content/en/documentation/io/built-in/hadoop.md index 09fcd7fc1341..89282d209cc5 100644 --- a/website/src/documentation/io/built-in-hadoop.md +++ b/website/www/site/content/en/documentation/io/built-in/hadoop.md @@ -1,8 +1,5 @@ --- -layout: section title: "Apache Hadoop Input/Output Format IO" -section_menu: section-menu/documentation.html -permalink: /documentation/io/built-in/hadoop/ --- -[Built-in I/O Transforms]({{site.baseurl}}/documentation/io/built-in/) +[Built-in I/O Transforms](/documentation/io/built-in/) # Apache Parquet I/O connector - +{{< language-switcher java py >}} The Beam SDKs include built-in transforms that can read data from and write data to [Apache Parquet](https://parquet.apache.org) files. @@ -37,63 +28,72 @@ to [Apache Parquet](https://parquet.apache.org) files. -{:.language-java} +{{< paragraph class="language-java" >}} To use ParquetIO, add the Maven artifact dependency to your `pom.xml` file. 
+{{< /paragraph >}} -```java +{{< highlight java >}} org.apache.beam beam-sdks-java-io-parquet - {{ site.release_latest }} + {{< param release_latest >}} -``` +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} Additional resources: +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" wrap="span" >}} * [ParquetIO source code](https://github.com/apache/beam/blob/master/sdks/java/io/parquet/src/main/java/org/apache/beam/sdk/io/parquet/ParquetIO.java) -* [ParquetIO Javadoc](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/org/apache/beam/sdk/io/parquet/ParquetIO.html) +* [ParquetIO Javadoc](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/org/apache/beam/sdk/io/parquet/ParquetIO.html) +{{< /paragraph >}} -{:.language-py} -ParquetIO comes preinstalled with the Apache Beam python sdk.. +{{< paragraph class="language-py" >}} +ParquetIO comes preinstalled with the Apache Beam python sdk..{{< param release_latest >}} +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} Additional resources: +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" wrap="span" >}} * [ParquetIO source code](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/io/parquetio.py) -* [ParquetIO Pydoc](https://beam.apache.org/releases/pydoc/{{ site.release_latest }}/apache_beam.io.parquetio.html) +* [ParquetIO Pydoc](https://beam.apache.org/releases/pydoc/{{< param release_latest >}}/apache_beam.io.parquetio.html) +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} #### Using ParquetIO with Spark before 2.4 +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} `ParquetIO` depends on an API introduced in Apache Parquet 1.10.0. **Spark 2.4.x is compatible and no additional steps are necessary**. Older versions of Spark will not work out of the box since a pre-installed version of Parquet libraries will take precedence during execution. The following workaround should be applied. +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} > **Note**: The following technique allows you to execute your pipeline with `ParquetIO` correctly. > The Parquet files that are consumed or generated by this Beam connector should remain interoperable with the other tools on your cluster. +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} Include the Parquet artifact normally and ensure that it brings in the correct version of Parquet as a transitive dependency. +{{< /paragraph >}} -{:.language-java} -``` +{{< highlight java >}} org.apache.beam beam-sdks-java-io-parquet ${beam.version} -``` +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} Relocate the following packages: +{{< /paragraph >}} -{:.language-java} -``` +{{< highlight java >}} org.apache.maven.plugins maven-shade-plugin @@ -142,7 +142,8 @@ Relocate the following packages: -``` +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} This technique has been tested to work on Spark 2.2.3, Spark 2.3.3 and Spark 2.4.3 (although it is optional for Spark 2.4+). 
+{{< /paragraph >}} diff --git a/website/src/documentation/io/developing-io-java.md b/website/www/site/content/en/documentation/io/developing-io-java.md similarity index 96% rename from website/src/documentation/io/developing-io-java.md rename to website/www/site/content/en/documentation/io/developing-io-java.md index 388229c8b8a2..cda134543d59 100644 --- a/website/src/documentation/io/developing-io-java.md +++ b/website/www/site/content/en/documentation/io/developing-io-java.md @@ -1,9 +1,6 @@ --- -layout: section title: "Apache Beam: Developing I/O connectors for Java" -section_menu: section-menu/documentation.html -permalink: /documentation/io/developing-io-java/ -redirect_from: /documentation/io/authoring-java/ +aliases: /documentation/io/authoring-java/ --- -![This is a sequence diagram that shows the lifecycle of the Source]( - {{ "/images/source-sequence-diagram.svg" | prepend: site.baseurl }}) +![This is a sequence diagram that shows the lifecycle of the Source](/images/source-sequence-diagram.svg) ### Using ParDo and GroupByKey @@ -173,8 +169,8 @@ For **file-based sinks**, you can use the `FileBasedSink` abstraction that is provided by both the Java and Python SDKs. See our language specific implementation guides for more details: -* [Developing I/O connectors for Java]({{ site.baseurl }}/documentation/io/developing-io-java/) -* [Developing I/O connectors for Python]({{ site.baseurl }}/documentation/io/developing-io-python/) +* [Developing I/O connectors for Java](/documentation/io/developing-io-java/) +* [Developing I/O connectors for Python](/documentation/io/developing-io-python/) diff --git a/website/src/documentation/io/developing-io-python.md b/website/www/site/content/en/documentation/io/developing-io-python.md similarity index 90% rename from website/src/documentation/io/developing-io-python.md rename to website/www/site/content/en/documentation/io/developing-io-python.md index fdb2a7669053..3e581e01dd13 100644 --- a/website/src/documentation/io/developing-io-python.md +++ b/website/www/site/content/en/documentation/io/developing-io-python.md @@ -1,9 +1,6 @@ --- -layout: section title: "Apache Beam: Developing I/O connectors for Python" -section_menu: section-menu/documentation.html -permalink: /documentation/io/developing-io-python/ -redirect_from: +aliases: - /documentation/io/authoring-python/ - /documentation/sdks/python-custom-io/ --- @@ -26,13 +23,13 @@ To connect to a data store that isn’t supported by Beam’s existing I/O connectors, you must create a custom I/O connector that usually consist of a source and a sink. All Beam sources and sinks are composite transforms; however, the implementation of your custom I/O depends on your use case. Before you -start, read the [new I/O connector overview]({{ site.baseurl }}/documentation/io/developing-io-overview/) +start, read the [new I/O connector overview](/documentation/io/developing-io-overview/) for an overview of developing a new I/O connector, the available implementation options, and how to choose the right option for your use case. -This guide covers using the [Source and FileBasedSink interfaces](https://beam.apache.org/releases/pydoc/{{ site.release_latest }}/apache_beam.io.iobase.html) +This guide covers using the [Source and FileBasedSink interfaces](https://beam.apache.org/releases/pydoc/{{< param release_latest >}}/apache_beam.io.iobase.html) for Python. The Java SDK offers the same functionality, but uses a slightly -different API. 
See [Developing I/O connectors for Java]({{ site.baseurl }}/documentation/io/developing-io-java/) +different API. See [Developing I/O connectors for Java](/documentation/io/developing-io-java/) for information specific to the Java SDK. ## Basic code requirements {#basic-code-reqs} @@ -62,7 +59,7 @@ multiple worker instances in parallel. As such, the code you provide for methods available in the [source_test_utils module](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/io/source_test_utils.py) to develop tests for your source. -In addition, see the [PTransform style guide]({{ site.baseurl }}/contribute/ptransform-style-guide/) +In addition, see the [PTransform style guide](/contribute/ptransform-style-guide/) for Beam's transform style guidance. ## Implementing the Source interface @@ -83,7 +80,7 @@ Supply the logic for your new source by creating the following classes: a wrapper. You can find these classes in the -[apache_beam.io.iobase module](https://beam.apache.org/releases/pydoc/{{ site.release_latest }}/apache_beam.io.iobase.html). +[apache_beam.io.iobase module](https://beam.apache.org/releases/pydoc/{{< param release_latest >}}/apache_beam.io.iobase.html). ### Implementing the BoundedSource subclass @@ -185,13 +182,15 @@ See [AvroSource](https://github.com/apache/beam/blob/master/sdks/python/apache_b The following example, `CountingSource`, demonstrates an implementation of `BoundedSource` and uses the SDK-provided `RangeTracker` called `OffsetRangeTracker`. -``` -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_custom_source_new_source %}``` +{{< highlight >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_custom_source_new_source >}} +{{< /highlight >}} To read data from the source in your pipeline, use the `Read` transform: -``` -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_custom_source_use_new_source %}``` +{{< highlight >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_custom_source_use_new_source >}} +{{< /highlight >}} **Note:** When you create a source that end-users are going to use, we recommended that you do not expose the code for the source itself as @@ -202,10 +201,10 @@ exposing your sources, and walks through how to create a wrapper. ## Using the FileBasedSink abstraction -If your data source uses files, you can implement the [FileBasedSink](https://beam.apache.org/releases/pydoc/{{ site.release_latest }}/apache_beam.io.filebasedsink.html) +If your data source uses files, you can implement the [FileBasedSink](https://beam.apache.org/releases/pydoc/{{< param release_latest >}}/apache_beam.io.filebasedsink.html) abstraction to create a file-based sink. For other sinks, use `ParDo`, `GroupByKey`, and other transforms offered by the Beam SDK for Python. See the -[developing I/O connectors overview]({{ site.baseurl }}/documentation/io/developing-io-overview/) +[developing I/O connectors overview](/documentation/io/developing-io-overview/) for more details. When using the `FileBasedSink` interface, you must provide the format-specific @@ -254,7 +253,7 @@ users would need to add the reshard themselves (using the `GroupByKey` transform). To solve this, we recommended that you expose the source as a composite `PTransform` that performs both the read operation and the reshard. 
-See Beam’s [PTransform style guide]({{ site.baseurl }}/contribute/ptransform-style-guide/#exposing-a-ptransform-vs-something-else) +See Beam’s [PTransform style guide](/contribute/ptransform-style-guide/#exposing-a-ptransform-vs-something-else) for additional information about wrapping with a `PTransform`. The following examples change the source and sink from the above sections so @@ -262,20 +261,24 @@ that they are not exposed to end-users. For the source, rename `CountingSource` to `_CountingSource`. Then, create the wrapper `PTransform`, called `ReadFromCountingSource`: -``` -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_custom_source_new_ptransform %}``` +{{< highlight >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_custom_source_new_ptransform >}} +{{< /highlight >}} Finally, read from the source: -``` -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_custom_source_use_ptransform %}``` +{{< highlight >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_custom_source_use_ptransform >}} +{{< /highlight >}} For the sink, rename `SimpleKVSink` to `_SimpleKVSink`. Then, create the wrapper `PTransform`, called `WriteToKVSink`: -``` -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_custom_sink_new_ptransform %}``` +{{< highlight >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_custom_sink_new_ptransform >}} +{{< /highlight >}} Finally, write to the sink: -``` -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_custom_sink_use_ptransform %}``` +{{< highlight >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_custom_sink_use_ptransform >}} +{{< /highlight >}} diff --git a/website/src/documentation/io/testing.md b/website/www/site/content/en/documentation/io/testing.md similarity index 97% rename from website/src/documentation/io/testing.md rename to website/www/site/content/en/documentation/io/testing.md index 4e8b9b78c49b..e9b6f1dfa632 100644 --- a/website/src/documentation/io/testing.md +++ b/website/www/site/content/en/documentation/io/testing.md @@ -1,8 +1,5 @@ --- -layout: section title: "Testing I/O Transforms" -section_menu: section-menu/documentation.html -permalink: /documentation/io/testing/ --- diff --git a/website/src/documentation/patterns/custom-io.md b/website/www/site/content/en/documentation/patterns/custom-io.md similarity index 73% rename from website/src/documentation/patterns/custom-io.md rename to website/www/site/content/en/documentation/patterns/custom-io.md index d8160105e1ec..9e3e7fbd57ca 100644 --- a/website/src/documentation/patterns/custom-io.md +++ b/website/www/site/content/en/documentation/patterns/custom-io.md @@ -1,8 +1,5 @@ --- -layout: section title: "Custom I/O patterns" -section_menu: section-menu/documentation.html -permalink: /documentation/patterns/custom-io/ --- # Custom window patterns -The samples on this page demonstrate common custom window patterns. You can create custom windows with [`WindowFn` functions]({{ site.baseurl }}/documentation/programming-guide/#provided-windowing-functions). 
For more information, see the [programming guide section on windowing]({{ site.baseurl }}/documentation/programming-guide/#windowing). +The samples on this page demonstrate common custom window patterns. You can create custom windows with [`WindowFn` functions](/documentation/programming-guide/#provided-windowing-functions). For more information, see the [programming guide section on windowing](/documentation/programming-guide/#windowing). **Note**: Custom merging windows isn't supported in Python (with fnapi). @@ -29,10 +26,9 @@ You can modify the [`assignWindows`](https://beam.apache.org/releases/javadoc/cu Access the `assignWindows` function through `WindowFn.AssignContext.element()`. The original, fixed-duration `assignWindows` function is: -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:CustomSessionWindow1 -%} -``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" CustomSessionWindow1 >}} +{{< /highlight >}} ### Creating data-driven gaps To create data-driven gaps, add the following snippets to the `assignWindows` function: @@ -41,34 +37,30 @@ To create data-driven gaps, add the following snippets to the `assignWindows` fu For example, the following function assigns each element to a window between the timestamp and `gapDuration`: -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:CustomSessionWindow3 -%} -``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" CustomSessionWindow3 >}} +{{< /highlight >}} Then, set the `gapDuration` field in a windowing function: -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:CustomSessionWindow2 -%} -``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" CustomSessionWindow2 >}} +{{< /highlight >}} ### Windowing messages into sessions After creating data-driven gaps, you can window incoming data into the new, custom sessions. 
First, set the session length to the gap duration: -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:CustomSessionWindow4 -%} -``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" CustomSessionWindow4 >}} +{{< /highlight >}} Lastly, window data into sessions in your pipeline: -```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:CustomSessionWindow6 -%} -``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" CustomSessionWindow6 >}} +{{< /highlight >}} ### Example data and windows The following test data tallies two users' scores with and without the `gap` attribute: @@ -86,7 +78,7 @@ The following test data tallies two users' scores with and without the `gap` att The diagram below visualizes the test data: -![Two sets of data and the standard and dynamic sessions with which the data is windowed.]( {{ "/images/standard-vs-dynamic-sessions.png" | prepend: site.baseurl }}) +![Two sets of data and the standard and dynamic sessions with which the data is windowed.](/images/standard-vs-dynamic-sessions.png) #### Standard sessions diff --git a/website/src/documentation/patterns/file-processing.md b/website/www/site/content/en/documentation/patterns/file-processing.md similarity index 69% rename from website/src/documentation/patterns/file-processing.md rename to website/www/site/content/en/documentation/patterns/file-processing.md index 592a58b198d4..f119cd34da0a 100644 --- a/website/src/documentation/patterns/file-processing.md +++ b/website/www/site/content/en/documentation/patterns/file-processing.md @@ -1,8 +1,5 @@ --- -layout: section title: "File processing patterns" -section_menu: section-menu/documentation.html -permalink: /documentation/patterns/file-processing/ --- + # Create Your Pipeline -* TOC -{:toc} +{{< toc >}} Your Beam program expresses a data processing pipeline, from start to finish. This section explains the mechanics of using the classes in the Beam SDKs to build a pipeline. To construct a pipeline using the classes in the Beam SDKs, your program will need to perform the following general steps: @@ -36,15 +33,15 @@ A Beam program often starts by creating a `Pipeline` object. In the Beam SDKs, each pipeline is represented by an explicit object of type `Pipeline`. Each `Pipeline` object is an independent entity that encapsulates both the data the pipeline operates over and the transforms that get applied to that data. -To create a pipeline, declare a `Pipeline` object, and pass it some [configuration options]({{ site.baseurl }}/documentation/programming-guide#configuring-pipeline-options). +To create a pipeline, declare a `Pipeline` object, and pass it some [configuration options](/documentation/programming-guide#configuring-pipeline-options). -```java +{{< highlight java >}} // Start by defining the options for the pipeline. PipelineOptions options = PipelineOptionsFactory.create(); // Then create the pipeline. Pipeline p = Pipeline.create(options); -``` +{{< /highlight >}} ## Reading Data Into Your Pipeline @@ -54,24 +51,24 @@ There are two kinds of root transforms in the Beam SDKs: `Read` and `Create`. `R The following example code shows how to `apply` a `TextIO.Read` root transform to read data from a text file. 
The transform is applied to a `Pipeline` object `p`, and returns a pipeline data set in the form of a `PCollection`: -```java +{{< highlight java >}} PCollection lines = p.apply( "ReadLines", TextIO.read().from("gs://some/inputData.txt")); -``` +{{< /highlight >}} ## Applying Transforms to Process Pipeline Data -You can manipulate your data using the various [transforms]({{ site.baseurl }}/documentation/programming-guide/#transforms) provided in the Beam SDKs. To do this, you **apply** the transforms to your pipeline's `PCollection` by calling the `apply` method on each `PCollection` that you want to process and passing the desired transform object as an argument. +You can manipulate your data using the various [transforms](/documentation/programming-guide/#transforms) provided in the Beam SDKs. To do this, you **apply** the transforms to your pipeline's `PCollection` by calling the `apply` method on each `PCollection` that you want to process and passing the desired transform object as an argument. The following code shows how to `apply` a transform to a `PCollection` of strings. The transform is a user-defined custom transform that reverses the contents of each string and outputs a new `PCollection` containing the reversed strings. The input is a `PCollection` called `words`; the code passes an instance of a `PTransform` object called `ReverseWords` to `apply`, and saves the return value as the `PCollection` called `reversedWords`. -```java +{{< highlight java >}} PCollection words = ...; PCollection reversedWords = words.apply(new ReverseWords()); -``` +{{< /highlight >}} ## Writing or Outputting Your Final Pipeline Data @@ -79,27 +76,27 @@ Once your pipeline has applied all of its transforms, you'll usually need to out The following example code shows how to `apply` a `TextIO.Write` transform to write a `PCollection` of `String` to a text file: -```java +{{< highlight java >}} PCollection filteredWords = ...; filteredWords.apply("WriteMyFile", TextIO.write().to("gs://some/outputData.txt")); -``` +{{< /highlight >}} ## Running Your Pipeline Once you have constructed your pipeline, use the `run` method to execute the pipeline. Pipelines are executed asynchronously: the program you create sends a specification for your pipeline to a **pipeline runner**, which then constructs and runs the actual series of pipeline operations. -```java +{{< highlight java >}} p.run(); -``` +{{< /highlight >}} The `run` method is asynchronous. If you'd like a blocking execution instead, run your pipeline appending the `waitUntilFinish` method: -```java +{{< highlight java >}} p.run().waitUntilFinish(); -``` +{{< /highlight >}} ## What's next -* [Programming Guide]({{ site.baseurl }}/documentation/programming-guide) - Learn the details of creating your pipeline, configuring pipeline options, and applying transforms. -* [Test your pipeline]({{ site.baseurl }}/documentation/pipelines/test-your-pipeline). +* [Programming Guide](/documentation/programming-guide) - Learn the details of creating your pipeline, configuring pipeline options, and applying transforms. +* [Test your pipeline](/documentation/pipelines/test-your-pipeline). 
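As a recap, the individual steps described above combine into something like the following minimal end-to-end sketch; the upper-casing transform and output path are illustrative additions, not part of the original page:

{{< highlight java >}}
// Illustrative sketch combining the steps above: create, read, transform, write, run.
PipelineOptions options = PipelineOptionsFactory.create();
Pipeline p = Pipeline.create(options);

p.apply("ReadLines", TextIO.read().from("gs://some/inputData.txt"))
 .apply("UppercaseLines",
     MapElements.into(TypeDescriptors.strings())
         .via((String line) -> line.toUpperCase()))
 .apply("WriteLines", TextIO.write().to("gs://some/outputData"));

p.run().waitUntilFinish();
{{< /highlight >}}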
diff --git a/website/src/documentation/pipelines/design-your-pipeline.md b/website/www/site/content/en/documentation/pipelines/design-your-pipeline.md similarity index 87% rename from website/src/documentation/pipelines/design-your-pipeline.md rename to website/www/site/content/en/documentation/pipelines/design-your-pipeline.md index 5c4c28efb618..6c65efdf1852 100644 --- a/website/src/documentation/pipelines/design-your-pipeline.md +++ b/website/www/site/content/en/documentation/pipelines/design-your-pipeline.md @@ -1,8 +1,5 @@ --- -layout: section title: "Design Your Pipeline" -section_menu: section-menu/documentation.html -permalink: /documentation/pipelines/design-your-pipeline/ --- # Design Your Pipeline -* TOC -{:toc} +{{< toc >}} This page helps you design your Apache Beam pipeline. It includes information about how to determine your pipeline's structure, how to choose which transforms to apply to your data, and how to determine your input and output methods. -Before reading this section, it is recommended that you become familiar with the information in the [Beam programming guide]({{ site.baseurl }}/documentation/programming-guide). +Before reading this section, it is recommended that you become familiar with the information in the [Beam programming guide](/documentation/programming-guide). ## What to consider when designing your pipeline @@ -32,7 +28,7 @@ When designing your Beam pipeline, consider a few basic questions: * **Where is your input data stored?** How many sets of input data do you have? This will determine what kinds of `Read` transforms you'll need to apply at the start of your pipeline. * **What does your data look like?** It might be plaintext, formatted log files, or rows in a database table. Some Beam transforms work exclusively on `PCollection`s of key/value pairs; you'll need to determine if and how your data is keyed and how to best represent that in your pipeline's `PCollection`(s). -* **What do you want to do with your data?** The core transforms in the Beam SDKs are general purpose. Knowing how you need to change or manipulate your data will determine how you build core transforms like [ParDo]({{ site.baseurl }}/documentation/programming-guide/#pardo), or when you use pre-written transforms included with the Beam SDKs. +* **What do you want to do with your data?** The core transforms in the Beam SDKs are general purpose. Knowing how you need to change or manipulate your data will determine how you build core transforms like [ParDo](/documentation/programming-guide/#pardo), or when you use pre-written transforms included with the Beam SDKs. * **What does your output data look like, and where should it go?** This will determine what kinds of `Write` transforms you'll need to apply at the end of your pipeline. ## A basic pipeline @@ -40,8 +36,7 @@ When designing your Beam pipeline, consider a few basic questions: The simplest pipelines represent a linear flow of operations, as shown in figure 1. ![A linear pipeline starts with one input collection, sequentially applies - three transforms, and ends with one output collection.]( - {{ "/images/design-your-pipeline-linear.svg" | prepend: site.baseurl }}) + three transforms, and ends with one output collection.](/images/design-your-pipeline-linear.svg) *Figure 1: A linear pipeline.* @@ -58,15 +53,14 @@ You can use the same `PCollection` as input for multiple transforms without cons The pipeline in figure 2 is a branching pipeline. 
The pipeline reads its input (first names represented as strings) from a database table and creates a `PCollection` of table rows. Then, the pipeline applies multiple transforms to the **same** `PCollection`. Transform A extracts all the names in that `PCollection` that start with the letter 'A', and Transform B extracts all the names in that `PCollection` that start with the letter 'B'. Both transforms A and B have the same input `PCollection`. ![The pipeline applies two transforms to a single input collection. Each - transform produces an output collection.]( - {{ "/images/design-your-pipeline-multiple-pcollections.svg" | prepend: site.baseurl }}) + transform produces an output collection.](/images/design-your-pipeline-multiple-pcollections.svg) *Figure 2: A branching pipeline. Two transforms are applied to a single PCollection of database table rows.* The following example code applies two transforms to a single input collection. -```java +{{< highlight java >}} PCollection dbRowCollection = ...; PCollection aCollection = dbRowCollection.apply("aTrans", ParDo.of(new DoFn(){ @@ -86,16 +80,15 @@ PCollection bCollection = dbRowCollection.apply("bTrans", ParDo.of(new D } } })); -``` +{{< /highlight >}} ### A single transform that produces multiple outputs -Another way to branch a pipeline is to have a **single** transform output to multiple `PCollection`s by using [tagged outputs]({{ site.baseurl }}/documentation/programming-guide/#additional-outputs). Transforms that produce more than one output process each element of the input once, and output to zero or more `PCollection`s. +Another way to branch a pipeline is to have a **single** transform output to multiple `PCollection`s by using [tagged outputs](/documentation/programming-guide/#additional-outputs). Transforms that produce more than one output process each element of the input once, and output to zero or more `PCollection`s. Figure 3 illustrates the same example described above, but with one transform that produces multiple outputs. Names that start with 'A' are added to the main output `PCollection`, and names that start with 'B' are added to an additional output `PCollection`. -![The pipeline applies one transform that produces multiple output collections.]( - {{ "/images/design-your-pipeline-additional-outputs.svg" | prepend: site.baseurl }}) +![The pipeline applies one transform that produces multiple output collections.](/images/design-your-pipeline-additional-outputs.svg) *Figure 3: A pipeline with a transform that outputs multiple PCollections.* @@ -121,7 +114,7 @@ where each element in the input `PCollection` is processed once. The following example code applies one transform that processes each element once and outputs two collections. -```java +{{< highlight java >}} // Define two TupleTags, one for each output. final TupleTag startsWithATag = new TupleTag(){}; final TupleTag startsWithBTag = new TupleTag(){}; @@ -151,7 +144,7 @@ mixedCollection.get(startsWithATag).apply(...); // Get subset of the output with tag startsWithBTag. mixedCollection.get(startsWithBTag).apply(...); -``` +{{< /highlight >}} You can use either mechanism to produce multiple output `PCollection`s. However, using additional outputs makes more sense if the transform's computation per element is time-consuming. @@ -170,14 +163,13 @@ single `PCollection` that now contains all names that begin with either 'A' or 'B'. Here, it makes sense to use `Flatten` because the `PCollection`s being merged both contain the same type. 
-![The pipeline merges two collections into one collection with the Flatten transform.]( - {{ "/images/design-your-pipeline-flatten.svg" | prepend: site.baseurl }}) +![The pipeline merges two collections into one collection with the Flatten transform.](/images/design-your-pipeline-flatten.svg) *Figure 4: A pipeline that merges two collections into one collection with the Flatten transform.* The following example code applies `Flatten` to merge two collections. -```java +{{< highlight java >}} //merge the two PCollections with Flatten PCollectionList collectionList = PCollectionList.of(aCollection).and(bCollection); PCollection mergedCollectionWithFlatten = collectionList @@ -185,20 +177,19 @@ PCollection mergedCollectionWithFlatten = collectionList // continue with the new merged PCollection mergedCollectionWithFlatten.apply(...); -``` +{{< /highlight >}} ## Multiple sources Your pipeline can read its input from one or more sources. If your pipeline reads from multiple sources and the data from those sources is related, it can be useful to join the inputs together. In the example illustrated in figure 5 below, the pipeline reads names and addresses from a database table, and names and order numbers from a Kafka topic. The pipeline then uses `CoGroupByKey` to join this information, where the key is the name; the resulting `PCollection` contains all the combinations of names, addresses, and orders. -![The pipeline joins two input collections into one collection with the Join transform.]( - {{ "/images/design-your-pipeline-join.svg" | prepend: site.baseurl }}) +![The pipeline joins two input collections into one collection with the Join transform.](/images/design-your-pipeline-join.svg) *Figure 5: A pipeline that does a relational join of two input collections.* The following example code applies `Join` to join two input collections. -```java +{{< highlight java >}} PCollection> userAddress = pipeline.apply(JdbcIO.>read()...); PCollection> userOrder = pipeline.apply(KafkaIO.read()...); @@ -213,9 +204,9 @@ PCollection> joinedCollection = .apply(CoGroupByKey.create()); joinedCollection.apply(...); -``` +{{< /highlight >}} ## What's next -* [Create your own pipeline]({{ site.baseurl }}/documentation/pipelines/create-your-pipeline). -* [Test your pipeline]({{ site.baseurl }}/documentation/pipelines/test-your-pipeline). +* [Create your own pipeline](/documentation/pipelines/create-your-pipeline). +* [Test your pipeline](/documentation/pipelines/test-your-pipeline). diff --git a/website/src/documentation/pipelines/test-your-pipeline.md b/website/www/site/content/en/documentation/pipelines/test-your-pipeline.md similarity index 85% rename from website/src/documentation/pipelines/test-your-pipeline.md rename to website/www/site/content/en/documentation/pipelines/test-your-pipeline.md index 460416f81dfa..0fdab53d70db 100644 --- a/website/src/documentation/pipelines/test-your-pipeline.md +++ b/website/www/site/content/en/documentation/pipelines/test-your-pipeline.md @@ -1,8 +1,5 @@ --- -layout: section title: "Test Your Pipeline" -section_menu: section-menu/documentation.html -permalink: /documentation/pipelines/test-your-pipeline/ --- # Test Your Pipeline -* TOC -{:toc} +{{< toc >}} Testing your pipeline is a particularly important step in developing an effective data processing solution. The indirect nature of the Beam model, in which your user code constructs a pipeline graph to be executed remotely, can make debugging-failed runs a non-trivial task. 
Often it is faster and simpler to perform local unit testing on your pipeline code than to debug a pipeline's remote execution. Before running your pipeline on the runner of your choice, unit testing your pipeline code locally is often the best way to identify and fix bugs in your pipeline code. Unit testing your pipeline locally also allows you to use your familiar/favorite local debugging tools. -You can use [DirectRunner]({{ site.baseurl }}/documentation/runners/direct), a local runner helpful for testing and local development. +You can use [DirectRunner](/documentation/runners/direct), a local runner helpful for testing and local development. After you test your pipeline using the `DirectRunner`, you can use the runner of your choice to test on a small scale. For example, use the Flink runner with a local or remote Flink cluster. - - - - - The Beam SDKs provide a number of ways to unit test your pipeline code, from the lowest to the highest levels. From the lowest to the highest level, these are: -* You can test the individual function objects, such as [DoFn]({{ site.baseurl }}/documentation/programming-guide/#pardo)s, inside your pipeline's core transforms. -* You can test an entire [Composite Transform]({{ site.baseurl }}/documentation/programming-guide/#composite-transforms) as a unit. +* You can test the individual function objects, such as [DoFn](/documentation/programming-guide/#pardo)s, inside your pipeline's core transforms. +* You can test an entire [Composite Transform](/documentation/programming-guide/#composite-transforms) as a unit. * You can perform an end-to-end test for an entire pipeline. To support unit testing, the Beam SDK for Java provides a number of test classes in the [testing package](https://github.com/apache/beam/tree/master/sdks/java/core/src/test/java/org/apache/beam/sdk). You can use these tests as references and guides. @@ -52,7 +43,7 @@ The Beam SDK for Java provides a convenient way to test an individual `DoFn` cal `DoFnTester`uses the [JUnit](https://junit.org) framework. To use `DoFnTester`, you'll need to do the following: 1. Create a `DoFnTester`. You'll need to pass an instance of the `DoFn` you want to test to the static factory method for `DoFnTester`. -2. Create one or more main test inputs of the appropriate type for your `DoFn`. If your `DoFn` takes side inputs and/or produces [multiple outputs]({{ site.baseurl }}/documentation/programming-guide#additional-outputs), you should also create the side inputs and the output tags. +2. Create one or more main test inputs of the appropriate type for your `DoFn`. If your `DoFn` takes side inputs and/or produces [multiple outputs](/documentation/programming-guide#additional-outputs), you should also create the side inputs and the output tags. 3. Call `DoFnTester.processBundle` to process the main inputs. 4. Use JUnit's `Assert.assertThat` method to ensure the test outputs returned from `processBundle` match your expected values. @@ -60,30 +51,30 @@ The Beam SDK for Java provides a convenient way to test an individual `DoFn` cal To create a `DoFnTester`, first create an instance of the `DoFn` you want to test. You then use that instance when you create a `DoFnTester` using the `.of()` static factory method: -```java +{{< highlight java >}} static class MyDoFn extends DoFn { ... } MyDoFn myDoFn = ...; DoFnTester fnTester = DoFnTester.of(myDoFn); -``` +{{< /highlight >}} ### Creating Test Inputs You'll need to create one or more test inputs for `DoFnTester` to send to your `DoFn`. 
To create test inputs, simply create one or more input variables of the same input type that your `DoFn` accepts. In the case above: -```java +{{< highlight java >}} static class MyDoFn extends DoFn { ... } MyDoFn myDoFn = ...; DoFnTester fnTester = DoFnTester.of(myDoFn); String testInput = "test1"; -``` +{{< /highlight >}} #### Side Inputs If your `DoFn` accepts side inputs, you can create those side inputs by using the method `DoFnTester.setSideInputs`. -```java +{{< highlight java >}} static class MyDoFn extends DoFn { ... } MyDoFn myDoFn = ...; DoFnTester fnTester = DoFnTester.of(myDoFn); @@ -91,9 +82,9 @@ DoFnTester fnTester = DoFnTester.of(myDoFn); PCollectionView> sideInput = ...; Iterable value = ...; fnTester.setSideInputInGlobalWindow(sideInput, value); -``` +{{< /highlight >}} -See the `ParDo` documentation on [side inputs]({{ site.baseurl }}/documentation/programming-guide/#side-inputs) for more information. +See the `ParDo` documentation on [side inputs](/documentation/programming-guide/#side-inputs) for more information. #### Additional Outputs @@ -106,7 +97,7 @@ Suppose your `DoFn` produces outputs of type `String` and `Integer`. You create `TupleTag` objects for each, and bundle them into a `TupleTagList`, then set it for the `DoFnTester` as follows: -```java +{{< highlight java >}} static class MyDoFn extends DoFn { ... } MyDoFn myDoFn = ...; DoFnTester fnTester = DoFnTester.of(myDoFn); @@ -116,9 +107,9 @@ TupleTag tag2 = ...; TupleTagList tags = TupleTagList.of(tag1).and(tag2); fnTester.setOutputTags(tags); -``` +{{< /highlight >}} -See the `ParDo` documentation on [additional outputs]({{ site.baseurl }}/documentation/programming-guide/#additional-outputs) for more information. +See the `ParDo` documentation on [additional outputs](/documentation/programming-guide/#additional-outputs) for more information. ### Processing Test Inputs and Checking Results @@ -126,18 +117,18 @@ To process the inputs (and thus run the test on your `DoFn`), you call the metho `DoFnTester.processBundle` returns a `List` of outputs—that is, objects of the same type as the `DoFn`'s specified output type. For a `DoFn`, `processBundle` returns a `List`: -```java +{{< highlight java >}} static class MyDoFn extends DoFn { ... } MyDoFn myDoFn = ...; DoFnTester fnTester = DoFnTester.of(myDoFn); String testInput = "test1"; List testOutputs = fnTester.processBundle(testInput); -``` +{{< /highlight >}} To check the results of `processBundle`, you use JUnit's `Assert.assertThat` method to test if the `List` of outputs contains the values you expect: -```java +{{< highlight java >}} String testInput = "test1"; List testOutputs = fnTester.processBundle(testInput); @@ -145,7 +136,7 @@ Assert.assertThat(testOutputs, Matchers.hasItems(...)); // Process a larger batch in a single step. Assert.assertThat(fnTester.processBundle("input1", "input2", "input3"), Matchers.hasItems(...)); -``` +{{< /highlight >}} ## Testing Composite Transforms @@ -163,22 +154,23 @@ To test a composite transform you've created, you can use the following pattern: You create a `TestPipeline` as follows: -```java +{{< highlight java >}} Pipeline p = TestPipeline.create(); -``` +{{< /highlight >}} -> **Note:** Read about testing unbounded pipelines in Beam in [this blog post]({{ site.baseurl }}/blog/2016/10/20/test-stream.html). +> **Note:** Read about testing unbounded pipelines in Beam in [this blog post](/blog/2016/10/20/test-stream.html). 
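For unbounded inputs, the `TestStream` class in the same testing package lets a test script the sequence of elements and watermark advances it wants to simulate. The following is a minimal, illustrative sketch only; the element values, timestamps, and surrounding setup are made up for this example, and the blog post above covers the mechanism in depth.

{{< highlight java >}}
// Illustrative sketch: model an unbounded input as a scripted sequence of
// elements and watermark updates using org.apache.beam.sdk.testing.TestStream.
TestStream<String> events =
    TestStream.create(StringUtf8Coder.of())
        .addElements("a", "b")
        .advanceWatermarkTo(new Instant(0L))
        .addElements("c")
        .advanceWatermarkToInfinity();

Pipeline p = TestPipeline.create();
PCollection<String> input = p.apply(events);
// Apply the transforms under test to `input`, assert on the result with PAssert,
// and then call p.run().
{{< /highlight >}}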
### Using the Create Transform -You can use the `Create` transform to create a `PCollection` out of a standard in-memory collection class, such as Java `List`. See [Creating a PCollection]({{ site.baseurl }}/documentation/programming-guide/#creating-a-pcollection) for more information. +You can use the `Create` transform to create a `PCollection` out of a standard in-memory collection class, such as Java `List`. See [Creating a PCollection](/documentation/programming-guide/#creating-a-pcollection) for more information. ### PAssert -[PAssert](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/index.html?org/apache/beam/sdk/testing/PAssert.html) is a class included in the Beam Java SDK that is an assertion on the contents of a `PCollection`. You can use `PAssert`to verify that a `PCollection` contains a specific set of expected elements. + +[PAssert](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/index.html?org/apache/beam/sdk/testing/PAssert.html) is a class included in the Beam Java SDK that is an assertion on the contents of a `PCollection`. You can use `PAssert`to verify that a `PCollection` contains a specific set of expected elements. For a given `PCollection`, you can use `PAssert` to verify the contents as follows: -```java +{{< highlight java >}} PCollection output = ...; // Check whether a PCollection contains some elements in any order. @@ -187,26 +179,26 @@ PAssert.that(output) "elem1", "elem3", "elem2"); -``` +{{< /highlight >}} Any code that uses `PAssert` must link in `JUnit` and `Hamcrest`. If you're using Maven, you can link in `Hamcrest` by adding the following dependency to your project's `pom.xml` file: -```java +{{< highlight java >}} org.hamcrest hamcrest-all 1.3 test -``` +{{< /highlight >}} -For more information on how these classes work, see the [org.apache.beam.sdk.testing](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/index.html?org/apache/beam/sdk/testing/package-summary.html) package documentation. +For more information on how these classes work, see the [org.apache.beam.sdk.testing](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/index.html?org/apache/beam/sdk/testing/package-summary.html) package documentation. ### An Example Test for a Composite Transform The following code shows a complete test for a composite transform. The test applies the `Count` transform to an input `PCollection` of `String` elements. The test uses the `Create` transform to create the input `PCollection` from a Java `List`. -```java +{{< highlight java >}} public class CountTest { // Our static input data, which will make up the initial PCollection. @@ -240,7 +232,7 @@ public void testCount() { // Run the pipeline. p.run(); } -``` +{{< /highlight >}} ## Testing a Pipeline End-to-End @@ -255,11 +247,11 @@ You can use the test classes in the Beam SDKs (such as `TestPipeline` and `PAsse ### Testing the WordCount Pipeline -The following example code shows how one might test the [WordCount example pipeline]({{ site.baseurl }}/get-started/wordcount-example/). `WordCount` usually reads lines from a text file for input data; instead, the test creates a Java `List` containing some text lines and uses a `Create` transform to create an initial `PCollection`. +The following example code shows how one might test the [WordCount example pipeline](/get-started/wordcount-example/). 
`WordCount` usually reads lines from a text file for input data; instead, the test creates a Java `List` containing some text lines and uses a `Create` transform to create an initial `PCollection`. `WordCount`'s final transform (from the composite transform `CountWords`) produces a `PCollection` of formatted word counts suitable for printing. Rather than write that `PCollection` to an output text file, our test pipeline uses `PAssert` to verify that the elements of the `PCollection` match those of a static `String` array containing our expected output data. -```java +{{< highlight java >}} public class WordCountTest { // Our static input data, which will comprise the initial PCollection. @@ -291,4 +283,4 @@ public class WordCountTest { p.run(); } } -``` +{{< /highlight >}} diff --git a/website/src/documentation/programming-guide.md b/website/www/site/content/en/documentation/programming-guide.md similarity index 90% rename from website/src/documentation/programming-guide.md rename to website/www/site/content/en/documentation/programming-guide.md index 9034eaa84370..c30578a0ec86 100644 --- a/website/src/documentation/programming-guide.md +++ b/website/www/site/content/en/documentation/programming-guide.md @@ -1,9 +1,6 @@ --- -layout: section title: "Beam Programming Guide" -section_menu: section-menu/documentation.html -permalink: /documentation/programming-guide/ -redirect_from: +aliases: - /learn/programming-guide/ - /docs/learn/programming-guide/ --- @@ -31,16 +28,11 @@ programmatically building your Beam pipeline. As the programming guide is filled out, the text will include code samples in multiple languages to help illustrate how to implement Beam concepts in your pipelines. - +{{< language-switcher java py >}} -{:.language-py} +{{< paragraph class="language-py" >}} The Python SDK supports Python 2.7, 3.5, 3.6, and 3.7. New Python SDK releases will stop supporting Python 2.7 in 2020 ([BEAM-8371](https://issues.apache.org/jira/browse/BEAM-8371)). For best results, use Beam with Python 3. +{{< /paragraph >}} ## 1. Overview {#overview} @@ -109,10 +101,10 @@ asynchronous "job" (or equivalent) on that back-end. The `Pipeline` abstraction encapsulates all the data and steps in your data processing task. Your Beam driver program typically starts by constructing a -[Pipeline](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/index.html?org/apache/beam/sdk/Pipeline.html) +[Pipeline](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/index.html?org/apache/beam/sdk/Pipeline.html) [Pipeline](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/pipeline.py) object, and then using that object as the basis for creating the pipeline's data -sets as `PCollection`s and its operations as `PTransform`s. +sets as `PCollection`s and its operations as `Transform`s. To use Beam, your driver program must first create an instance of the Beam SDK class `Pipeline` (typically in the `main()` function). When you create your @@ -121,21 +113,22 @@ your pipeline's configuration options programatically, but it's often easier to set the options ahead of time (or read them from the command line) and pass them to the `Pipeline` object when you create the object. -```java +{{< highlight java >}} // Start by defining the options for the pipeline. PipelineOptions options = PipelineOptionsFactory.create(); // Then create the pipeline. 
Pipeline p = Pipeline.create(options); -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:pipelines_constructing_creating -%} -``` -```go +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" pipelines_constructing_creating >}} +{{< /highlight >}} + +{{< highlight go >}} // In order to start creating the pipeline for execution, a Pipeline object and a Scope object are needed. p, s := beam.NewPipelineWithRoot() -``` +{{< /highlight >}} ### 2.1. Configuring pipeline options {#configuring-pipeline-options} @@ -158,18 +151,19 @@ you can use to set fields in `PipelineOptions` using command-line arguments. To read options from the command-line, construct your `PipelineOptions` object as demonstrated in the following example code: -```java +{{< highlight java >}} PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create(); -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:pipelines_constructing_creating -%} -``` -```go +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" pipelines_constructing_creating >}} +{{< /highlight >}} + +{{< highlight go >}} // If beamx or Go flags are used, flags must be parsed first. flag.Parse() -``` +{{< /highlight >}} This interprets command-line arguments that follow the format: @@ -183,7 +177,7 @@ This interprets command-line arguments that follow the format: Building your `PipelineOptions` this way lets you specify any of the options as a command-line argument. -> **Note:** The [WordCount example pipeline]({{ site.baseurl }}/get-started/wordcount-example) +> **Note:** The [WordCount example pipeline](/get-started/wordcount-example) > demonstrates how to set pipeline options at runtime by using command-line > options. @@ -194,7 +188,7 @@ You can add your own custom options in addition to the standard setter methods for each option, as in the following example for adding `input` and `output` custom options: -```java +{{< highlight java >}} public interface MyOptions extends PipelineOptions { String getInput(); void setInput(String input); @@ -202,24 +196,25 @@ public interface MyOptions extends PipelineOptions { String getOutput(); void setOutput(String output); } -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:pipeline_options_define_custom -%} -``` -```go +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" pipeline_options_define_custom >}} +{{< /highlight >}} + +{{< highlight go >}} var ( input = flag.String("input", "", "") output = flag.String("output", "", "") ) -``` +{{< /highlight >}} You can also specify a description, which appears when a user passes `--help` as a command-line argument, and a default value. 
You set the description and default value using annotations, as follows: -```java +{{< highlight java >}} public interface MyOptions extends PipelineOptions { @Description("Input for the pipeline") @Default.String("gs://my-bucket/input") @@ -231,43 +226,45 @@ public interface MyOptions extends PipelineOptions { String getOutput(); void setOutput(String output); } -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:pipeline_options_define_custom_with_help_and_default -%} -``` -```go +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" pipeline_options_define_custom_with_help_and_default >}} +{{< /highlight >}} + +{{< highlight go >}} var ( input = flag.String("input", "gs://my-bucket/input", "Input for the pipeline") output = flag.String("output", "gs://my-bucket/output", "Output for the pipeline") ) -``` +{{< /highlight >}} - -{:.language-java} +{{< paragraph class="language-java" >}} It's recommended that you register your interface with `PipelineOptionsFactory` and then pass the interface when creating the `PipelineOptions` object. When you register your interface with `PipelineOptionsFactory`, the `--help` can find your custom options interface and add it to the output of the `--help` command. `PipelineOptionsFactory` will also validate that your custom options are compatible with all other registered options. +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} The following example code shows how to register your custom options interface with `PipelineOptionsFactory`: +{{< /paragraph >}} -```java +{{< highlight java >}} PipelineOptionsFactory.register(MyOptions.class); MyOptions options = PipelineOptionsFactory.fromArgs(args) .withValidation() .as(MyOptions.class); -``` +{{< /highlight >}} Now your pipeline can accept `--input=value` and `--output=value` as command-line arguments. ## 3. PCollections {#pcollections} -The [PCollection](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/index.html?org/apache/beam/sdk/values/PCollection.html) +The [PCollection](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/index.html?org/apache/beam/sdk/values/PCollection.html) `PCollection` abstraction represents a potentially distributed, multi-element data set. You can think of a `PCollection` as "pipeline" data; Beam transforms use `PCollection` objects as @@ -305,7 +302,7 @@ would apply `TextIO.Read` `io.TextFileSource` to your `Pipeline` to create a `PCollection`: -```java +{{< highlight java >}} public static void main(String[] args) { // Create the pipeline. PipelineOptions options = @@ -316,39 +313,43 @@ public static void main(String[] args) { PCollection lines = p.apply( "ReadMyFile", TextIO.read().from("gs://some/inputData.txt")); } -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:pipelines_constructing_reading -%} -``` -```go +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" pipelines_constructing_reading >}} +{{< /highlight >}} + +{{< highlight go >}} lines := textio.Read(s, "gs://some/inputData.txt") -``` +{{< /highlight >}} See the [section on I/O](#pipeline-io) to learn more about how to read from the various data sources supported by the Beam SDK. #### 3.1.2. 
Creating a PCollection from in-memory data {#creating-pcollection-in-memory} -{:.language-java} +{{< paragraph class="language-java" >}} To create a `PCollection` from an in-memory Java `Collection`, you use the Beam-provided `Create` transform. Much like a data adapter's `Read`, you apply `Create` directly to your `Pipeline` object itself. +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} As parameters, `Create` accepts the Java `Collection` and a `Coder` object. The `Coder` specifies how the elements in the `Collection` should be [encoded](#element-type). +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} To create a `PCollection` from an in-memory `list`, you use the Beam-provided `Create` transform. Apply this transform directly to your `Pipeline` object itself. +{{< /paragraph >}} The following example code shows how to create a `PCollection` from an in-memory `List``list`: -```java +{{< highlight java >}} public static void main(String[] args) { // Create a Java Collection, in this case a List of Strings. final List LINES = Arrays.asList( @@ -365,11 +366,11 @@ public static void main(String[] args) { // Apply Create, passing the list and the coder, to create the PCollection. p.apply(Create.of(LINES)).setCoder(StringUtf8Coder.of()); } -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_pcollection -%} -``` +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_pcollection >}} +{{< /highlight >}} ### 3.2. PCollection characteristics {#pcollection-characteristics} @@ -392,7 +393,7 @@ specifying custom encodings as needed. In many cases, the element type in a `PCollection` has a structure that can introspected. Examples are JSON, Protocol Buffer, Avro, and database records. Schemas provide a way to express types as a set of named fields, allowing for more-expressive aggregations. - + #### 3.2.3. Immutability {#immutability} A `PCollection` is immutable. Once created, you cannot add, remove, or change @@ -490,12 +491,13 @@ slight difference: You apply the transform to the input `PCollection`, passing the transform itself as an argument, and the operation returns the output `PCollection`. This takes the general form: -```java +{{< highlight java >}} [Output PCollection] = [Input PCollection].apply([Transform]) -``` -```py +{{< /highlight >}} + +{{< highlight py >}} [Output PCollection] = [Input PCollection] | [Transform] -``` +{{< /highlight >}} Because Beam uses a generic `apply` method for `PCollection`, you can both chain transforms sequentially and also apply transforms that contain other transforms @@ -505,22 +507,22 @@ SDKs). How you apply your pipeline's transforms determines the structure of your pipeline. The best way to think of your pipeline is as a directed acyclic graph, where `PTransform` nodes are subroutines that accept `PCollection` nodes as inputs and emit `PCollection` nodes as outputs. 
For example, you can chain together transforms to create a pipeline that successively modifies input data: -```java +{{< highlight java >}} [Final Output PCollection] = [Initial Input PCollection].apply([First Transform]) .apply([Second Transform]) .apply([Third Transform]) -``` -```py +{{< /highlight >}} + +{{< highlight py >}} [Final Output PCollection] = ([Initial Input PCollection] | [First Transform] | [Second Transform] | [Third Transform]) -``` +{{< /highlight >}} The graph of this pipeline looks like the following: ![This linear pipeline starts with one input collection, sequentially applies - three transforms, and ends with one output collection.]( - {{ "/images/design-your-pipeline-linear.svg" | prepend: site.baseurl }}) + three transforms, and ends with one output collection.](/images/design-your-pipeline-linear.svg) *Figure 1: A linear pipeline with three sequential transforms.* @@ -529,22 +531,22 @@ collection--remember that a `PCollection` is immutable by definition. This means that you can apply multiple transforms to the same input `PCollection` to create a branching pipeline, like so: -```java +{{< highlight java >}} [PCollection of database table rows] = [Database Table Reader].apply([Read Transform]) [PCollection of 'A' names] = [PCollection of database table rows].apply([Transform A]) [PCollection of 'B' names] = [PCollection of database table rows].apply([Transform B]) -``` -```py +{{< /highlight >}} + +{{< highlight py >}} [PCollection of database table rows] = [Database Table Reader] | [Read Transform] [PCollection of 'A' names] = [PCollection of database table rows] | [Transform A] [PCollection of 'B' names] = [PCollection of database table rows] | [Transform B] -``` +{{< /highlight >}} The graph of this branching pipeline looks like the following: ![This pipeline applies two transforms to a single input collection. Each - transform produces an output collection.]( - {{ "/images/design-your-pipeline-multiple-pcollections.svg" | prepend: site.baseurl }}) + transform produces an output collection.](/images/design-your-pipeline-multiple-pcollections.svg) *Figure 2: A branching pipeline. Two transforms are applied to a single PCollection of database table rows.* @@ -610,7 +612,7 @@ Like all Beam transforms, you apply `ParDo` by calling the `apply` method on the input `PCollection` and passing `ParDo` as an argument, as shown in the following example code: -```java +{{< highlight java >}} // The input PCollection of Strings. PCollection words = ...; @@ -622,17 +624,18 @@ PCollection wordLengths = words.apply( ParDo .of(new ComputeWordLengthFn())); // The DoFn to perform on each element, which // we define above. -``` -```py +{{< /highlight >}} + +{{< highlight py >}} # The input PCollection of Strings. words = ... # The DoFn to perform on each element in the input PCollection. -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_pardo_pardo -%} -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_pardo_apply -%}``` -```go +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_pardo_pardo >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_pardo_apply >}} +{{< /highlight >}} + +{{< highlight go >}} // words is the input PCollection of strings var words beam.PCollection = ... 
@@ -641,7 +644,7 @@ func computeWordLengthFn(word string) int { } wordLengths := beam.ParDo(s, computeWordLengthFn, words) -``` +{{< /highlight >}} In the example, our input `PCollection` contains `String` values. We apply a `ParDo` transform that specifies a function (`ComputeWordLengthFn`) to compute @@ -659,19 +662,20 @@ define your pipeline's exact data processing tasks. > for writing user code for Beam transforms](#requirements-for-writing-user-code-for-beam-transforms) > and ensure that your code follows them. -{:.language-java} +{{< paragraph class="language-java" >}} A `DoFn` processes one element at a time from the input `PCollection`. When you create a subclass of `DoFn`, you'll need to provide type parameters that match the types of the input and output elements. If your `DoFn` processes incoming `String` elements and produces `Integer` elements for the output collection (like our previous example, `ComputeWordLengthFn`), your class declaration would look like this: +{{< /paragraph >}} -```java +{{< highlight java >}} static class ComputeWordLengthFn extends DoFn { ... } -``` +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} Inside your `DoFn` subclass, you'll write a method annotated with `@ProcessElement` where you provide the actual processing logic. You don't need to manually extract the elements from the input collection; the Beam SDKs handle @@ -682,16 +686,18 @@ provides a method for emitting elements. The parameter types must match the inpu and output types of your `DoFn` or the framework will raise an error. Note: @Element and OutputReceiver were introduced in Beam 2.5.0; if using an earlier release of Beam, a ProcessContext parameter should be used instead. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} Inside your `DoFn` subclass, you'll write a method `process` where you provide the actual processing logic. You don't need to manually extract the elements from the input collection; the Beam SDKs handle that for you. Your `process` method should accept an object of type `element`. This is the input element and output is emitted by using `yield` or `return` statement inside `process` method. +{{< /paragraph >}} -```java +{{< highlight java >}} static class ComputeWordLengthFn extends DoFn { @ProcessElement public void processElement(@Element String word, OutputReceiver out) { @@ -699,16 +705,17 @@ static class ComputeWordLengthFn extends DoFn { out.output(word.length()); } } -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_pardo_pardo -%} -``` +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_pardo_pardo >}} +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} > **Note:** If the elements in your input `PCollection` are key/value pairs, you > can access the key or value by using `element.getKey()` or > `element.getValue()`, respectively. +{{< /paragraph >}} A given `DoFn` instance generally gets invoked one or more times to process some arbitrary bundle of elements. However, Beam doesn't guarantee an exact number of @@ -722,12 +729,13 @@ requirements to ensure that Beam and the processing back-end can safely serialize and cache the values in your pipeline. 
Your method should meet the following requirements: -{:.language-java} +{{< paragraph class="language-java" >}} * You should not in any way modify an element returned by the `@Element` annotation or `ProcessContext.sideInput()` (the incoming elements from the input collection). * Once you output a value using `OutputReceiver.output()` you should not modify that value in any way. +{{< /paragraph >}} ##### 4.2.1.3. Lightweight DoFns and other abstractions {#lightweight-dofns} @@ -741,7 +749,7 @@ Here's the previous example, `ParDo` with `ComputeLengthWordsFn`, with the an anonymous inner class instance a lambda function: -```java +{{< highlight java >}} // The input PCollection. PCollection words = ...; @@ -755,23 +763,25 @@ PCollection wordLengths = words.apply( out.output(word.length()); } })); -``` -```py +{{< /highlight >}} + +{{< highlight py >}} # The input PCollection of strings. words = ... # Apply a lambda function to the PCollection words. # Save the result as the PCollection word_lengths. -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_pardo_using_flatmap -%}``` -```go +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_pardo_using_flatmap >}} +{{< /highlight >}} + +{{< highlight go >}} // words is the input PCollection of strings var words beam.PCollection = ... lengths := beam.ParDo(s, func (word string) int { return len(word) }, words) -``` +{{< /highlight >}} If your `ParDo` performs a one-to-one mapping of input elements to output elements--that is, for each input element, it applies a function that produces @@ -783,7 +793,7 @@ Java 8 lambda function for additional brevity. Here's the previous example using `MapElements` `Map`: -```java +{{< highlight java >}} // The input PCollection. PCollection words = ...; @@ -792,19 +802,21 @@ PCollection words = ...; PCollection wordLengths = words.apply( MapElements.into(TypeDescriptors.integers()) .via((String word) -> word.length())); -``` -```py +{{< /highlight >}} + +{{< highlight py >}} # The input PCollection of string. words = ... # Apply a Map with a lambda function to the PCollection words. # Save the result as the PCollection word_lengths. -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_pardo_using_map -%}``` +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_pardo_using_map >}} +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} > **Note:** You can use Java 8 lambda functions with several other Beam > transforms, including `Filter`, `FlatMapElements`, and `Partition`. +{{< /paragraph >}} ##### 4.2.1.4. DoFn lifecycle {#dofn} Here is a sequence diagram that shows the lifecycle of the DoFn during @@ -814,8 +826,7 @@ Here is a sequence diagram that shows the lifecycle of the DoFn during instance reuse. They also give instanciation use cases. -![This is a sequence diagram that shows the lifecycle of the DoFn]( - {{ "/images/dofn-sequence-diagram.svg" | prepend: site.baseurl }}) +![This is a sequence diagram that shows the lifecycle of the DoFn](/images/dofn-sequence-diagram.svg) #### 4.2.2. GroupByKey {#groupbykey} @@ -904,7 +915,7 @@ IllegalStateException error at pipeline construction time. `CoGroupByKey` performs a relational join of two or more key/value `PCollection`s that have the same key type. 
-[Design Your Pipeline]({{ site.baseurl }}/documentation/pipelines/design-your-pipeline/#multiple-sources) +[Design Your Pipeline](/documentation/pipelines/design-your-pipeline/#multiple-sources) shows an example pipeline that uses a join. Consider using `CoGroupByKey` if you have multiple data sets that provide @@ -954,46 +965,50 @@ The first set of data contains names and email addresses. The second set of data contains names and phone numbers. -```java -{% github_sample /apache/beam/blob/master/examples/java/src/test/java/org/apache/beam/examples/snippets/SnippetsTest.java tag:CoGroupByKeyTupleInputs -%}``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_group_by_key_cogroupbykey_tuple_inputs -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/test/java/org/apache/beam/examples/snippets/SnippetsTest.java" CoGroupByKeyTupleInputs >}} +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_group_by_key_cogroupbykey_tuple_inputs >}} +{{< /highlight >}} After `CoGroupByKey`, the resulting data contains all data associated with each unique key from any of the input collections. -```java -{% github_sample /apache/beam/blob/master/examples/java/src/test/java/org/apache/beam/examples/snippets/SnippetsTest.java tag:CoGroupByKeyTupleOutputs -%}``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_group_by_key_cogroupbykey_tuple_outputs -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/test/java/org/apache/beam/examples/snippets/SnippetsTest.java" CoGroupByKeyTupleOutputs >}} +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_group_by_key_cogroupbykey_tuple_outputs >}} +{{< /highlight >}} The following code example joins the two `PCollection`s with `CoGroupByKey`, followed by a `ParDo` to consume the result. Then, the code uses tags to look up and format data from each collection. 
-```java -{% github_sample /apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java tag:CoGroupByKeyTuple -%}``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_group_by_key_cogroupbykey_tuple -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/snippets/Snippets.java" CoGroupByKeyTuple >}} +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_group_by_key_cogroupbykey_tuple >}} +{{< /highlight >}} The formatted data looks like this: -```java -{% github_sample /apache/beam/blob/master/examples/java/src/test/java/org/apache/beam/examples/snippets/SnippetsTest.java tag:CoGroupByKeyTupleFormattedOutputs -%}``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_group_by_key_cogroupbykey_tuple_formatted_outputs -%}``` +{{< highlight java >}} +{{< github_sample "/apache/beam/blob/master/examples/java/src/test/java/org/apache/beam/examples/snippets/SnippetsTest.java" CoGroupByKeyTupleFormattedOutputs >}} +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_group_by_key_cogroupbykey_tuple_formatted_outputs >}} +{{< /highlight >}} #### 4.2.4. Combine {#combine} -[`Combine`](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/index.html?org/apache/beam/sdk/transforms/Combine.html) +[`Combine`](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/index.html?org/apache/beam/sdk/transforms/Combine.html) [`Combine`](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/transforms/core.py) is a Beam transform for combining collections of elements or values in your data. `Combine` has variants that work on entire `PCollection`s, and some that @@ -1018,7 +1033,7 @@ input/output type. The following example code shows a simple combine function. -```java +{{< highlight java >}} // Sum a collection of Integer values. The function SumInts implements the interface SerializableFunction. public static class SumInts implements SerializableFunction, Integer> { @Override @@ -1030,11 +1045,11 @@ public static class SumInts implements SerializableFunction, I return sum; } } -``` +{{< /highlight >}} -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:combine_bounded_sum -%}``` +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" combine_bounded_sum >}} +{{< /highlight >}} ##### 4.2.4.2. Advanced combinations using CombineFn {#advanced-combines} @@ -1070,7 +1085,7 @@ corresponding methods: The following example code shows how to define a `CombineFn` that computes a mean average: -```java +{{< highlight java >}} public class AverageFn extends CombineFn { public static class Accum { int sum = 0; @@ -1102,11 +1117,12 @@ public class AverageFn extends CombineFn { return ((double) accum.sum) / accum.count; } } -``` -```py +{{< /highlight >}} + +{{< highlight py >}} pc = ... 
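# The snippet referenced below defines a Python CombineFn analogous to the Java
# AverageFn above, with create_accumulator, add_input, merge_accumulators, and
# extract_output methods that together compute a mean.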
-{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:combine_custom_average_define -%}``` +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" combine_custom_average_define >}} +{{< /highlight >}} ##### 4.2.4.3. Combining a PCollection into a single value {#combining-pcollection} @@ -1116,20 +1132,21 @@ containing one element. The following example code shows how to apply the Beam provided sum combine function to produce a single sum value for a `PCollection` of integers. -```java +{{< highlight java >}} // Sum.SumIntegerFn() combines the elements in the input PCollection. The resulting PCollection, called sum, // contains one value: the sum of all the elements in the input PCollection. PCollection pc = ...; PCollection sum = pc.apply( Combine.globally(new Sum.SumIntegerFn())); -``` -```py +{{< /highlight >}} + +{{< highlight py >}} # sum combines the elements in the input PCollection. # The resulting PCollection, called result, contains one value: the sum of all # the elements in the input PCollection. pc = ... -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:combine_custom_average_execute -%}``` +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" combine_custom_average_execute >}} +{{< /highlight >}} ##### 4.2.4.4. Combine and global windowing {#combine-global-windowing} @@ -1144,15 +1161,16 @@ To have `Combine` instead return an empty `PCollection` if the input is empty, specify `.withoutDefaults` when you apply your `Combine` transform, as in the following code example: -```java +{{< highlight java >}} PCollection pc = ...; PCollection sum = pc.apply( Combine.globally(new Sum.SumIntegerFn()).withoutDefaults()); -``` -```py +{{< /highlight >}} + +{{< highlight py >}} pc = ... sum = pc | beam.CombineGlobally(sum).without_defaults() -``` +{{< /highlight >}} ##### 4.2.4.5. Combine and non-global windowing {#combine-non-global-windowing} @@ -1192,7 +1210,7 @@ create a single, merged value to be paired with each key. This pattern of a Beam's Combine PerKey transform. The combine function you supply to Combine PerKey must be an associative reduction function or a subclass of `CombineFn`. -```java +{{< highlight java >}} // PCollection is grouped by key and the Double values associated with each key are combined into a Double. PCollection> salesRecords = ...; PCollection> totalSalesPerPerson = @@ -1205,18 +1223,18 @@ PCollection> playerAccuracy = ...; PCollection> avgAccuracyPerPlayer = playerAccuracy.apply(Combine.perKey( new MeanInts()))); -``` -```py +{{< /highlight >}} + +{{< highlight py >}} # PCollection is grouped by key and the numeric values associated with each key # are averaged into a float. player_accuracies = ... -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:combine_per_key -%} -``` +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" combine_per_key >}} +{{< /highlight >}} #### 4.2.5. 
Flatten {#flatten} -[`Flatten`](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/index.html?org/apache/beam/sdk/transforms/Flatten.html) +[`Flatten`](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/index.html?org/apache/beam/sdk/transforms/Flatten.html) [`Flatten`](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/transforms/core.py) is a Beam transform for `PCollection` objects that store the same data type. `Flatten` merges multiple `PCollection` objects into a single logical @@ -1225,7 +1243,7 @@ is a Beam transform for `PCollection` objects that store the same data type. The following example shows how to apply a `Flatten` transform to merge multiple `PCollection` objects. -```java +{{< highlight java >}} // Flatten takes a PCollectionList of PCollection objects of a given type. // Returns a single PCollection that contains all of the elements in the PCollection objects in that list. PCollection pc1 = ...; @@ -1234,15 +1252,14 @@ PCollection pc3 = ...; PCollectionList collections = PCollectionList.of(pc1).and(pc2).and(pc3); PCollection merged = collections.apply(Flatten.pCollections()); -``` +{{< /highlight >}} + -```py +{{< highlight py >}} # Flatten takes a tuple of PCollection objects. # Returns a single PCollection that contains all of the elements in the PCollection objects in that tuple. -{% -github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_multiple_pcollections_flatten -%} -``` +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_multiple_pcollections_flatten >}} +{{< /highlight >}} ##### 4.2.5.1. Data encoding in merged collections {#data-encoding-merged-collections} @@ -1265,7 +1282,7 @@ pipeline is constructed. #### 4.2.6. Partition {#partition} -[`Partition`](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/index.html?org/apache/beam/sdk/transforms/Partition.html) +[`Partition`](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/index.html?org/apache/beam/sdk/transforms/Partition.html) [`Partition`](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/transforms/core.py) is a Beam transform for `PCollection` objects that store the same data type. `Partition` splits a single `PCollection` into a fixed number of smaller @@ -1283,7 +1300,7 @@ for instance). The following example divides a `PCollection` into percentile groups. -```java +{{< highlight java >}} // Provide an int value with the desired number of result partitions, and a PartitionFn that represents the // partitioning function. In this example, we define the PartitionFn in-line. Returns a PCollectionList // containing each of the resulting partitions as individual PCollection objects. @@ -1298,18 +1315,17 @@ PCollectionList studentsByPercentile = // You can extract each partition from the PCollectionList using the get method, as follows: PCollection fortiethPercentile = studentsByPercentile.get(4); -``` -```py +{{< /highlight >}} + +{{< highlight py >}} # Provide an int value with the desired number of result partitions, and a partitioning function (partition_fn in this example). # Returns a tuple of PCollection objects containing each of the resulting partitions as individual PCollection objects. students = ... 
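# Note: partition_fn receives each element together with the number of partitions
# and must return an integer in the range [0, num_partitions) that selects the
# partition for that element.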
-{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_multiple_pcollections_partition -%} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_multiple_pcollections_partition >}} # You can extract each partition from the tuple of PCollection objects as follows: -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_multiple_pcollections_partition_40th -%} -``` +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_multiple_pcollections_partition_40th >}} +{{< /highlight >}} ### 4.3. Requirements for writing user code for Beam transforms {#requirements-for-writing-user-code-for-beam-transforms} @@ -1394,7 +1410,7 @@ determined by the input data, or depend on a different branch of your pipeline. #### 4.4.1. Passing side inputs to ParDo {#side-inputs-pardo} -```java +{{< highlight java >}} // Pass side inputs to your ParDo transform by invoking .withSideInputs. // Inside your DoFn, access the side input by using the method DoFn.ProcessContext.sideInput. @@ -1423,24 +1439,23 @@ determined by the input data, or depend on a different branch of your pipeline. } }).withSideInputs(maxWordLengthCutOffView) ); -``` -```py +{{< /highlight >}} + +{{< highlight py >}} # Side inputs are available as extra arguments in the DoFn's process method or Map / FlatMap's callable. # Optional, positional, and keyword arguments are all supported. Deferred arguments are unwrapped into their # actual values. For example, using pvalue.AsIteor(pcoll) at pipeline construction time results in an iterable # of the actual elements of pcoll being passed into each process invocation. In this example, side inputs are # passed to a FlatMap transform as extra arguments and consumed by filter_using_length. words = ... -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_pardo_side_input -%} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_pardo_side_input >}} # We can also pass side inputs to a ParDo transform, which will get passed to its process method. # The first two arguments for the process method would be self and element. -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_pardo_side_input_dofn -%} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_pardo_side_input_dofn >}} ... -``` +{{< /highlight >}} #### 4.4.2. Side inputs and windowing {#side-inputs-windowing} @@ -1481,7 +1496,7 @@ together. #### 4.5.1. Tags for multiple outputs {#output-tags} -```java +{{< highlight java >}} // To emit elements to multiple output PCollections, create a TupleTag object to identify each collection // that your ParDo produces. For example, if your ParDo produces three output PCollections (the main output // and two additional outputs), you must create three TupleTags. The following example code shows how to @@ -1527,26 +1542,25 @@ together. // Specify the tags for the two additional outputs as a TupleTagList. TupleTagList.of(wordLengthsAboveCutOffTag) .and(markedWordsTag))); -``` +{{< /highlight >}} -```py +{{< highlight py >}} # To emit elements to multiple output PCollections, invoke with_outputs() on the ParDo, and specify the # expected tags for the outputs. 
with_outputs() returns a DoOutputsTuple object. Tags specified in # with_outputs are attributes on the returned DoOutputsTuple object. The tags give access to the # corresponding output PCollections. -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_pardo_with_tagged_outputs -%} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_pardo_with_tagged_outputs >}} # The result is also iterable, ordered in the same order that the tags were passed to with_outputs(), # the main tag (if specified) first. -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_pardo_with_tagged_outputs_iter -%}``` +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_pardo_with_tagged_outputs_iter >}} +{{< /highlight >}} #### 4.5.2. Emitting to multiple outputs in your DoFn {#multiple-outputs-dofn} -```java +{{< highlight java >}} // Inside your ParDo's DoFn, you can emit an element to a specific output PCollection by providing a // MultiOutputReceiver to your process method, and passing in the appropriate TupleTag to obtain an OutputReceiver. // After your ParDo, extract the resulting output PCollections from the returned PCollectionTuple. @@ -1567,47 +1581,50 @@ together. out.get(markedWordsTag).output(word); } }})); -``` +{{< /highlight >}} -```py +{{< highlight py >}} # Inside your ParDo's DoFn, you can emit an element to a specific output by wrapping the value and the output tag (str). # using the pvalue.OutputValue wrapper class. # Based on the previous example, this shows the DoFn emitting to the main output and two additional outputs. -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_pardo_emitting_values_on_tagged_outputs -%} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_pardo_emitting_values_on_tagged_outputs >}} # Producing multiple outputs is also available in Map and FlatMap. # Here is an example that uses FlatMap and shows that the tags do not need to be specified ahead of time. -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_pardo_with_undeclared_outputs -%}``` +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_pardo_with_undeclared_outputs >}} +{{< /highlight >}} #### 4.5.3. Accessing additional parameters in your DoFn {#other-dofn-parameters} -{:.language-java} +{{< paragraph class="language-java" >}} In addition to the element and the `OutputReceiver`, Beam will populate other parameters to your DoFn's `@ProcessElement` method. Any combination of these parameters can be added to your process method in any order. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} In addition to the element, Beam will populate other parameters to your DoFn's `process` method. Any combination of these parameters can be added to your process method in any order. +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} **Timestamp:** To access the timestamp of an input element, add a parameter annotated with `@Timestamp` of type `Instant`. 
For example: +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} **Timestamp:** To access the timestamp of an input element, add a keyword parameter default to `DoFn.TimestampParam`. For example: +{{< /paragraph >}} -```java +{{< highlight java >}} .of(new DoFn() { public void processElement(@Element String word, @Timestamp Instant timestamp) { }}) -``` +{{< /highlight >}} -```py +{{< highlight py >}} import apache_beam as beam class ProcessRecord(beam.DoFn): @@ -1616,29 +1633,31 @@ class ProcessRecord(beam.DoFn): # access timestamp of element. pass -``` +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} **Window:** To access the window an input element falls into, add a parameter of the type of the window used for the input `PCollection`. If the parameter is a window type (a subclass of `BoundedWindow`) that does not match the input `PCollection`, then an error will be raised. If an element falls in multiple windows (for example, this will happen when using `SlidingWindows`), then the `@ProcessElement` method will be invoked multiple time for the element, once for each window. For example, when fixed windows are being used, the window is of type `IntervalWindow`. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} **Window:** To access the window an input element falls into, add a keyword parameter default to `DoFn.WindowParam`. If an element falls in multiple windows (for example, this will happen when using `SlidingWindows`), then the -`process` method will be invoked multiple time for the element, once for each window. +`process` method will be invoked multiple time for the element, once for each window. +{{< /paragraph >}} -```java +{{< highlight java >}} .of(new DoFn() { public void processElement(@Element String word, IntervalWindow window) { }}) -``` +{{< /highlight >}} -```py +{{< highlight py >}} import apache_beam as beam class ProcessRecord(beam.DoFn): @@ -1647,26 +1666,28 @@ class ProcessRecord(beam.DoFn): # access window e.g window.end.micros pass -``` +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} **PaneInfo:** When triggers are used, Beam provides a `PaneInfo` object that contains information about the current firing. Using `PaneInfo` you can determine whether this is an early or a late firing, and how many times this window has already fired for this key. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} **PaneInfo:** When triggers are used, Beam provides a `DoFn.PaneInfoParam` object that contains information about the current firing. Using `DoFn.PaneInfoParam` you can determine whether this is an early or a late firing, and how many times this window has already fired for this key. This feature implementation in python sdk is not fully completed, see more at [BEAM-3759](https://issues.apache.org/jira/browse/BEAM-3759). 
+{{< /paragraph >}} -```java +{{< highlight java >}} .of(new DoFn() { public void processElement(@Element String word, PaneInfo paneInfo) { }}) -``` +{{< /highlight >}} -```py +{{< highlight py >}} import apache_beam as beam class ProcessRecord(beam.DoFn): @@ -1675,31 +1696,36 @@ class ProcessRecord(beam.DoFn): # access pane info e.g pane_info.is_first, pane_info.is_last, pane_info.timing pass -``` +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} **PipelineOptions:** -The `PipelineOptions` for the current pipeline can always be accessed in a process method by adding it as a parameter: -```java +The `PipelineOptions` for the current pipeline can always be accessed in a process method by adding it +as a parameter: +{{< /paragraph >}} + +{{< highlight java >}} .of(new DoFn() { public void processElement(@Element String word, PipelineOptions options) { }}) -``` +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} `@OnTimer` methods can also access many of these parameters. Timestamp, Window, key, `PipelineOptions`, `OutputReceiver`, and `MultiOutputReceiver` parameters can all be accessed in an `@OnTimer` method. In addition, an `@OnTimer` method can take a parameter of type `TimeDomain` which tells whether the timer is based on event time or processing time. Timers are explained in more detail in the -[Timely (and Stateful) Processing with Apache Beam]({{ site.baseurl }}/blog/2017/08/28/timely-processing.html) blog post. +[Timely (and Stateful) Processing with Apache Beam](/blog/2017/08/28/timely-processing.html) blog post. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} **Timer and State:** In addition to aforementioned parameters, user defined Timer and State parameters can be used in a Stateful DoFn. Timers and States are explained in more detail in the -[Timely (and Stateful) Processing with Apache Beam]({{ site.baseurl }}/blog/2017/08/28/timely-processing.html) blog post. +[Timely (and Stateful) Processing with Apache Beam](/blog/2017/08/28/timely-processing.html) blog post. +{{< /paragraph >}} -```py +{{< highlight py >}} class StatefulDoFn(beam.DoFn): """An example stateful DoFn with state and timer""" @@ -1755,7 +1781,8 @@ class StatefulDoFn(beam.DoFn): # Some business logic return True -``` +{{< /highlight >}} + ### 4.6. Composite transforms {#composite-transforms} Transforms can have a nested structure, where a complex transform performs @@ -1766,12 +1793,12 @@ transform can make your code more modular and easier to understand. The Beam SDK comes packed with many useful composite transforms. See the API reference pages for a list of transforms: - * [Pre-written Beam transforms for Java](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/index.html?org/apache/beam/sdk/transforms/package-summary.html) - * [Pre-written Beam transforms for Python](https://beam.apache.org/releases/pydoc/{{ site.release_latest }}/apache_beam.transforms.html) + * [Pre-written Beam transforms for Java](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/index.html?org/apache/beam/sdk/transforms/package-summary.html) + * [Pre-written Beam transforms for Python](https://beam.apache.org/releases/pydoc/{{< param release_latest >}}/apache_beam.transforms.html) #### 4.6.1. 
An example composite transform {#composite-transform-example} -The `CountWords` transform in the [WordCount example program]({{ site.baseurl }}/get-started/wordcount-example/) +The `CountWords` transform in the [WordCount example program](/get-started/wordcount-example/) is an example of a composite transform. `CountWords` is a `PTransform` subclass that consists of multiple nested transforms. @@ -1792,7 +1819,7 @@ Your composite transform's parameters and return value must match the initial input type and final return type for the entire transform, even if the transform's intermediate data changes type multiple times. -```java +{{< highlight java >}} public static class CountWords extends PTransform, PCollection>> { @Override @@ -1809,11 +1836,11 @@ transform's intermediate data changes type multiple times. return wordCounts; } } -``` +{{< /highlight >}} -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:pipeline_monitoring_composite -%}``` +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" pipeline_monitoring_composite >}} +{{< /highlight >}} #### 4.6.2. Creating a composite transform {#composite-transform-creation} @@ -1822,25 +1849,26 @@ class and override the `expand` method to specify the actual processing logic. You can then use this transform just as you would a built-in transform from the Beam SDK. -{:.language-java} +{{< paragraph class="language-java" >}} For the `PTransform` class type parameters, you pass the `PCollection` types that your transform takes as input, and produces as output. To take multiple `PCollection`s as input, or produce multiple `PCollection`s as output, use one of the multi-collection types for the relevant type parameter. +{{< /paragraph >}} The following code sample shows how to declare a `PTransform` that accepts a `PCollection` of `String`s for input, and outputs a `PCollection` of `Integer`s: -```java +{{< highlight java >}} static class ComputeWordLengths extends PTransform, PCollection> { ... } -``` +{{< /highlight >}} -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_composite_transform -%}``` +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_composite_transform >}} +{{< /highlight >}} Within your `PTransform` subclass, you'll need to override the `expand` method. The `expand` method is where you add the processing logic for the `PTransform`. @@ -1851,7 +1879,7 @@ value. The following code sample shows how to override `expand` for the `ComputeWordLengths` class declared in the previous example: -```java +{{< highlight java >}} static class ComputeWordLengths extends PTransform, PCollection> { @Override @@ -1860,11 +1888,11 @@ The following code sample shows how to override `expand` for the // transform logic goes here ... 
} -``` +{{< /highlight >}} -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_composite_transform -%}``` +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_composite_transform >}} +{{< /highlight >}} As long as you override the `expand` method in your `PTransform` subclass to accept the appropriate input `PCollection`(s) and return the corresponding @@ -1879,7 +1907,7 @@ transforms to be nested within the structure of your pipeline. #### 4.6.3. PTransform Style Guide {#ptransform-style-guide} -The [PTransform Style Guide]({{ site.baseurl }}/contribute/ptransform-style-guide/) +The [PTransform Style Guide](/contribute/ptransform-style-guide/) contains additional information not included here, such as style guidelines, logging and testing guidance, and language-specific considerations. The guide is a useful starting point when you want to write new composite PTransforms. @@ -1890,10 +1918,10 @@ When you create a pipeline, you often need to read data from some external source, such as a file or a database. Likewise, you may want your pipeline to output its result data to an external storage system. Beam provides read and write transforms for a [number of common data storage -types]({{ site.baseurl }}/documentation/io/built-in/). If you want your pipeline +types](/documentation/io/built-in/). If you want your pipeline to read from or write to a data storage format that isn't supported by the built-in transforms, you can [implement your own read and write -transforms]({{site.baseurl }}/documentation/io/developing-io-overview/). +transforms](/documentation/io/developing-io-overview/). ### 5.1. Reading input data {#pipeline-io-reading-data} @@ -1902,13 +1930,13 @@ representation of the data for use by your pipeline. You can use a read transform at any point while constructing your pipeline to create a new `PCollection`, though it will be most common at the start of your pipeline. -```java +{{< highlight java >}} PCollection lines = p.apply(TextIO.read().from("gs://some/inputData.txt")); -``` +{{< /highlight >}} -```py +{{< highlight py >}} lines = pipeline | beam.io.ReadFromText('gs://some/inputData.txt') -``` +{{< /highlight >}} ### 5.2. Writing output data {#pipeline-io-writing-data} @@ -1917,13 +1945,13 @@ You will most often use write transforms at the end of your pipeline to output your pipeline's final results. However, you can use a write transform to output a `PCollection`'s data at any point in your pipeline. -```java +{{< highlight java >}} output.apply(TextIO.write().to("gs://some/outputData")); -``` +{{< /highlight >}} -```py +{{< highlight py >}} output | beam.io.WriteToText('gs://some/outputData') -``` +{{< /highlight >}} ### 5.3. File-based input and output data {#file-based-data} @@ -1935,15 +1963,14 @@ filesystem-specific consistency models. 
The following TextIO example uses a glob operator (\*) to read all matching input files that have prefix "input-" and the suffix ".csv" in the given location: -```java +{{< highlight java >}} p.apply("ReadFromText", TextIO.read().from("protocol://my_bucket/path/to/input-*.csv")); -``` +{{< /highlight >}} -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_pipelineio_read -%} -``` +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_pipelineio_read >}} +{{< /highlight >}} To read data from disparate sources into a single `PCollection`, read each one independently and then use the [Flatten](#flatten) transform to create a single @@ -1960,23 +1987,23 @@ The following write transform example writes multiple output files to a location. Each file has the prefix "numbers", a numeric tag, and the suffix ".csv". -```java +{{< highlight java >}} records.apply("WriteToText", TextIO.write().to("protocol://my_bucket/path/to/numbers") .withSuffix(".csv")); -``` +{{< /highlight >}} -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py tag:model_pipelineio_write -%} -``` +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py" model_pipelineio_write >}} +{{< /highlight >}} ### 5.4. Beam-provided I/O transforms {#provided-io-transforms} -See the [Beam-provided I/O Transforms]({{site.baseurl }}/documentation/io/built-in/) +See the [Beam-provided I/O Transforms](/documentation/io/built-in/) page for a list of the currently available I/O transforms. ## 6. Schemas {#schemas} + Often, the types of the records being processed have an obvious structure. Common Beam sources produce JSON, Avro, Protocol Buffer, or database row objects; all of these types have well defined structures, structures that can often be determined by examining the type. Even within a SDK pipeline, Simple Java POJOs @@ -1985,6 +2012,7 @@ structures that can often be determined by examining the type. Even within a SDK records, we can provide much more concise APIs for data processing. ### 6.1. What is a schema {#what-is-a-schema} + Most structured records share some common characteristics: * They can be subdivided into separate named fields. Fields usually have string names, but sometimes - as in the case of indexed tuples - have numerical indices instead. @@ -1999,6 +2027,7 @@ records. For example, consider the following schema, representing actions in a fictitious e-commerce company: **Purchase** + @@ -2032,6 +2061,7 @@ For example, consider the following schema, representing actions in a fictitious
    **ShippingAddress** +
    @@ -2065,6 +2095,7 @@ For example, consider the following schema, representing actions in a fictitious
    **Transaction** +
    @@ -2102,15 +2133,17 @@ A `PCollection` with a schema does not need to have a `Coder` specified, as Beam Schema rows; Beam uses a special coder to encode schema types. ### 6.2. Schemas for programming language types {#schemas-for-pl-types} + While schemas themselves are language independent, they are designed to embed naturally into the programming languages of the Beam SDK being used. This allows Beam users to continue using native types while reaping the advantage of having Beam understand their element schemas. - {:.language-java} - In Java you could use the following set of classes to represent the purchase schema. Beam will automatically - infer the correct schema based on the members of the class. +{{< paragraph class="language-java" >}} +In Java you could use the following set of classes to represent the purchase schema. Beam will automatically +infer the correct schema based on the members of the class. +{{< /paragraph >}} -```java +{{< highlight java >}} @DefaultSchema(JavaBeanSchema.class) public class Purchase { public String getUserId(); // Returns the id of the user who made the purchase. @@ -2151,52 +2184,53 @@ public class Transaction { ... } } -``` +{{< /highlight >}} Using JavaBean classes as above is one way to map a schema to Java classes. However multiple Java classes might have the same schema, in which case the different Java types can often be used interchangeably. Beam will add implicit conversions betweens types that have matching schemas. For example, the above `Transaction` class has the same schema as the following class: -```java +{{< highlight java >}} @DefaultSchema(JavaFieldSchema.class) public class TransactionPojo { public String bank; public double purchaseAmount; } -``` +{{< /highlight >}} So if we had two `PCollection`s as follows -```java +{{< highlight java >}} PCollection transactionBeans = readTransactionsAsJavaBean(); PCollection transactionPojos = readTransactionsAsPojo(); -``` +{{< /highlight >}} Then these two `PCollection`s would have the same schema, even though their Java types would be different. This means for example the following two code snippets are valid: -```java +{{< highlight java >}} transactionBeans.apply(ParDo.of(new DoFn<...>() { @ProcessElement public void process(@Element TransactionPojo pojo) { ... } })); -``` +{{< /highlight >}} and -```java +{{< highlight java >}} transactionPojos.apply(ParDo.of(new DoFn<...>() { @ProcessElement public void process(@Element Transaction row) { } })); -``` +{{< /highlight >}} Even though the in both cases the `@Element` parameter differs from the the `PCollection`'s Java type, since the schemas are the same Beam will automatically make the conversion. The built-in `Convert` transform can also be used to translate between Java types of equivalent schemas, as detailed below. ### 6.3. Schema definition {#schema-definition} + The schema for a `PCollection` defines elements of that `PCollection` as an ordered list of named fields. Each field has a name, a type, and possibly a set of user options. The type of a field can be primitive or composite. The following are the primitive types currently supported by Beam: @@ -2271,6 +2305,7 @@ available memory, and backed by external storage (for example, this can happen w Values that contain map types cannot be used as keys in any grouping operation. ### 6.4. Logical types {#logical-types} + Users can extend the schema type system to add custom logical types that can be used as a field. 
A logical type is identified by a unique identifier and an argument. A logical type also specifies an underlying schema type to be used for storage, along with conversions to and from that type. As an example, a logical union can always be represented as @@ -2281,6 +2316,7 @@ unique identifier, so they can be interpreted by other languages as well. More e below. #### 6.4.1. Defining a logical type {#defining-a-logical-type} + To define a logical type you must specify a Schema type to be used to represent the underlying type as well as a unique identifier for that type. A logical type imposes additional semantics on top a schema type. For example, a logical type to represent nanosecond timestamps is represented as a schema containing an INT64 and an INT32 field. This schema @@ -2291,13 +2327,11 @@ Logical types are also specified by an argument, which allows creating a class o limited-precision decimal type would have an integer argument indicating how many digits of precision are represented. The argument is represented by a schema type, so can itself be a complex type. - {:.language-java} -In Java, a logical type is specified as a subclass of the `LogicalType` class. A custom Java class can be specified to -represent the logical type and conversion functions must be supplied to convert back and forth between this Java class -and the underlying Schema type representation. For example, the logical type representing nanosecond timestamp might -be implemented as follows +{{< paragraph class="language-java" >}} +In Java, a logical type is specified as a subclass of the `LogicalType` class. A custom Java class can be specified to represent the logical type and conversion functions must be supplied to convert back and forth between this Java class and the underlying Schema type representation. For example, the logical type representing nanosecond timestamp might be implemented as follows +{{< /paragraph >}} -```java +{{< highlight java >}} // A Logical type using java.time.Instant to represent the logical type. public class TimestampNanos implements LogicalType { // The underlying schema used to represent rows. @@ -2317,41 +2351,44 @@ public class TimestampNanos implements LogicalType { ... } -``` +{{< /highlight >}} #### 6.4.2. Useful logical types {#built-in-logical-types} + ##### **EnumerationType** + This logical type allows creating an enumeration type consisting of a set of named constants. -```java +{{< highlight java >}} Schema schema = Schema.builder() … .addLogicalTypeField(“color”, EnumerationType.create(“RED”, “GREEN”, “BLUE”)) .build(); -``` +{{< /highlight >}} The value of this field is stored in the row as an INT32 type, however the logical type defines a value type that lets you access the enumeration either as a string or a value. For example: -```java +{{< highlight java >}} EnumerationType.Value enumValue = enumType.valueOf(“RED”); enumValue.getValue(); // Returns 0, the integer value of the constant. enumValue.toString(); // Returns “RED”, the string value of the constant -``` +{{< /highlight >}} Given a row object with an enumeration field, you can also extract the field as the enumeration value. -```java +{{< highlight java >}} EnumerationType.Value enumValue = row.getLogicalTypeValue(“color”, EnumerationType.Value.class); -``` +{{< /highlight >}} Automatic schema inference from Java POJOs and JavaBeans automatically converts Java enums to EnumerationType logical types. ##### **OneOfType** + OneOfType allows creating a disjoint union type over a set of schema fields. 
For example: -```java +{{< highlight java >}} Schema schema = Schema.builder() … .addLogicalTypeField(“oneOfField”, @@ -2359,13 +2396,13 @@ Schema schema = Schema.builder() Field.of(“stringField”, FieldType.STRING), Field.of(“bytesField”, FieldType.BYTES))) .build(); -``` +{{< /highlight >}} The value of this field is stored in the row as another Row type, where all the fields are marked as nullable. The logical type however defines a Value object that contains an enumeration value indicating which field was set and allows getting just that field: -```java +{{< highlight java >}} // Returns an enumeration indicating all possible case values for the enum. // For the above example, this will be // EnumerationType.create(“intField”, “stringField”, “bytesField”); @@ -2383,27 +2420,23 @@ switch (oneOfValue.getCaseEnumType().toString()) { case “bytesField”: return processBytes(oneOfValue.getValue(bytes[].class)); } -``` +{{< /highlight >}} In the above example we used the field names in the switch statement for clarity, however the enum integer values could also be used. ### 6.5. Creating Schemas {#creating-schemas} -In order to take advantage of schemas, your `PCollection`s must have a schema attached to it. Often, the source -itself will attach a schema to the PCollection. For example, when using `AvroIO` to read Avro files, the source can -automatically infer a Beam schema from the Avro schema and attach that to the Beam `PCollection`. However not all sources -produce schemas. In addition, often Beam pipelines have intermediate stages and types, and those also can benefit from -the expressiveness of schemas. +In order to take advantage of schemas, your `PCollection`s must have a schema attached to it. Often, the source itself will attach a schema to the PCollection. For example, when using `AvroIO` to read Avro files, the source can automatically infer a Beam schema from the Avro schema and attach that to the Beam `PCollection`. However not all sources produce schemas. In addition, often Beam pipelines have intermediate stages and types, and those also can benefit from the expressiveness of schemas. #### 6.5.1. Inferring schemas {#inferring-schemas} -{:.language-java} -Beam is able to infer schemas from a variety of common Java types. The `@DefaultSchema` annotation can be used to tell -Beam to infer schemas from a specific type. The annotation takes a `SchemaProvider` as an argument, and `SchemaProvider` -classes are already built in for common Java types. The `SchemaRegistry` can also be invoked programmatically for cases -where it is not practical to annotate the Java type itself. + +{{< paragraph class="language-java" >}} +Beam is able to infer schemas from a variety of common Java types. The `@DefaultSchema` annotation can be used to tell Beam to infer schemas from a specific type. The annotation takes a `SchemaProvider` as an argument, and `SchemaProvider` classes are already built in for common Java types. The `SchemaRegistry` can also be invoked programmatically for cases where it is not practical to annotate the Java type itself. +{{< /paragraph >}} ##### **Java POJOs** + A POJO (Plain Old Java Object) is a Java object that is not bound by any restriction other than the Java Language Specification. A POJO can contain member variables that are primitives, that are other POJOs, or are collections maps or arrays thereof. POJOs do not have to extend prespecified classes or extend any specific interfaces. @@ -2414,7 +2447,7 @@ this class. 
Nested classes are supported as are classes with `List`, array, and For example, annotating the following class tells Beam to infer a schema from this POJO class and apply it to any `PCollection`. -```java +{{< highlight java >}} @DefaultSchema(JavaFieldSchema.class) public class TransactionPojo { public final String bank; @@ -2427,7 +2460,7 @@ public class TransactionPojo { } // Beam will automatically infer the correct schema for this PCollection. No coder is needed as a result. PCollection pojos = readPojos(); -```` +{{< /highlight >}} The `@SchemaCreate` annotation tells Beam that this constructor can be used to create instances of TransactionPojo, assuming that constructor parameters have the same names as the field names. `@SchemaCreate` can also be used to annotate @@ -2445,17 +2478,18 @@ In some cases it is not convenient to annotate the POJO class, for example if th not owned by the Beam pipeline author. In these cases the schema inference can be triggered programmatically in pipeline’s main function as follows: -```java +{{< highlight java >}} pipeline.getSchemaRegistry().registerPOJO(TransactionPOJO.class); -``` +{{< /highlight >}} ##### **Java Beans** + Java Beans are a de-facto standard for creating reusable property classes in Java. While the full standard has many characteristics, the key ones are that all properties are accessed via getter and setter classes, and the name format for these getters and setters is standardized. A Java Bean class can be annotated with `@DefaultSchema(JavaBeanSchema.class)` and Beam will automatically infer a schema for this class. For example: -```java +{{< highlight java >}} @DefaultSchema(JavaBeanSchema.class) public class TransactionBean { public TransactionBean() { … } @@ -2466,12 +2500,12 @@ public class TransactionBean { } // Beam will automatically infer the correct schema for this PCollection. No coder is needed as a result. PCollection beans = readBeans(); -``` +{{< /highlight >}} The `@SchemaCreate` annotation can be used to specify a constructor or a static factory method, in which case the setters and zero-argument constructor can be omitted. -```java +{{< highlight java >}} @DefaultSchema(JavaBeanSchema.class) public class TransactionBean { @SchemaCreate @@ -2479,25 +2513,26 @@ public class TransactionBean { public String getBank() { … } public double getPurchaseAmount() { … } } -``` +{{< /highlight >}} `@SchemaFieldName` and `@SchemaIgnore` can be used to alter the schema inferred, just like with POJO classes. ##### **AutoValue** + Java value classes are notoriously difficult to generate correctly. There is a lot of boilerplate you must create in order to properly implement a value class. AutoValue is a popular library for easily generating such classes by i mplementing a simple abstract base class. Beam can infer a schema from an AutoValue class. For example: -```java +{{< highlight java >}} @DefaultSchema(AutoValueSchema.class) @AutoValue public abstract class TransactionValue { public abstract String getBank(); public abstract double getPurchaseAmount(); } -``` +{{< /highlight >}} This is all that’s needed to generate a simple AutoValue class, and the above `@DefaultSchema` annotation tells Beam to infer a schema from it. This also allows AutoValue elements to be used inside of `PCollection`s. @@ -2505,11 +2540,13 @@ infer a schema from it. This also allows AutoValue elements to be used inside of `@SchemaFieldName` and `@SchemaIgnore` can be used to alter the schema inferred. ### 6.6. 
Using Schema Transforms {#using-schemas} + A schema on a `PCollection` enables a rich variety of relational transforms. The fact that each record is composed of named fields allows for simple and readable aggregations that reference fields by name, similar to the aggregations in a SQL expression. #### 6.6.1. Field selection syntax + The advantage of schemas is that they allow referencing of element fields by name. Beam provides a selection syntax for referencing fields, including nested and repeated fields. This syntax is used by all of the schema transforms when referencing the fields they operate on. The syntax can also be used inside of a DoFn to specify which schema fields to @@ -2523,36 +2560,40 @@ launch. The following characters are not allowed in field names: . * [ ] { } ##### **Top-level fields** + In order to select a field at the top level of a schema, the name of the field is specified. For example, to select just the user ids from a `PCollection` of purchases one would write (using the `Select` transform) -```java +{{< highlight java >}} purchases.apply(Select.fieldNames(“userId”)); -``` +{{< /highlight >}} ##### **Nested fields** + Individual nested fields can be specified using the dot operator. For example, to select just the postal code from the shipping address one would write -```java +{{< highlight java >}} purchases.apply(Select.fieldNames(“shippingAddress.postCode”)); -``` +{{< /highlight >}} ##### **Wildcards** + The * operator can be specified at any nesting level to represent all fields at that level. For example, to select all shipping-address fields one would write -```java +{{< highlight java >}} purchases.apply(Select.fieldNames(“shippingAddress.*”)); -``` +{{< /highlight >}} ##### **Arrays** + An array field, where the array element type is a row, can also have subfields of the element type addressed. When selected, the result is an array of the selected subfield type. For example -```java +{{< highlight java >}} purchases.apply(Select.fieldNames(“transactions[].bank”)); -``` +{{< /highlight >}} Will result in a row containing an array field with element-type string, containing the list of banks for each transaction. @@ -2562,6 +2603,7 @@ they can be omitted for brevity. In the future, array slicing will be supported, array. ##### **Maps** + A map field, where the value type is a row, can also have subfields of the value type addressed. When selected, the result is a map where the keys are the same as in the original map but the value is the specified type. Similar to arrays, the use of {} curly brackets in the selector is recommended, to make it clear that map value elements are being @@ -2569,6 +2611,7 @@ selected, they can be omitted for brevity. In the future, map key selectors will specific keys from the map. For example, given the following schema: **PurchasesByType** +
@@ -2587,9 +2630,9 @@ specific keys from the map. For example, given the following schema:

The following

-```java
+{{< highlight java >}}
purchasesByType.apply(Select.fieldNames(“purchases{}.userId”));
-```
+{{< /highlight >}}

Will result in a row containing a map field with key-type string and value-type string. The selected map will contain all of the keys from the original map, and the values will be the userId contained in the purchase record.

@@ -2599,19 +2642,21 @@ they can be omitted for brevity. In the future, map slicing will be supported, a the map.

#### 6.6.2. Schema transforms
+
Beam provides a collection of transforms that operate natively on schemas. These transforms are very expressive, allowing selections and aggregations in terms of named schema fields. Following are some examples of useful schema transforms.

##### **Selecting input**
+
Often a computation is only interested in a subset of the fields in an input `PCollection`. The `Select` transform allows one to easily project out only the fields of interest. The resulting `PCollection` has a schema containing each selected field as a top-level field. Both top-level and nested fields can be selected. For example, in the Purchase schema, one could select only the userId and streetAddress fields as follows

-```java
+{{< highlight java >}}
purchases.apply(Select.fieldNames(“userId”, “shippingAddress.streetAddress”));
-```
+{{< /highlight >}}

The resulting `PCollection` will have the following schema

@@ -2637,9 +2682,9 @@ The resulting `PCollection` will have the following schema

The same is true for wildcard selections. The following

-```java
+{{< highlight java >}}
purchases.apply(Select.fieldNames(“userId”, “shippingAddress.*”));
-```
+{{< /highlight >}}

Will result in the following schema

@@ -2683,9 +2728,9 @@ Will result in the following schema

When selecting fields nested inside of an array, the same rule applies that each selected field appears separately as a top-level field in the resulting row. This means that if multiple fields are selected from the same nested row, each selected field will appear as its own array field. For example

-```java
+{{< highlight java >}}
purchases.apply(Select.fieldNames(“transactions.bank”, “transactions.purchaseAmount”));
-```
+{{< /highlight >}}

Will result in the following schema

@@ -2720,9 +2765,9 @@ a name conflict, as all selected fields are put in the same row schema. When thi

Another use of the Select transform is to flatten a nested schema into a single flat schema. For example

-```java
+{{< highlight java >}}
purchases.apply(Select.flattenedSchema());
-```
+{{< /highlight >}}

Will result in the following schema

@@ -2778,6 +2823,7 @@ Will result in the following schema

##### **Grouping aggregations**
+
The `Group` transform allows simply grouping data by any number of fields in the input schema, applying aggregations to those groupings, and storing the result of those aggregations in a new schema field. The output of the `Group` transform has a schema with one field corresponding to each aggregation performed.

@@ -2785,9 +2831,9 @@ has a schema with one field corresponding to each aggregation performed.

The simplest usage of `Group` specifies no aggregations, in which case all inputs matching the provided set of fields are grouped together into an `ITERABLE` field. For example

-```java
+{{< highlight java >}}
purchases.apply(Group.byFieldNames(“userId”, “shippingAddress.streetAddress”));
-```
+{{< /highlight >}}

The output schema of this is:

@@ -2816,23 +2862,23 @@ The key field contains the grouping key and the values field contains a list of

The names of the key and values fields in the output schema can be controlled using the withKeyField and withValueField builders, as follows:

-```java
+{{< highlight java >}}
purchases.apply(Group.byFieldNames(“userId”, “shippingAddress.streetAddress”)
    .withKeyField(“userAndStreet”)
    .withValueField(“matchingPurchases”));
-```
+{{< /highlight >}}

It is quite common to apply one or more aggregations to the grouped result. Each aggregation can specify one or more fields to aggregate, an aggregation function, and the name of the resulting field in the output schema. For example, the following application computes three aggregations grouped by userId, with all aggregations represented in a single output schema:

-```java
+{{< highlight java >}}
purchases.apply(Group.byFieldNames(“userId”)
    .aggregateField(“itemId”, Count.combineFn(), “numPurchases”)
    .aggregateField(“costCents”, Sum.ofLongs(), “totalSpendCents”)
    .aggregateField(“costCents”, Top.largestLongsFn(10), “topPurchases”));
-```
+{{< /highlight >}}

The result of this aggregation will have the following schema:

@@ -2858,18 +2904,19 @@ The result of this aggregation will have the following schema:

Often `Select.flattenedSchema` will be used to flatten the result into a non-nested, flat schema.

##### **Joins**
+
Beam supports equijoins on schema `PCollections` - namely joins where the join condition depends on the equality of a subset of fields. For example, the following example uses the Purchases schema to join transactions with the reviews that are likely associated with that transaction (both the user and product match that in the transaction). This is a "natural join" - one in which the same field names are used on both the left-hand and right-hand sides of the join - and is specified with the `using` keyword:

-```java
+{{< highlight java >}}
PCollection transactions = readTransactions();
PCollection reviews = readReviews();
PCollection joined = transactions.apply(
    Join.innerJoin(reviews).using(“userId”, “productId”));
-```
+{{< /highlight >}}

The resulting schema is the following:

@@ -2897,17 +2944,18 @@ Each resulting row contains one Transaction and one Review that matched the join cond

If the fields to match in the two schemas have different names, then the on function can be used. For example, if the Review schema named those fields differently than the Transaction schema, then we could write the following:

-```java
+{{< highlight java >}}
PCollection joined = transactions.apply(
    Join.innerJoin(reviews).on(
        FieldsEqual
            .left(“userId”, “productId”)
            .right(“reviewUserId”, “reviewProductId”)));
-```
+{{< /highlight >}}

In addition to inner joins, the Join transform supports full outer joins, left outer joins, and right outer joins.

##### **Complex joins**
+
While most joins tend to be binary joins - joining two inputs together - sometimes you have more than two input streams that all need to be joined on a common key. The `CoGroup` transform allows joining multiple `PCollections` together based on equality of schema fields. Each `PCollection` can be marked as required or optional in the final

@@ -2917,19 +2965,21 @@ processed in unexpanded format - providing the join key along with Iterables of
that key.

##### **Filtering events**
+
The `Filter` transform can be configured with a set of predicates, each one based on specified fields. Only records for which all predicates return true will pass the filter. For example, the following

-```java
+{{< highlight java >}}
purchases.apply(Filter
    .whereFieldName(“costCents”, c -> c > 100 * 20)
    .whereFieldName(“shippingAddress.country”, c -> c.equals(“de”)));
-```
+{{< /highlight >}}

Will produce all purchases made from Germany with a purchase price greater than twenty dollars.

##### **Adding fields to a schema**
+
The AddFields transform can be used to extend a schema with new fields. Input rows will be extended to the new schema by inserting null values for the new fields, though alternate default values can be specified; if the default null value is used then the new field type will be marked as nullable. Nested subfields can be added using the field selection

@@ -2937,31 +2987,33 @@ syntax, including nested fields inside arrays or map values.

For example, the following application

-```java
+{{< highlight java >}}
purchases.apply(AddFields.create()
    .field(“timeOfDaySeconds”, FieldType.INT32)
    .field(“shippingAddress.deliveryNotes”, FieldType.STRING)
    .field(“transactions.isFlagged”, FieldType.BOOLEAN, false));
-```
+{{< /highlight >}}

Results in a `PCollection` with an expanded schema, containing all of the rows and fields of the input as well as the specified fields added to the schema. All resulting rows will have null values filled in for the **timeOfDaySeconds** and the **shippingAddress.deliveryNotes** fields, and a false value filled in for the **transactions.isFlagged** field.

##### **Removing fields from a schema**
+
`DropFields` allows specific fields to be dropped from a schema. Input rows will have their schemas truncated, and any values for dropped fields will be removed from the output. Nested fields can also be dropped using the field selection syntax.

For example, the following snippet

-```java
+{{< highlight java >}}
purchases.apply(DropFields.fields(“userId”, “shippingAddress.streetAddress”));
-```
+{{< /highlight >}}

Results in a copy of the input with those two fields and their corresponding values removed.

##### **Renaming schema fields**
+
`RenameFields` allows specific fields in a schema to be renamed. The field values in input rows are left unchanged, only the schema is modified.
This transform is often used to prepare records for output to a schema-aware sink, such as an RDBMS, to make sure that the `PCollection` schema field names match that of the output. It can also be used to rename @@ -2970,24 +3022,25 @@ renamed using the field-selection syntax. For example, the following snippet -```java +{{< highlight java >}} purchases.apply(RenameFields.create() .rename(“userId”, “userIdentifier”) .rename(“shippingAddress.streetAddress”, “shippingAddress.street”)); -``` +{{< /highlight >}} Results in the same set of unmodified input elements, however the schema on the PCollection has been changed to rename **userId** to **userIdentifier** and **shippingAddress.streetAddress** to **shippingAddress.street**. ##### **Converting between types** + As mentioned, Beam can automatically convert between different Java types, as long as those types have equivalent schemas. One way to do this is by using the `Convert` transform, as follows. -```java +{{< highlight java >}} PCollection purchaseBeans = readPurchasesAsBeans(); PCollection pojoPurchases = purchaseBeans.apply(Convert.to(PurchasePojo.class)); -``` +{{< /highlight >}} Beam will validate that the inferred schema for `PurchasePojo` matches that of the input `PCollection`, and will then cast to a `PCollection`. @@ -2995,44 +3048,46 @@ then cast to a `PCollection`. Since the `Row` class can support any schema, any `PCollection` with schema can be cast to a `PCollection` of rows, as follows. -```java +{{< highlight java >}} PCollection purchaseRows = purchaseBeans.apply(Convert.toRows()); -``` +{{< /highlight >}} If the source type is a single-field schema, Convert will also convert to the type of the field if asked, effectively unboxing the row. For example, give a schema with a single INT64 field, the following will convert it to a `PCollection` -```java +{{< highlight java >}} PCollection longs = rows.apply(Convert.to(TypeDescriptors.longs())); -``` +{{< /highlight >}} In all cases, type checking is done at pipeline graph construction, and if the types do not match the schema then the pipeline will fail to launch. #### 6.6.3. Schemas in ParDo + A `PCollection` with a schema can apply a `ParDo`, just like any other `PCollection`. However the Beam runner is aware of schemas when applying a `ParDo`, which enables additional functionality. ##### **Input conversion** + Since Beam knows the schema of the source `PCollection`, it can automatically convert the elements to any Java type for which a matching schema is known. For example, using the above-mentioned Transaction schema, say we have the following `PCollection`: -```java +{{< highlight java >}} PCollection purchases = readPurchases(); -``` +{{< /highlight >}} If there were no schema, then the applied `DoFn` would have to accept an element of type `TransactionPojo`. However since there is a schema, you could apply the following DoFn: -```java +{{< highlight java >}} purchases.appy(ParDo.of(new DoFn() { @ProcessElement public void process(@Element PurchaseBean purchase) { ... } })); -``` +{{< /highlight >}} Even though the `@Element` parameter does not match the Java type of the `PCollection`, since it has a matching schema Beam will automatically convert elements. If the schema does not match, Beam will detect this at graph-construction time @@ -3040,39 +3095,40 @@ and will fail the job with a type error. 
Since every schema can be represented by a Row type, Row can also be used here: -```java +{{< highlight java >}} purchases.appy(ParDo.of(new DoFn() { @ProcessElement public void process(@Element Row purchase) { ... } })); -``` +{{< /highlight >}} ##### **Input selection** + Since the input has a schema, you can also automatically select specific fields to process in the DoFn. Given the above purchases `PCollection`, say you want to process just the userId and the itemId fields. You can do these using the above-described selection expressions, as follows: -```java +{{< highlight java >}} purchases.appy(ParDo.of(new DoFn() { @ProcessElement public void process( @FieldAccess(“userId”) String userId, @FieldAccess(“itemId”) long itemId) { ... } })); -``` +{{< /highlight >}} You can also select nested fields, as follows. -```java +{{< highlight java >}} purchases.appy(ParDo.of(new DoFn() { @ProcessElement public void process( @FieldAccess(“shippingAddress.street”) String street) { ... } })); -``` +{{< /highlight >}} For more information, see the section on field-selection expressions. When selecting subschemas, Beam will automatically convert to any matching schema type, just like when reading the entire row. @@ -3090,14 +3146,15 @@ the elements of a given `PCollection` may be encoded and decoded. > typically be done explicitly, using transforms such as `ParDo` or > `MapElements`. -{:.language-java} +{{< paragraph class="language-java" >}} In the Beam SDK for Java, the type `Coder` provides the methods required for encoding and decoding data. The SDK for Java provides a number of Coder subclasses that work with a variety of standard Java types, such as Integer, Long, Double, StringUtf8 and more. You can find all of the available Coder subclasses in the [Coder package](https://github.com/apache/beam/tree/master/sdks/java/core/src/main/java/org/apache/beam/sdk/coders). +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} In the Beam SDK for Python, the type `Coder` provides the methods required for encoding and decoding data. The SDK for Python provides a number of Coder subclasses that work with a variety of standard Python types, such as primitive @@ -3105,6 +3162,7 @@ types, Tuple, Iterable, StringUtf8 and more. You can find all of the available Coder subclasses in the [apache_beam.coders](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/coders) package. +{{< /paragraph >}} > Note that coders do not necessarily have a 1:1 relationship with types. For > example, the Integer type can have multiple valid coders, and input and output @@ -3120,30 +3178,34 @@ based on its element type or the transform that produces it, however, in some cases the pipeline author will need to specify a `Coder` explicitly, or develop a `Coder` for their custom type. -{:.language-java} +{{< paragraph class="language-java" >}} You can explicitly set the coder for an existing `PCollection` by using the method `PCollection.setCoder`. Note that you cannot call `setCoder` on a `PCollection` that has been finalized (e.g. by calling `.apply` on it). +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} You can get the coder for an existing `PCollection` by using the method `getCoder`. This method will fail with an `IllegalStateException` if a coder has not been set and cannot be inferred for the given `PCollection`. +{{< /paragraph >}} Beam SDKs use a variety of mechanisms when attempting to automatically infer the `Coder` for a `PCollection`. 
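{{< paragraph class="language-java" >}}
Before turning to those inference mechanisms, here is a minimal sketch of the `setCoder` and `getCoder` calls described above. The `Create`-based input and the choice of `StringUtf8Coder` are illustrative assumptions, not examples taken from the guide:
{{< /paragraph >}}

{{< highlight java >}}
// Explicitly set the coder on a PCollection before it is finalized by a later .apply().
PCollection<String> lines = pipeline.apply(Create.of("hello", "world"));
lines.setCoder(StringUtf8Coder.of());

// getCoder() returns the coder in use; it throws an IllegalStateException if no coder
// has been set and none can be inferred for this PCollection.
Coder<String> coder = lines.getCoder();
{{< /highlight >}}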
-{:.language-java} +{{< paragraph class="language-java" >}} Each pipeline object has a `CoderRegistry`. The `CoderRegistry` represents a mapping of Java types to the default coders that the pipeline should use for `PCollection`s of each type. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} The Beam SDK for Python has a `CoderRegistry` that represents a mapping of Python types to the default coder that should be used for `PCollection`s of each type. +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} By default, the Beam SDK for Java automatically infers the `Coder` for the elements of a `PCollection` produced by a `PTransform` using the type parameter from the transform's function object, such as `DoFn`. In the case of `ParDo`, @@ -3152,8 +3214,9 @@ of type `Integer` and produces an output element of type `String`. In such a case, the SDK for Java will automatically infer the default `Coder` for the output `PCollection` (in the default pipeline `CoderRegistry`, this is `StringUtf8Coder`). +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} By default, the Beam SDK for Python automatically infers the `Coder` for the elements of an output `PCollection` using the typehints from the transform's function object, such as `DoFn`. In the case of `ParDo`, for example a `DoFn` @@ -3162,6 +3225,7 @@ with the typehints `@beam.typehints.with_input_types(int)` and and produces an output element of type str. In such a case, the Beam SDK for Python will automatically infer the default `Coder` for the output `PCollection` (in the default pipeline `CoderRegistry`, this is `BytesCoder`). +{{< /paragraph >}} > NOTE: If you create your `PCollection` from in-memory data by using the > `Create` transform, you cannot rely on coder inference and default coders. @@ -3169,9 +3233,10 @@ Python will automatically infer the default `Coder` for the output `PCollection` > may not be able to infer a coder if the argument list contains a value whose > exact run-time class doesn't have a default coder registered. -{:.language-java} +{{< paragraph class="language-java" >}} When using `Create`, the simplest way to ensure that you have the correct coder is by invoking `withCoder` when you apply the `Create` transform. +{{< /paragraph >}} ### 7.2. Default coders and the CoderRegistry {#default-coders-and-the-coderregistry} @@ -3186,7 +3251,7 @@ types for any pipeline you create using the Beam SDK for JavaPython. The following table shows the standard mapping: -{:.language-java} +{{< paragraph class="language-java" >}}
    @@ -3249,8 +3314,9 @@ The following table shows the standard mapping:
    +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} @@ -3281,21 +3347,24 @@ The following table shows the standard mapping:
    +{{< /paragraph >}} #### 7.2.1. Looking up a default coder {#default-coder-lookup} -{:.language-java} +{{< paragraph class="language-java" >}} You can use the method `CoderRegistry.getCoder` to determine the default Coder for a Java type. You can access the `CoderRegistry` for a given pipeline by using the method `Pipeline.getCoderRegistry`. This allows you to determine (or set) the default Coder for a Java type on a per-pipeline basis: i.e. "for this pipeline, verify that Integer values are encoded using `BigEndianIntegerCoder`." +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} You can use the method `CoderRegistry.get_coder` to determine the default Coder for a Python type. You can use `coders.registry` to access the `CoderRegistry`. This allows you to determine (or set) the default Coder for a Python type. +{{< /paragraph >}} #### 7.2.2. Setting the default coder for a type {#setting-default-coder} @@ -3315,39 +3384,41 @@ The following example code demonstrates how to set a default Coder, in this case Integerint values for a pipeline. -```java +{{< highlight java >}} PipelineOptions options = PipelineOptionsFactory.create(); Pipeline p = Pipeline.create(options); CoderRegistry cr = p.getCoderRegistry(); cr.registerCoder(Integer.class, BigEndianIntegerCoder.class); -``` +{{< /highlight >}} -```py +{{< highlight py >}} apache_beam.coders.registry.register_coder(int, BigEndianIntegerCoder) -``` +{{< /highlight >}} #### 7.2.3. Annotating a custom data type with a default coder {#annotating-custom-type-default-coder} -{:.language-java} +{{< paragraph class="language-java" >}} If your pipeline program defines a custom data type, you can use the `@DefaultCoder` annotation to specify the coder to use with that type. For example, let's say you have a custom data type for which you want to use `SerializableCoder`. You can use the `@DefaultCoder` annotation as follows: +{{< /paragraph >}} -```java +{{< highlight java >}} @DefaultCoder(AvroCoder.class) public class MyCustomDataType { ... } -``` +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} If you've created a custom coder to match your data type, and you want to use the `@DefaultCoder` annotation, your coder class must implement a static `Coder.of(Class)` factory method. +{{< /paragraph >}} -```java +{{< highlight java >}} public class MyCustomCoder implements Coder { public static Coder of(Class clazz) {...} ... @@ -3357,12 +3428,13 @@ public class MyCustomCoder implements Coder { public class MyCustomDataType { ... } -``` +{{< /highlight >}} -{:.language-py} +{{< paragraph class="language-py" >}} The Beam SDK for Python does not support annotating data types with a default coder. If you would like to set a default coder, use the method described in the previous section, *Setting the default coder for a type*. +{{< /paragraph >}} ## 8. Windowing {#windowing} @@ -3421,7 +3493,7 @@ windows are not considered until `GroupByKey` or `Combine` aggregates across a window and key. This can have different effects on your pipeline. Consider the example pipeline in the figure below: -![Diagram of pipeline applying windowing]({{ "/images/windowing-pipeline-unbounded.svg" | prepend: site.baseurl }} "Pipeline applying windowing") +![Diagram of pipeline applying windowing](/images/windowing-pipeline-unbounded.svg) **Figure 3:** Pipeline applying windowing @@ -3445,13 +3517,13 @@ all the elements are by default part of a single, global window. 
To use windowing with fixed data sets, you can assign your own timestamps to each element. To assign timestamps to elements, use a `ParDo` transform with a `DoFn` that outputs each element with a new timestamp (for example, the -[WithTimestamps](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/index.html?org/apache/beam/sdk/transforms/WithTimestamps.html) +[WithTimestamps](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/index.html?org/apache/beam/sdk/transforms/WithTimestamps.html) transform in the Beam SDK for Java). To illustrate how windowing with a bounded `PCollection` can affect how your pipeline processes data, consider the following pipeline: -![Diagram of GroupByKey and ParDo without windowing, on a bounded collection]({{ "/images/unwindowed-pipeline-bounded.svg" | prepend: site.baseurl }} "GroupByKey and ParDo without windowing, on a bounded collection") +![Diagram of GroupByKey and ParDo without windowing, on a bounded collection](/images/unwindowed-pipeline-bounded.svg) **Figure 4:** `GroupByKey` and `ParDo` without windowing, on a bounded collection. @@ -3466,7 +3538,7 @@ all elements in your `PCollection` are assigned to a single global window. Now, consider the same pipeline, but using a windowing function: -![Diagram of GroupByKey and ParDo with windowing, on a bounded collection]({{ "/images/windowing-pipeline-bounded.svg" | prepend: site.baseurl }} "GroupByKey and ParDo with windowing, on a bounded collection") +![Diagram of GroupByKey and ParDo with windowing, on a bounded collection](/images/windowing-pipeline-bounded.svg) **Figure 5:** `GroupByKey` and `ParDo` with windowing, on a bounded collection. @@ -3510,7 +3582,7 @@ of the elements in your unbounded `PCollection` with timestamp values from with timestamp values from 0:00:30 up to (but not including) 0:01:00 belong to the second window, and so on. -![Diagram of fixed time windows, 30s in duration]({{ "/images/fixed-time-windows.png" | prepend: site.baseurl }} "Fixed time windows, 30s in duration") +![Diagram of fixed time windows, 30s in duration](/images/fixed-time-windows.png) **Figure 6:** Fixed time windows, 30s in duration. @@ -3529,7 +3601,7 @@ averages of data; using sliding time windows, you can compute a running average of the past 60 seconds' worth of data, updated every 30 seconds, in our example. -![Diagram of sliding time windows, with 1 minute window duration and 30s window period]({{ "/images/sliding-time-windows.png" | prepend: site.baseurl }} "Sliding time windows, with 1 minute window duration and 30s window period") +![Diagram of sliding time windows, with 1 minute window duration and 30s window period](/images/sliding-time-windows.png) **Figure 7:** Sliding time windows, with 1 minute window duration and 30s window period. @@ -3544,7 +3616,7 @@ have long periods of idle time interspersed with high concentrations of clicks. If data arrives after the minimum specified gap duration time, this initiates the start of a new window. -![Diagram of session windows with a minimum gap duration]({{ "/images/session-windows.png" | prepend: site.baseurl }} "Session windows, with a minimum gap duration") +![Diagram of session windows with a minimum gap duration](/images/session-windows.png) **Figure 8:** Session windows, with a minimum gap duration. Note how each data key has different windows, according to its data distribution. @@ -3581,15 +3653,15 @@ for more information. 
The following example code shows how to apply `Window` to divide a `PCollection` into fixed windows, each 60 seconds in length: -```java +{{< highlight java >}} PCollection items = ...; PCollection fixedWindowedItems = items.apply( Window.into(FixedWindows.of(Duration.standardSeconds(60)))); -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:setting_fixed_windows -%} -``` +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" setting_fixed_windows >}} +{{< /highlight >}} #### 8.3.2. Sliding time windows {#using-sliding-time-windows} @@ -3597,15 +3669,15 @@ The following example code shows how to apply `Window` to divide a `PCollection` into sliding time windows. Each window is 30 seconds in length, and a new window begins every five seconds: -```java +{{< highlight java >}} PCollection items = ...; PCollection slidingWindowedItems = items.apply( Window.into(SlidingWindows.of(Duration.standardSeconds(30)).every(Duration.standardSeconds(5)))); -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:setting_sliding_windows -%} -``` +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" setting_sliding_windows >}} +{{< /highlight >}} #### 8.3.3. Session windows {#using-session-windows} @@ -3613,15 +3685,15 @@ The following example code shows how to apply `Window` to divide a `PCollection` into session windows, where each session must be separated by a time gap of at least 10 minutes (600 seconds): -```java +{{< highlight java >}} PCollection items = ...; PCollection sessionWindowedItems = items.apply( Window.into(Sessions.withGapDuration(Duration.standardSeconds(600)))); -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:setting_session_windows -%} -``` +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" setting_session_windows >}} +{{< /highlight >}} Note that the sessions are per-key — each key in the collection will have its own session groupings depending on the data distribution. @@ -3632,15 +3704,15 @@ If your `PCollection` is bounded (the size is fixed), you can assign all the elements to a single global window. The following example code shows how to set a single global window for a `PCollection`: -```java +{{< highlight java >}} PCollection items = ...; PCollection batchItems = items.apply( Window.into(new GlobalWindows())); -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:setting_global_window -%} -``` +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" setting_global_window >}} +{{< /highlight >}} ### 8.4. Watermarks and late data {#watermarks-and-late-data} @@ -3691,14 +3763,15 @@ you set your `PCollection`'s windowing strategy. The following code example demonstrates a windowing strategy that will allow late data up to two days after the end of a window. 
-```java +{{< highlight java >}} PCollection items = ...; PCollection fixedWindowedItems = items.apply( Window.into(FixedWindows.of(Duration.standardMinutes(1))) .withAllowedLateness(Duration.standardDays(2))); -``` +{{< /highlight >}} + -```py +{{< highlight py >}} pc = [Initial PCollection] pc | beam.WindowInto( FixedWindows(60), @@ -3706,7 +3779,8 @@ the end of a window. accumulation_mode=accumulation_mode, timestamp_combiner=timestamp_combiner, allowed_lateness=Duration(seconds=2*24*60*60)) # 2 days -``` +{{< /highlight >}} + When you set `.withAllowedLateness` on a `PCollection`, that allowed lateness propagates forward to any subsequent `PCollection` derived from the first `PCollection` you applied allowed lateness to. If you want to change the allowed @@ -3733,7 +3807,7 @@ records in from a file, the file source doesn't assign timestamps automatically. You can parse the timestamp field from each record and use a `ParDo` transform with a `DoFn` to attach the timestamps to each element in your `PCollection`. -```java +{{< highlight java >}} PCollection unstampedLogs = ...; PCollection stampedLogs = unstampedLogs.apply(ParDo.of(new DoFn() { @@ -3745,11 +3819,11 @@ with a `DoFn` to attach the timestamps to each element in your `PCollection`. out.outputWithTimestamp(element, logTimeStamp); } })); -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:setting_timestamp -%} -``` +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" setting_timestamp >}} +{{< /highlight >}} ## 9. Triggers {#triggers} @@ -3824,7 +3898,7 @@ before or after the end of the window. The following example shows a billing scenario, and uses both early and late firings: -```java +{{< highlight java >}} // Create a bill at the end of the month. AfterWatermark.pastEndOfWindow() // During the month, get near real-time estimates. @@ -3834,10 +3908,11 @@ firings: .plusDuration(Duration.standardMinutes(1)) // Fire on any late data so the bill can be corrected. .withLateFirings(AfterPane.elementCountAtLeast(1)) -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_early_late_triggers -%}``` +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_early_late_triggers >}} +{{< /highlight >}} #### 9.1.1. Default trigger {#default-trigger} @@ -3887,31 +3962,34 @@ When you set a windowing function for a `PCollection` by using the `Window``WindowInto` transform, you can also specify a trigger. -{:.language-java} +{{< paragraph class="language-java" >}} You set the trigger(s) for a `PCollection` by invoking the method `.triggering()` on the result of your `Window.into()` transform. This code sample sets a time-based trigger for a `PCollection`, which emits results one minute after the first element in that window has been processed. The last line in the code sample, `.discardingFiredPanes()`, sets the window's **accumulation mode**. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} You set the trigger(s) for a `PCollection` by setting the `trigger` parameter when you use the `WindowInto` transform. This code sample sets a time-based trigger for a `PCollection`, which emits results one minute after the first element in that window has been processed. 
The `accumulation_mode` parameter sets the window's **accumulation mode**. +{{< /paragraph >}} -```java +{{< highlight java >}} PCollection pc = ...; pc.apply(Window.into(FixedWindows.of(1, TimeUnit.MINUTES)) .triggering(AfterProcessingTime.pastFirstElementInPane() .plusDelayOf(Duration.standardMinutes(1))) .discardingFiredPanes()); -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_setting_trigger -%}``` +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_setting_trigger >}} +{{< /highlight >}} #### 9.4.1. Window accumulation modes {#window-accumulation-modes} @@ -3921,16 +3999,18 @@ pane. Since a trigger can fire multiple times, the accumulation mode determines whether the system *accumulates* the window panes as the trigger fires, or *discards* them. -{:.language-java} +{{< paragraph class="language-java" >}} To set a window to accumulate the panes that are produced when the trigger fires, invoke`.accumulatingFiredPanes()` when you set the trigger. To set a window to discard fired panes, invoke `.discardingFiredPanes()`. +{{< /paragraph >}} -{:.language-py} +{{< paragraph class="language-py" >}} To set a window to accumulate the panes that are produced when the trigger fires, set the `accumulation_mode` parameter to `ACCUMULATING` when you set the trigger. To set a window to discard fired panes, set `accumulation_mode` to `DISCARDING`. +{{< /paragraph >}} Let's look an example that uses a `PCollection` with fixed-time windowing and a data-based trigger. This is something you might do if, for example, each window @@ -3946,7 +4026,7 @@ The following diagram shows data events for key X as they arrive in the PCollection and are assigned to windows. To keep the diagram a bit simpler, we'll assume that the events all arrive in the pipeline in order. -![Diagram of data events for acculumating mode example]({{ "/images/trigger-accumulation.png" | prepend: site.baseurl }} "Data events for accumulating mode example") +![Diagram of data events for acculumating mode example](/images/trigger-accumulation.png) ##### 9.4.1.1. Accumulating mode {#accumulating-mode} @@ -3984,14 +4064,15 @@ results immediately whenever late data arrives. You set the allowed lateness by using `.withAllowedLateness()` when you set your windowing function: -```java +{{< highlight java >}} PCollection pc = ...; pc.apply(Window.into(FixedWindows.of(1, TimeUnit.MINUTES)) .triggering(AfterProcessingTime.pastFirstElementInPane() .plusDelayOf(Duration.standardMinutes(1))) .withAllowedLateness(Duration.standardMinutes(30)); -``` -```py +{{< /highlight >}} + +{{< highlight py >}} pc = [Initial PCollection] pc | beam.WindowInto( FixedWindows(60), @@ -3999,7 +4080,7 @@ windowing function: allowed_lateness=1800) # 30 minutes | ... -``` +{{< /highlight >}} This allowed lateness propagates to all `PCollection`s derived as a result of applying transforms to the original `PCollection`. 
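For illustration, the following sketch shows this propagation in practice. The `windowedWords` collection and the `Count.perElement()` aggregation are placeholders rather than part of the examples above: a derived aggregation inherits the 30-minute allowed lateness set upstream, and a later `Window.configure()` can replace that lateness without changing the window assignment itself.

{{< highlight java >}}
// Assume windowedWords was windowed with
// .withAllowedLateness(Duration.standardMinutes(30)) as shown above.
PCollection<String> windowedWords = ...;

// This derived PCollection inherits the 30-minute allowed lateness.
PCollection<KV<String, Long>> wordCounts =
    windowedWords.apply(Count.perElement());

// Applying Window.configure() later replaces the allowed lateness for
// downstream PCollections while keeping the same window assignment.
PCollection<KV<String, Long>> withLongerLateness =
    wordCounts.apply(
        Window.<KV<String, Long>>configure()
            .withAllowedLateness(Duration.standardHours(1)));
{{< /highlight >}}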
If you want to change the @@ -4057,11 +4138,12 @@ example trigger code fires on the following conditions: * Any time late data arrives, after a ten-minute delay -{:.language-java} +{{< paragraph class="language-java" >}} * After two days, we assume no more data of interest will arrive, and the trigger stops executing +{{< /paragraph >}} -```java +{{< highlight java >}} .apply(Window .configure() .triggering(AfterWatermark @@ -4070,10 +4152,11 @@ example trigger code fires on the following conditions: .pastFirstElementInPane() .plusDelayOf(Duration.standardMinutes(10)))) .withAllowedLateness(Duration.standardDays(2))); -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_composite_triggers -%}``` +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_composite_triggers >}} +{{< /highlight >}} #### 9.5.3. Other composite triggers {#other-composite-triggers} @@ -4081,14 +4164,15 @@ You can also build other sorts of composite triggers. The following example code shows a simple composite trigger that fires whenever the pane has at least 100 elements, or after a minute. -```java +{{< highlight java >}} Repeatedly.forever(AfterFirst.of( AfterPane.elementCountAtLeast(100), AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.standardMinutes(1)))) -``` -```py -{% github_sample /apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py tag:model_other_composite_triggers -%}``` +{{< /highlight >}} + +{{< highlight py >}} +{{< github_sample "/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets_test.py" model_other_composite_triggers >}} +{{< /highlight >}} ## 10. Metrics {#metrics} In the Beam model, metrics provide some insight into the current state of a user pipeline, @@ -4124,7 +4208,7 @@ There are three types of metrics that are supported for the moment: `Counter`, ` **Counter**: A metric that reports a single long value and can be incremented or decremented. -```java +{{< highlight java >}} Counter counter = Metrics.counter( "namespace", "counter1"); @ProcessElement @@ -4133,11 +4217,11 @@ public void processElement(ProcessContext context) { counter.inc(); ... } -``` +{{< /highlight >}} **Distribution**: A metric that reports information about the distribution of reported values. -```java +{{< highlight java >}} Distribution distribution = Metrics.distribution( "namespace", "distribution1"); @ProcessElement @@ -4147,12 +4231,12 @@ public void processElement(ProcessContext context) { distribution.update(element); ... } -``` +{{< /highlight >}} **Gauge**: A metric that reports the latest value out of reported values. Since metrics are collected from many workers the value may not be the absolute last, but one of the latest values. -```java +{{< highlight java >}} Gauge gauge = Metrics.gauge( "namespace", "gauge1"); @ProcessElement @@ -4162,14 +4246,14 @@ public void processElement(ProcessContext context) { gauge.set(element); ... } -``` +{{< /highlight >}} ### 10.3 Querying metrics {#querying-metrics} `PipelineResult` has a method `metrics()` which returns a `MetricResults` object that allows accessing metrics. The main method available in `MetricResults` allows querying for all metrics matching a given filter. 
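For example, assuming a `pipeline` that declared the `counter1` metric shown earlier, a query might look like the following sketch. The namespace and metric names are the illustrative ones used above, and printing the results is only for demonstration; the interfaces involved are listed below.

{{< highlight java >}}
PipelineResult result = pipeline.run();
result.waitUntilFinish();

// Query all metrics named "counter1" in namespace "namespace".
MetricQueryResults metrics =
    result
        .metrics()
        .queryMetrics(
            MetricsFilter.builder()
                .addNameFilter(MetricNameFilter.named("namespace", "counter1"))
                .build());

for (MetricResult<Long> counter : metrics.getCounters()) {
  System.out.println(counter.getName() + ": " + counter.getAttempted());
}
{{< /highlight >}}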
-```java +{{< highlight java >}} public interface PipelineResult { MetricResults metrics(); } @@ -4190,12 +4274,12 @@ public interface MetricResult { T getCommitted(); T getAttempted(); } -``` +{{< /highlight >}} ### 10.4 Using metrics in pipeline {#using-metrics} Below, there is a simple example of how to use a `Counter` metric in a user pipeline. -```java +{{< highlight java >}} // creating a pipeline with custom metrics DoFn pipeline .apply(...) @@ -4228,8 +4312,10 @@ public class MyMetricsDoFn extends DoFn { context.output(context.element()); } } -``` -### 9.5 Export metrics {#export-metrics} +{{< /highlight >}} + +### 10.5 Export metrics {#export-metrics} + Beam metrics can be exported to external sinks. If a metrics sink is set up in the configuration, the runner will push metrics to it at a default 5s period. The configuration is held in the [MetricsOptions](https://beam.apache.org/releases/javadoc/2.19.0/org/apache/beam/sdk/metrics/MetricsOptions.html) class. It contains push period configuration and also sink specific options such as type and URL. As for now only the REST HTTP and the Graphite sinks are supported and only @@ -4237,9 +4323,8 @@ Flink and Spark runners support metrics export. Also Beam metrics are exported to inner Spark and Flink dashboards to be consulted in their respective UI. +## 11. State and Timers {#state-and-timers} - -## 10. State and Timers {#state-and-timers} Beam's windowing and triggering facilities provide a powerful abstraction for grouping and aggregating unbounded input data based on timestamps. However there are aggregation use cases for which developers may require a higher degree of control than provided by windows and triggers. Beam provides an API for manually managing per-key state, allowing for @@ -4265,17 +4350,19 @@ In Java DoFn declares states to be accessed by creating final `StateSpec` member state must be named using the `StateId` annotation; this name is unique to a ParDo in the graph and has no relation to other nodes in the graph. A `DoFn` can declare multiple state variables. -### 10.1 Types of state {#types-of-state} +### 11.1 Types of state {#types-of-state} + Beam provides several types of state: #### ValueState + A ValueState is a scalar state value. For each key in the input, a ValueState will store a typed value that can be read and modified inside the DoFn's `@ProcessElement` or `@OnTimer` methods. If the type of the ValueState has a coder registered, then Beam will automatically infer the coder for the state value. Otherwise, a coder can be explicitly specified when creating the ValueState. For example, the following ParDo creates a single state variable that accumulates the number of elements seen. -```java +{{< highlight java >}} PCollection> perUser = readPerUser(); perUser.apply(ParDo.of(new DoFn, OutputT>() { @StateId("state") private final StateSpec> numElements = StateSpecs.value(); @@ -4288,22 +4375,24 @@ perUser.apply(ParDo.of(new DoFn, OutputT>() { state.write(currentValue + 1); } })); -``` +{{< /highlight >}} Beam also allows explicitly specifying a coder for `ValueState` values. For example: -```java +{{< highlight java >}} PCollection> perUser = readPerUser(); perUser.apply(ParDo.of(new DoFn, OutputT>() { @StateId("state") private final StateSpec> numElements = StateSpecs.value(new MyTypeCoder()); ... })); -``` +{{< /highlight >}} #### CombiningState + `CombiningState` allows you to create a state object that is updated using a Beam combiner. 
For example, the previous `ValueState` example could be rewritten to use `CombiningState` -```java + +{{< highlight java >}} PCollection> perUser = readPerUser(); perUser.apply(ParDo.of(new DoFn, OutputT>() { @StateId("state") private final StateSpec> numElements = @@ -4313,15 +4402,16 @@ perUser.apply(ParDo.of(new DoFn, OutputT>() { state.add(1); } })); -``` +{{< /highlight >}} #### BagState + A common use case for state is to accumulate multiple elements. `BagState` allows for accumulating an unordered set ofelements. This allows for addition of elements to the collection without requiring the reading of the entire collection first, which is an efficiency gain. In addition, runners that support paged reads can allow individual bags larger than available memory. -```java +{{< highlight java >}} PCollection> perUser = readPerUser(); perUser.apply(ParDo.of(new DoFn, OutputT>() { @StateId("state") private final StateSpec> numElements = StateSpecs.bag(); @@ -4339,14 +4429,16 @@ perUser.apply(ParDo.of(new DoFn, OutputT>() { } } })); -``` -### 10.2 Deferred state reads {#deferred-state-reads} +{{< /highlight >}} + +### 11.2 Deferred state reads {#deferred-state-reads} + When a `DoFn` contains multiple state specifications, reading each one in order can be slow. Calling the `read()` function on a state can cause the runner to perform a blocking read. Performing multiple blocking reads in sequence adds latency to element processing. If you know that a state will always be read, you can annotate it as @AlwaysFetched, and then the runner can prefetch all of the states necessary. For example: -```java +{{< highlight java >}} PCollection> perUser = readPerUser(); perUser.apply(ParDo.of(new DoFn, OutputT>() { @StateId("state1") private final StateSpec> state1 = StateSpecs.value(); @@ -4362,13 +4454,13 @@ perUser.apply(ParDo.of(new DoFn, OutputT>() { state3.read(); } })); -``` +{{< /highlight >}} If however there are code paths in which the states are not fetched, then annotating with @AlwaysFetched will add unnecessary fetching for those paths. In this case, the readLater method allows the runner to know that the state will be read in the future, allowing multiple state reads to be batched together. -```java +{{< highlight java >}} PCollection> perUser = readPerUser(); perUser.apply(ParDo.of(new DoFn, OutputT>() { @StateId("state1") private final StateSpec> state1 = StateSpecs.value(); @@ -4391,20 +4483,22 @@ perUser.apply(ParDo.of(new DoFn, OutputT>() { processState3(state3.read()); } })); -``` +{{< /highlight >}} + +### 11.3 Timers {#timers} -### 10.3 Timers {#timers} Beam provides a per-key timer callback API. This allows for delayed processing of data stored using the state API. Timers can be set to callback at either an event-time or a processing-time timestamp. Every timer is identified with a TimerId. A given timer for a key can only be set for a single timestamp. Calling set on a timer overwrites the previous firing time for that key's timer. -#### 10.3.1 Event-time timers {#event-time-timers} +#### 11.3.1 Event-time timers {#event-time-timers} + Event-time timers fire when the input watermark for the DoFn passes the time at which the timer is set, meaning that the runner believes that there are no more elements to be processed with timestamps before the timer timestamp. This allows for event-time aggregations. 
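One common pattern, sketched below with the same illustrative `readPerUser()` source used elsewhere in this section, is to set a per-key event-time timer to the maximum timestamp of the current window: the accumulated total is emitted once the watermark passes the end of the window. A further example of combining state with an event-time timer follows.

{{< highlight java >}}
PCollection<KV<String, Long>> perUser = readPerUser();
perUser.apply(ParDo.of(new DoFn<KV<String, Long>, Long>() {
  @StateId("total") private final StateSpec<CombiningState<Long, long[], Long>> totalSpec =
      StateSpecs.combining(Sum.ofLongs());
  @TimerId("windowEnd") private final TimerSpec windowEndSpec =
      TimerSpecs.timer(TimeDomain.EVENT_TIME);

  @ProcessElement
  public void process(
      @Element KV<String, Long> element,
      BoundedWindow window,
      @StateId("total") CombiningState<Long, long[], Long> total,
      @TimerId("windowEnd") Timer windowEnd) {
    total.add(element.getValue());
    // Setting the timer again simply moves it; here it always points at the
    // end of the element's window.
    windowEnd.set(window.maxTimestamp());
  }

  @OnTimer("windowEnd")
  public void onWindowEnd(
      @StateId("total") CombiningState<Long, long[], Long> total,
      OutputReceiver<Long> out) {
    // The watermark has passed the end of the window, so the per-key total
    // is complete (up to any allowed lateness).
    out.output(total.read());
  }
}));
{{< /highlight >}}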
-```java +{{< highlight java >}} PCollection> perUser = readPerUser(); perUser.apply(ParDo.of(new DoFn, OutputT>() { @StateId("state") private final StateSpec> state = StateSpecs.value(); @@ -4424,9 +4518,10 @@ perUser.apply(ParDo.of(new DoFn, OutputT>() { //Process timer. } })); +{{< /highlight >}} + +#### 11.3.2 Processing-time timers {#processing-time-timers} -``` -#### 10.3.2 Processing-time timers {#processing-time-timers} Processing-time timers fire when the real wall-clock time passes. This is often used to create larger batches of data before processing. It can also be used to schedule events that should occur at a specific time. Just like with event-time timers, processing-time timers are per key - each key has a separate copy of the timer. @@ -4434,7 +4529,7 @@ event-time timers, processing-time timers are per key - each key has a separate While processing-time timers can be set to an absolute timestamp, it is very common to set them to an offset relative to the current time. The `Timer.offset` and `Timer.setRelative` methods can be used to accomplish this. -```java +{{< highlight java >}} PCollection> perUser = readPerUser(); perUser.apply(ParDo.of(new DoFn, OutputT>() { @TimerId("timer") private final TimerSpec timer = TimerSpecs.timer(TimeDomain.PROCESSING_TIME); @@ -4449,17 +4544,17 @@ perUser.apply(ParDo.of(new DoFn, OutputT>() { //Process timer. } })); +{{< /highlight >}} -``` +#### 11.3.3 Dynamic timer tags {#dynamic-timer-tags} -#### 10.3.3 Dynamic timer tags {#dynamic-timer-tags} Beam also supports dynamically setting a timer tag using `TimerMap`. This allows for setting multiple different timers in a `DoFn` and allowing for the timer tags to be dynamically chosen - e.g. based on data in the input elements. A timer with a specific tag can only be set to a single timestamp, so setting the timer again has the effect of overwriting the previous expiration time for the timer with that tag. Each `TimerMap` is identified with a timer family id, and timers in different timer families are independent. -```java +{{< highlight java >}} PCollection> perUser = readPerUser(); perUser.apply(ParDo.of(new DoFn, OutputT>() { @TimerFamily("actionTimers") private final TimerSpec timer = @@ -4476,10 +4571,10 @@ perUser.apply(ParDo.of(new DoFn, OutputT>() { LOG.info("Timer fired with id " + timerId); } })); +{{< /highlight >}} -``` +#### 11.3.4 Timer output timestamps {#timer-output-timestamps} -#### 10.3.4 Timer output timestamps {#timer-output-timestamps} By default, event-time timers will hold the output watermark of the `ParDo` to the timestamp of the timer. This means that if a timer is set to 12pm, any windowed aggregations or event-time timers later in the pipeline graph that finish after 12pm will not expire. The timestamp of the timer is also the default output timestamp for the timer callback. This @@ -4491,7 +4586,7 @@ In some cases, a DoFn needs to output timestamps earlier than the timer expirati hold its output watermark to those timestamps. For example, consider the following pipeline that temporarily batches records into state, and sets a timer to drain the state. This code may appear correct, but will not work properly. 
-```java +{{< highlight java >}} PCollection> perUser = readPerUser(); perUser.apply(ParDo.of(new DoFn, OutputT>() { @StateId("elementBag") private final StateSpec> elementBag = StateSpecs.bag(); @@ -4525,13 +4620,14 @@ perUser.apply(ParDo.of(new DoFn, OutputT>() { timerSet.clear(); } })); -``` +{{< /highlight >}} + The problem with this code is that the ParDo is buffering elements, however nothing is preventing the watermark from advancing past the timestamp of those elements, so all those elements might be dropped as late data. In order to prevent this from happening, an output timestamp needs to be set on the timer to prevent the watermark from advancing past the timestamp of the minimum element. The following code demonstrates this. -```java +{{< highlight java >}} PCollection> perUser = readPerUser(); perUser.apply(ParDo.of(new DoFn, OutputT>() { // The bag of elements accumulated. @@ -4578,12 +4674,13 @@ perUser.apply(ParDo.of(new DoFn, OutputT>() { timerTimestamp.clear(); } })); -``` -### 10.4 Garbage collecting state {#garbage-collecting-state} +{{< /highlight >}} + +### 11.4 Garbage collecting state {#garbage-collecting-state} Per-key state needs to be garbage collected, or eventually the increasing size of state may negatively impact performance. There are two common strategies for garbage collecting state. -##### 10.4.1 **Using windows for garbage collection** {#using-windows-for-garbage-collection} +##### 11.4.1 **Using windows for garbage collection** {#using-windows-for-garbage-collection} All state and timers for a key is scoped to the window it is in. This means that depending on the timestamp of the input element the ParDo will see different values for the state depending on the window that element falls into. In addition, once the input watermark passes the end of the window, the runner should garbage collect all state for that @@ -4593,7 +4690,7 @@ garbage-collection strategy. For example, given the following: -```java +{{< highlight java >}} PCollection> perUser = readPerUser(); perUser.apply(Window.into(CalendarWindows.days(1) .withTimeZone(DateTimeZone.forID("America/Los_Angeles")))); @@ -4605,17 +4702,18 @@ perUser.apply(Window.into(CalendarWindows.days(1) // midnight PST, then a new copy of the state will be seen for the next day. } })); -``` +{{< /highlight >}} This `ParDo` stores state per day. Once the pipeline is done processing data for a given day, all the state for that day is garbage collected. -##### 10.4.1 **Using timers For garbage collection** {#using-timers-for-garbage-collection} +##### 11.4.1 **Using timers For garbage collection** {#using-timers-for-garbage-collection} + In some cases, it is difficult to find a windowing strategy that models the desired garbage-collection strategy. For example, a common desire is to garbage collect state for a key once no activity has been seen on the key for some time. This can be done by updating a timer that garbage collects state. For example -```java +{{< highlight java >}} PCollection> perUser = readPerUser(); perUser.apply(ParDo.of(new DoFn, OutputT>() { // The state for the key. @@ -4651,12 +4749,14 @@ perUser.apply(ParDo.of(new DoFn, OutputT>() { maxTimestamp.clear(); } } -```` +{{< /highlight >}} + +### 11.5 State and timers examples {#state-timers-examples} -### 10.5 State and timers examples {#state-timers-examples} Following are some example uses of state and timers -#### 10.5.1. Joining clicks and views {#joining-clicks-and-views} +#### 11.5.1. 
Joining clicks and views {#joining-clicks-and-views} + In this example, the pipeline is processing data from an e-commerce site's home page. There are two input streams: a stream of views, representing suggested product links displayed to the user on the home page, and a stream of clicks, representing actual user clicks on these links. The goal of the pipeline is to join click events with view @@ -4669,7 +4769,7 @@ lost and never make it to the Beam pipeline; the pipeline will similarly wait on give up if the view event does not arrive in that time. Input events are not ordered - it is possible to see the click event before the view event. The one hour join timeout should be based on event time, not on processing time. -```java +{{< highlight java >}} // Read the event stream and key it by the link id. PCollection> eventsPerLinkId = readEvents() @@ -4735,15 +4835,15 @@ perUser.apply(ParDo.of(new DoFn, JoinedEvent>() { maxTimestampState.clear(); } })); -```` +{{< /highlight >}} -#### 10.5.2 Batching RPCs {#batching-rpcs} +#### 11.5.2 Batching RPCs {#batching-rpcs} In this example, input elements are being forwarded to an external RPC service. The RPC accepts batch requests - multiple events for the same user can be batched in a single RPC call. Since this RPC service also imposes rate limits, we want to batch ten seconds worth of events together in order to reduce the number of calls. -```java +{{< highlight java >}} PCollection> perUser = readPerUser(); perUser.apply(ParDo.of(new DoFn, OutputT>() { // Store the elements buffered so far. @@ -4776,4 +4876,4 @@ perUser.apply(ParDo.of(new DoFn, OutputT>() { isTimerSetState.clear(); } })); -``` \ No newline at end of file +{{< /highlight >}} \ No newline at end of file diff --git a/website/src/documentation/resources/learning-resources.md b/website/www/site/content/en/documentation/resources/learning-resources.md similarity index 99% rename from website/src/documentation/resources/learning-resources.md rename to website/www/site/content/en/documentation/resources/learning-resources.md index 0289932b1e4e..2276387637c6 100644 --- a/website/src/documentation/resources/learning-resources.md +++ b/website/www/site/content/en/documentation/resources/learning-resources.md @@ -1,8 +1,5 @@ --- -layout: section title: "Learning Resources" -section_menu: section-menu/documentation.html -permalink: /documentation/resources/learning-resources/ --- # Using the Apache Apex Runner -The Apex Runner executes Apache Beam pipelines using [Apache Apex](https://apex.apache.org/) as an underlying engine. The runner has broad support for the [Beam model and supports streaming and batch pipelines]({{ site.baseurl }}/documentation/runners/capability-matrix/). +The Apex Runner executes Apache Beam pipelines using [Apache Apex](https://apex.apache.org/) as an underlying engine. The runner has broad support for the [Beam model and supports streaming and batch pipelines](/documentation/runners/capability-matrix/). [Apache Apex](https://apex.apache.org/) is a stream processing platform and framework for low-latency, high-throughput and fault-tolerant analytics applications on Apache Hadoop. Apex has a unified streaming architecture and can be used for real-time and batch processing. The following instructions are for running Beam pipelines with Apex on a YARN cluster. -They are not required for Apex in embedded mode (see [quickstart]({{ site.baseurl }}/get-started/quickstart-java/)). 
+They are not required for Apex in embedded mode (see [quickstart](/get-started/quickstart-java/)). ## Apex Runner prerequisites diff --git a/website/src/documentation/runners/capability-matrix.md b/website/www/site/content/en/documentation/runners/capability-matrix.md similarity index 78% rename from website/src/documentation/runners/capability-matrix.md rename to website/www/site/content/en/documentation/runners/capability-matrix.md index 2e7d845b6f7c..52cfd34aaa24 100644 --- a/website/src/documentation/runners/capability-matrix.md +++ b/website/www/site/content/en/documentation/runners/capability-matrix.md @@ -1,9 +1,7 @@ --- -layout: section +type: runners title: "Apache Beam Capability Matrix" -section_menu: section-menu/runners.html -permalink: /documentation/runners/capability-matrix/ -redirect_from: +aliases: - /learn/runners/capability-matrix/ - /capability-matrix/ --- @@ -35,26 +33,10 @@ For more details on the What /